Skip to content

Commit

Permalink
sched: Normalize tg load contributions against runnable time
Browse files Browse the repository at this point in the history
Entities of equal weight should receive equitable distribution of cpu time.
This is challenging in the case of a task_group's shares as execution may be
occurring on multiple cpus simultaneously.

To handle this we divide up the shares into weights proportionate with the load
on each cfs_rq.  This does not however, account for the fact that the sum of
the parts may be less than one cpu and so we need to normalize:
  load(tg) = min(runnable_avg(tg), 1) * tg->shares
Where runnable_avg is the aggregate time in which the task_group had runnable
children.

Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Ben Segall <bsegall@google.com>.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20120823141506.930124292@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
  • Loading branch information
Paul Turner authored and Ingo Molnar committed Oct 24, 2012
1 parent 8165e14 commit bb17f65
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 0 deletions.
4 changes: 4 additions & 0 deletions kernel/sched/debug.c
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
atomic64_read(&cfs_rq->tg->load_avg));
SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
cfs_rq->tg_load_contrib);
SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
cfs_rq->tg_runnable_contrib);
SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
atomic_read(&cfs_rq->tg->runnable_avg));
#endif

print_cfs_group_stats(m, cpu, cfs_rq->tg);
Expand Down
56 changes: 56 additions & 0 deletions kernel/sched/fair.c
Original file line number Diff line number Diff line change
Expand Up @@ -1118,19 +1118,73 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
}
}

/*
* Aggregate cfs_rq runnable averages into an equivalent task_group
* representation for computing load contributions.
*/
static inline void __update_tg_runnable_avg(struct sched_avg *sa,
struct cfs_rq *cfs_rq)
{
struct task_group *tg = cfs_rq->tg;
long contrib;

/* The fraction of a cpu used by this cfs_rq */
contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
sa->runnable_avg_period + 1);
contrib -= cfs_rq->tg_runnable_contrib;

if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
atomic_add(contrib, &tg->runnable_avg);
cfs_rq->tg_runnable_contrib += contrib;
}
}

static inline void __update_group_entity_contrib(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = group_cfs_rq(se);
struct task_group *tg = cfs_rq->tg;
int runnable_avg;

u64 contrib;

contrib = cfs_rq->tg_load_contrib * tg->shares;
se->avg.load_avg_contrib = div64_u64(contrib,
atomic64_read(&tg->load_avg) + 1);

/*
* For group entities we need to compute a correction term in the case
* that they are consuming <1 cpu so that we would contribute the same
* load as a task of equal weight.
*
* Explicitly co-ordinating this measurement would be expensive, but
* fortunately the sum of each cpus contribution forms a usable
* lower-bound on the true value.
*
* Consider the aggregate of 2 contributions. Either they are disjoint
* (and the sum represents true value) or they are disjoint and we are
* understating by the aggregate of their overlap.
*
* Extending this to N cpus, for a given overlap, the maximum amount we
* understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
* cpus that overlap for this interval and w_i is the interval width.
*
* On a small machine; the first term is well-bounded which bounds the
* total error since w_i is a subset of the period. Whereas on a
* larger machine, while this first term can be larger, if w_i is the
* of consequential size guaranteed to see n_i*w_i quickly converge to
* our upper bound of 1-cpu.
*/
runnable_avg = atomic_read(&tg->runnable_avg);
if (runnable_avg < NICE_0_LOAD) {
se->avg.load_avg_contrib *= runnable_avg;
se->avg.load_avg_contrib >>= NICE_0_SHIFT;
}
}
#else
static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
int force_update) {}
static inline void __update_tg_runnable_avg(struct sched_avg *sa,
struct cfs_rq *cfs_rq) {}
static inline void __update_group_entity_contrib(struct sched_entity *se) {}
#endif

Expand All @@ -1152,6 +1206,7 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
if (entity_is_task(se)) {
__update_task_entity_contrib(se);
} else {
__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
__update_group_entity_contrib(se);
}

Expand Down Expand Up @@ -1220,6 +1275,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
{
__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
__update_tg_runnable_avg(&rq->avg, &rq->cfs);
}

/* Add the load generated by se into cfs_rq's child load-average */
Expand Down
2 changes: 2 additions & 0 deletions kernel/sched/sched.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ struct task_group {

atomic_t load_weight;
atomic64_t load_avg;
atomic_t runnable_avg;
#endif

#ifdef CONFIG_RT_GROUP_SCHED
Expand Down Expand Up @@ -234,6 +235,7 @@ struct cfs_rq {
atomic64_t decay_counter, removed_load;
u64 last_decay;
#ifdef CONFIG_FAIR_GROUP_SCHED
u32 tg_runnable_contrib;
u64 tg_load_contrib;
#endif
#endif
Expand Down

0 comments on commit bb17f65

Please sign in to comment.