Skip to content

Commit

Permalink
sched: Fix potential near-infinite distribute_cfs_runtime() loop
Browse files Browse the repository at this point in the history
distribute_cfs_runtime() intentionally only hands out enough runtime to
bring each cfs_rq to 1 ns of runtime, expecting the cfs_rqs to then take
the runtime they need only once they actually get to run. However, if
they get to run sufficiently quickly, the period timer is still in
distribute_cfs_runtime() and no runtime is available, causing them to
throttle. Then distribute has to handle them again, and this can go on
until distribute has handed out all of the runtime 1ns at a time, which
takes far too long.

Instead allow access to the same runtime that distribute is handing out,
accepting that corner cases with very low quota may be able to spend the
entire cfs_b->runtime during distribute_cfs_runtime, meaning that the
runtime directly handed out by distribute_cfs_runtime was over quota. In
addition, if a cfs_rq does manage to throttle like this, make sure the
existing distribute_cfs_runtime no longer loops over it again.

Signed-off-by: Ben Segall <bsegall@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20140620222120.13814.21652.stgit@sword-of-the-dawn.mtv.corp.google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
  • Loading branch information
Ben Segall authored and Ingo Molnar committed Jul 5, 2014
1 parent 541b826 commit c06f04c
Showing 1 changed file with 20 additions and 21 deletions.
41 changes: 20 additions & 21 deletions kernel/sched/fair.c
Original file line number Diff line number Diff line change
Expand Up @@ -3361,7 +3361,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
raw_spin_lock(&cfs_b->lock);
list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
/*
* Add to the _head_ of the list, so that an already-started
* distribute_cfs_runtime will not see us
*/
list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
if (!cfs_b->timer_active)
__start_cfs_bandwidth(cfs_b, false);
raw_spin_unlock(&cfs_b->lock);
Expand Down Expand Up @@ -3418,7 +3422,8 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
u64 remaining, u64 expires)
{
struct cfs_rq *cfs_rq;
u64 runtime = remaining;
u64 runtime;
u64 starting_runtime = remaining;

rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
Expand Down Expand Up @@ -3449,7 +3454,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
}
rcu_read_unlock();

return remaining;
return starting_runtime - remaining;
}

/*
Expand Down Expand Up @@ -3495,33 +3500,28 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
/* account preceding periods in which throttling occurred */
cfs_b->nr_throttled += overrun;

/*
* There are throttled entities so we must first use the new bandwidth
* to unthrottle them before making it generally available. This
* ensures that all existing debts will be paid before a new cfs_rq is
* allowed to run.
*/
runtime = cfs_b->runtime;
runtime_expires = cfs_b->runtime_expires;
cfs_b->runtime = 0;

/*
* This check is repeated as we are holding onto the new bandwidth
* while we unthrottle. This can potentially race with an unthrottled
* group trying to acquire new bandwidth from the global pool.
* This check is repeated as we are holding onto the new bandwidth while
* we unthrottle. This can potentially race with an unthrottled group
* trying to acquire new bandwidth from the global pool. This can result
* in us over-using our runtime if it is all used during this loop, but
* only by limited amounts in that extreme case.
*/
while (throttled && runtime > 0) {
while (throttled && cfs_b->runtime > 0) {
runtime = cfs_b->runtime;
raw_spin_unlock(&cfs_b->lock);
/* we can't nest cfs_b->lock while distributing bandwidth */
runtime = distribute_cfs_runtime(cfs_b, runtime,
runtime_expires);
raw_spin_lock(&cfs_b->lock);

throttled = !list_empty(&cfs_b->throttled_cfs_rq);

cfs_b->runtime -= min(runtime, cfs_b->runtime);
}

/* return (any) remaining runtime */
cfs_b->runtime = runtime;
/*
* While we are ensured activity in the period following an
* unthrottle, this also covers the case in which the new bandwidth is
Expand Down Expand Up @@ -3632,10 +3632,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
return;
}

if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
runtime = cfs_b->runtime;
cfs_b->runtime = 0;
}

expires = cfs_b->runtime_expires;
raw_spin_unlock(&cfs_b->lock);

Expand All @@ -3646,7 +3645,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)

raw_spin_lock(&cfs_b->lock);
if (expires == cfs_b->runtime_expires)
cfs_b->runtime = runtime;
cfs_b->runtime -= min(runtime, cfs_b->runtime);
raw_spin_unlock(&cfs_b->lock);
}

Expand Down

0 comments on commit c06f04c

Please sign in to comment.