Skip to content

Commit

Permalink
sched/fair: Implement delayed dequeue
Browse files Browse the repository at this point in the history
Extend / fix 86bfbb7 ("sched/fair: Add lag based placement") by
noting that lag is fundamentally a temporal measure. It should not be
carried around indefinitely.

OTOH it should also not be instantly discarded; doing so will allow a
task to game the system by purposefully (micro) sleeping at the end of
its time quantum.

Since lag is intimately tied to the virtual time base, a wall-time
based decay is also insufficient; notably, competition is required for
any of this to make sense.

Instead, delay the dequeue and keep the 'tasks' on the runqueue,
competing until they are eligible.

Strictly speaking, we only care about keeping them until the 0-lag
point, but that is a difficult proposition; instead, carry them around
until they get picked again, and dequeue them at that point.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Tested-by: Valentin Schneider <vschneid@redhat.com>
Link: https://lkml.kernel.org/r/20240727105030.226163742@infradead.org
  • Loading branch information
Peter Zijlstra committed Aug 17, 2024
1 parent e1459a5 commit 152e11f
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 11 deletions.
1 change: 0 additions & 1 deletion kernel/sched/deadline.c
Original file line number Diff line number Diff line change
Expand Up @@ -2428,7 +2428,6 @@ static struct task_struct *__pick_next_task_dl(struct rq *rq, bool peek)
else
p = dl_se->server_pick_next(dl_se);
if (!p) {
WARN_ON_ONCE(1);
dl_se->dl_yielded = 1;
update_curr_dl_se(rq, dl_se, 0);
goto again;
Expand Down
80 changes: 70 additions & 10 deletions kernel/sched/fair.c
Original file line number Diff line number Diff line change
Expand Up @@ -5379,19 +5379,38 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)

static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);

static void
static bool
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
int action = UPDATE_TG;
update_curr(cfs_rq);

if (flags & DEQUEUE_DELAYED) {
SCHED_WARN_ON(!se->sched_delayed);
} else {
bool sleep = flags & DEQUEUE_SLEEP;

/*
* DELAY_DEQUEUE relies on spurious wakeups; special task
* states must not suffer spurious wakeups, so exempt them.
*/
if (flags & DEQUEUE_SPECIAL)
sleep = false;

SCHED_WARN_ON(sleep && se->sched_delayed);

if (sched_feat(DELAY_DEQUEUE) && sleep &&
!entity_eligible(cfs_rq, se)) {
if (cfs_rq->next == se)
cfs_rq->next = NULL;
se->sched_delayed = 1;
return false;
}
}

int action = UPDATE_TG;
if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
action |= DO_DETACH;

/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);

/*
* When dequeuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
Expand Down Expand Up @@ -5428,8 +5447,13 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
update_min_vruntime(cfs_rq);

if (flags & DEQUEUE_DELAYED)
se->sched_delayed = 0;

if (cfs_rq->nr_running == 0)
update_idle_cfs_rq_clock_pelt(cfs_rq);

return true;
}

static void
Expand Down Expand Up @@ -5828,11 +5852,21 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
idle_task_delta = cfs_rq->idle_h_nr_running;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
int flags;

/* throttled entity or throttle-on-deactivate */
if (!se->on_rq)
goto done;

dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
/*
* Abuse SPECIAL to avoid delayed dequeue in this instance.
* This avoids teaching dequeue_entities() about throttled
* entities and keeps things relatively simple.
*/
flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
if (se->sched_delayed)
flags |= DEQUEUE_DELAYED;
dequeue_entity(qcfs_rq, se, flags);

if (cfs_rq_is_idle(group_cfs_rq(se)))
idle_task_delta = cfs_rq->h_nr_running;
Expand Down Expand Up @@ -6918,6 +6952,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
bool was_sched_idle = sched_idle_rq(rq);
int rq_h_nr_running = rq->cfs.h_nr_running;
bool task_sleep = flags & DEQUEUE_SLEEP;
bool task_delayed = flags & DEQUEUE_DELAYED;
struct task_struct *p = NULL;
int idle_h_nr_running = 0;
int h_nr_running = 0;
Expand All @@ -6931,7 +6966,13 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)

for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);

if (!dequeue_entity(cfs_rq, se, flags)) {
if (p && &p->se == se)
return -1;

break;
}

cfs_rq->h_nr_running -= h_nr_running;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
Expand All @@ -6956,6 +6997,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
break;
}
flags |= DEQUEUE_SLEEP;
flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL);
}

for_each_sched_entity(se) {
Expand Down Expand Up @@ -6985,6 +7027,17 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;

if (p && task_delayed) {
SCHED_WARN_ON(!task_sleep);
SCHED_WARN_ON(p->on_rq != 1);

/* Fix-up what dequeue_task_fair() skipped */
hrtick_update(rq);

/* Fix-up what block_task() skipped. */
__block_task(rq, p);
}

return 1;
}

Expand All @@ -6997,8 +7050,10 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
util_est_dequeue(&rq->cfs, p);

if (dequeue_entities(rq, &p->se, flags) < 0)
if (dequeue_entities(rq, &p->se, flags) < 0) {
util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
return false;
}

util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
hrtick_update(rq);
Expand Down Expand Up @@ -12971,6 +13026,11 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
/* ensure bandwidth has been allocated on our new cfs_rq */
account_cfs_rq_runtime(cfs_rq, 0);
}

if (!first)
return;

SCHED_WARN_ON(se->sched_delayed);
}

void init_cfs_rq(struct cfs_rq *cfs_rq)
Expand Down
9 changes: 9 additions & 0 deletions kernel/sched/features.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,15 @@ SCHED_FEAT(NEXT_BUDDY, false)
*/
SCHED_FEAT(CACHE_HOT_BUDDY, true)

/*
* Delay dequeueing tasks until they get selected or woken.
*
* By delaying the dequeue for non-eligible tasks, they remain in the
* competition and can burn off their negative lag. When they get selected
* they'll have positive lag by definition.
*/
SCHED_FEAT(DELAY_DEQUEUE, true)

/*
* Allow wakeup-time preemption of the current task:
*/
Expand Down

0 comments on commit 152e11f

Please sign in to comment.