Merge tag 'sched_core_for_v5.17_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Borislav Petkov:
 "Mostly minor things this time; some highlights:

   - core-sched: Add 'Forced Idle' accounting; this allows tracking how
     much CPU time is 'lost' due to core scheduling constraints.

   - psi: Fix for MEM_FULL; a task running reclaim would be counted as a
     runnable task and prevent MEM_FULL from being reported.

   - cpuacct: Long standing fixes for some cgroup accounting issues.

   - rt: Bandwidth timer could, under unusual circumstances, fail to be
     armed, leading to indefinite throttling."

[ Description above by Peter Zijlstra ]

* tag 'sched_core_for_v5.17_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Replace CFS internal cpu_util() with cpu_util_cfs()
  sched/fair: Cleanup task_util and capacity type
  sched/rt: Try to restart rt period timer when rt runtime exceeded
  sched/fair: Document the slow path and fast path in select_task_rq_fair
  sched/fair: Fix per-CPU kthread and wakee stacking for asym CPU capacity
  sched/fair: Fix detection of per-CPU kthreads waking a task
  sched/cpuacct: Make user/system times in cpuacct.stat more precise
  sched/cpuacct: Fix user/system in shown cpuacct.usage*
  cpuacct: Convert BUG_ON() to WARN_ON_ONCE()
  cputime, cpuacct: Include guest time in user time in cpuacct.stat
  psi: Fix PSI_MEM_FULL state when tasks are in memstall and doing reclaim
  sched/core: Forced idle accounting
  psi: Add a missing SPDX license header
  psi: Remove repeated verbose comment
Linus Torvalds committed Jan 12, 2022
2 parents 01367e8 + 82762d2 commit 6ae7143
Showing 14 changed files with 343 additions and 181 deletions.
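A note on the 'Forced Idle' accounting mentioned in the summary above: the series adds a per-thread core_forceidle_sum schedstat (see the include/linux/sched.h hunk below). Assuming it is exported through /proc/<pid>/sched alongside the other schedstat fields (the kernel/sched/debug.c change is not part of the excerpt shown here, and the procfs field name is an assumption), a minimal user-space read-out sketch could look like this:

/*
 * Sketch only: dump any "core_forceidle" line from /proc/<tid>/sched.
 * Requires CONFIG_SCHED_CORE and schedstats (kernel.sched_schedstats=1);
 * without schedstats the sum is not maintained.
 */
#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
	char path[64], line[256];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/%s/sched",
		 argc > 1 ? argv[1] : "self");
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (strstr(line, "core_forceidle"))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}

Pass a thread ID as the only argument (or none to inspect the current task); on cores where core scheduling never forces a sibling idle, the reported sum simply stays zero.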
1 change: 1 addition & 0 deletions include/linux/psi.h
@@ -1,3 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PSI_H
#define _LINUX_PSI_H

14 changes: 13 additions & 1 deletion include/linux/psi_types.h
@@ -1,3 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PSI_TYPES_H
#define _LINUX_PSI_TYPES_H

@@ -21,14 +22,25 @@ enum psi_task_count {
* don't have to special case any state tracking for it.
*/
NR_ONCPU,
NR_PSI_TASK_COUNTS = 4,
/*
* For IO and CPU stalls the presence of running/oncpu tasks
* in the domain means a partial rather than a full stall.
* For memory it's not so simple because of page reclaimers:
* they are running/oncpu while representing a stall. To tell
* whether a domain has productivity left or not, we need to
* distinguish between regular running (i.e. productive)
* threads and memstall ones.
*/
NR_MEMSTALL_RUNNING,
NR_PSI_TASK_COUNTS = 5,
};

/* Task state bitmasks */
#define TSK_IOWAIT (1 << NR_IOWAIT)
#define TSK_MEMSTALL (1 << NR_MEMSTALL)
#define TSK_RUNNING (1 << NR_RUNNING)
#define TSK_ONCPU (1 << NR_ONCPU)
#define TSK_MEMSTALL_RUNNING (1 << NR_MEMSTALL_RUNNING)

/* Resources that workloads could be stalled on */
enum psi_res {
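To make the new psi_task_count comment above concrete: with NR_MEMSTALL_RUNNING, memory counts as a full stall only when some tasks are stalled on memory and every runnable task in the group is itself a reclaimer. A standalone sketch of that predicate (the enum mirrors the layout above, with the first three counters implied by the TSK_* masks; this helper is an illustration, not the kernel's test_state() code):

#include <stdbool.h>
#include <stdio.h>

enum psi_task_count {
	NR_IOWAIT,
	NR_MEMSTALL,
	NR_RUNNING,
	NR_ONCPU,
	NR_MEMSTALL_RUNNING,
	NR_PSI_TASK_COUNTS = 5,
};

static bool mem_full(const unsigned int tasks[NR_PSI_TASK_COUNTS])
{
	/* Stalled on memory, and the only runnable tasks are reclaimers. */
	return tasks[NR_MEMSTALL] &&
	       tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING];
}

int main(void)
{
	/* One task in memstall that is also running reclaim: full stall. */
	unsigned int tasks[NR_PSI_TASK_COUNTS] = {
		[NR_MEMSTALL] = 1,
		[NR_RUNNING] = 1,
		[NR_MEMSTALL_RUNNING] = 1,
	};

	printf("MEM_FULL: %d\n", mem_full(tasks));
	return 0;
}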
4 changes: 4 additions & 0 deletions include/linux/sched.h
@@ -523,7 +523,11 @@ struct sched_statistics {
u64 nr_wakeups_affine_attempts;
u64 nr_wakeups_passive;
u64 nr_wakeups_idle;

#ifdef CONFIG_SCHED_CORE
u64 core_forceidle_sum;
#endif
#endif /* CONFIG_SCHEDSTATS */
} ____cacheline_aligned;

struct sched_entity {
84 changes: 63 additions & 21 deletions kernel/sched/core.c
@@ -144,7 +144,7 @@ static inline bool __sched_core_less(struct task_struct *a, struct task_struct *
return false;

/* flip prio, so high prio is leftmost */
if (prio_less(b, a, task_rq(a)->core->core_forceidle))
if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count))
return true;

return false;
@@ -181,15 +181,23 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p)
rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
}

void sched_core_dequeue(struct rq *rq, struct task_struct *p)
void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
{
rq->core->core_task_seq++;

if (!sched_core_enqueued(p))
return;
if (sched_core_enqueued(p)) {
rb_erase(&p->core_node, &rq->core_tree);
RB_CLEAR_NODE(&p->core_node);
}

rb_erase(&p->core_node, &rq->core_tree);
RB_CLEAR_NODE(&p->core_node);
/*
* Migrating the last task off the cpu, with the cpu in forced idle
* state. Reschedule to create an accounting edge for forced idle,
* and re-examine whether the core is still in forced idle state.
*/
if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
rq->core->core_forceidle_count && rq->curr == rq->idle)
resched_curr(rq);
}

/*
@@ -280,6 +288,8 @@ static void __sched_core_flip(bool enabled)
for_each_cpu(t, smt_mask)
cpu_rq(t)->core_enabled = enabled;

cpu_rq(cpu)->core->core_forceidle_start = 0;

sched_core_unlock(cpu, &flags);

cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
@@ -364,7 +374,8 @@ void sched_core_put(void)
#else /* !CONFIG_SCHED_CORE */

static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
static inline void
sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }

#endif /* CONFIG_SCHED_CORE */

@@ -2005,7 +2016,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (sched_core_enabled(rq))
sched_core_dequeue(rq, p);
sched_core_dequeue(rq, p, flags);

if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
@@ -5244,6 +5255,7 @@ void scheduler_tick(void)
if (sched_feat(LATENCY_WARN))
resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
sched_core_tick(rq);

rq_unlock(rq, &rf);

@@ -5656,6 +5668,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *next, *p, *max = NULL;
const struct cpumask *smt_mask;
bool fi_before = false;
bool core_clock_updated = (rq == rq->core);
unsigned long cookie;
int i, cpu, occ = 0;
struct rq *rq_i;
@@ -5708,10 +5721,18 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)

/* reset state */
rq->core->core_cookie = 0UL;
if (rq->core->core_forceidle) {
if (rq->core->core_forceidle_count) {
if (!core_clock_updated) {
update_rq_clock(rq->core);
core_clock_updated = true;
}
sched_core_account_forceidle(rq);
/* reset after accounting force idle */
rq->core->core_forceidle_start = 0;
rq->core->core_forceidle_count = 0;
rq->core->core_forceidle_occupation = 0;
need_sync = true;
fi_before = true;
rq->core->core_forceidle = false;
}

/*
@@ -5753,7 +5774,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
for_each_cpu_wrap(i, smt_mask, cpu) {
rq_i = cpu_rq(i);

if (i != cpu)
/*
* Current cpu always has its clock updated on entrance to
* pick_next_task(). If the current cpu is not the core,
* the core may also have been updated above.
*/
if (i != cpu && (rq_i != rq->core || !core_clock_updated))
update_rq_clock(rq_i);

p = rq_i->core_pick = pick_task(rq_i);
@@ -5783,7 +5809,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)

if (p == rq_i->idle) {
if (rq_i->nr_running) {
rq->core->core_forceidle = true;
rq->core->core_forceidle_count++;
if (!fi_before)
rq->core->core_forceidle_seq++;
}
@@ -5792,6 +5818,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
}

if (schedstat_enabled() && rq->core->core_forceidle_count) {
if (cookie)
rq->core->core_forceidle_start = rq_clock(rq->core);
rq->core->core_forceidle_occupation = occ;
}

rq->core->core_pick_seq = rq->core->core_task_seq;
next = rq->core_pick;
rq->core_sched_seq = rq->core->core_pick_seq;
@@ -5828,8 +5860,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* 1 0 1
* 1 1 0
*/
if (!(fi_before && rq->core->core_forceidle))
task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
if (!(fi_before && rq->core->core_forceidle_count))
task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count);

rq_i->core_pick->core_occupation = occ;

@@ -6033,11 +6065,19 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
goto unlock;

/* copy the shared state to the new leader */
core_rq->core_task_seq = rq->core_task_seq;
core_rq->core_pick_seq = rq->core_pick_seq;
core_rq->core_cookie = rq->core_cookie;
core_rq->core_forceidle = rq->core_forceidle;
core_rq->core_forceidle_seq = rq->core_forceidle_seq;
core_rq->core_task_seq = rq->core_task_seq;
core_rq->core_pick_seq = rq->core_pick_seq;
core_rq->core_cookie = rq->core_cookie;
core_rq->core_forceidle_count = rq->core_forceidle_count;
core_rq->core_forceidle_seq = rq->core_forceidle_seq;
core_rq->core_forceidle_occupation = rq->core_forceidle_occupation;

/*
* Accounting edge for forced idle is handled in pick_next_task().
* Don't need another one here, since the hotplug thread shouldn't
* have a cookie.
*/
core_rq->core_forceidle_start = 0;

/* install new leader */
for_each_cpu(t, smt_mask) {
@@ -7126,7 +7166,7 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,

unsigned long sched_cpu_util(int cpu, unsigned long max)
{
return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max,
return effective_cpu_util(cpu, cpu_util_cfs(cpu), max,
ENERGY_UTIL, NULL);
}
#endif /* CONFIG_SMP */
@@ -9409,7 +9449,9 @@ void __init sched_init(void)
rq->core_pick = NULL;
rq->core_enabled = 0;
rq->core_tree = RB_ROOT;
rq->core_forceidle = false;
rq->core_forceidle_count = 0;
rq->core_forceidle_occupation = 0;
rq->core_forceidle_start = 0;

rq->core_cookie = 0UL;
#endif
66 changes: 65 additions & 1 deletion kernel/sched/core_sched.c
@@ -73,7 +73,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,

enqueued = sched_core_enqueued(p);
if (enqueued)
sched_core_dequeue(rq, p);
sched_core_dequeue(rq, p, DEQUEUE_SAVE);

old_cookie = p->core_cookie;
p->core_cookie = cookie;
@@ -85,6 +85,10 @@
* If task is currently running, it may not be compatible anymore after
* the cookie change, so enter the scheduler on its CPU to schedule it
* away.
*
* Note that it is possible that as a result of this cookie change, the
* core has now entered/left forced idle state. Defer accounting to the
* next scheduling edge, rather than always forcing a reschedule here.
*/
if (task_running(rq, p))
resched_curr(rq);
@@ -232,3 +236,63 @@ int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
return err;
}

#ifdef CONFIG_SCHEDSTATS

/* REQUIRES: rq->core's clock recently updated. */
void __sched_core_account_forceidle(struct rq *rq)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
u64 delta, now = rq_clock(rq->core);
struct rq *rq_i;
struct task_struct *p;
int i;

lockdep_assert_rq_held(rq);

WARN_ON_ONCE(!rq->core->core_forceidle_count);

if (rq->core->core_forceidle_start == 0)
return;

delta = now - rq->core->core_forceidle_start;
if (unlikely((s64)delta <= 0))
return;

rq->core->core_forceidle_start = now;

if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) {
/* can't be forced idle without a running task */
} else if (rq->core->core_forceidle_count > 1 ||
rq->core->core_forceidle_occupation > 1) {
/*
* For larger SMT configurations, we need to scale the charged
* forced idle amount since there can be more than one forced
* idle sibling and more than one running cookied task.
*/
delta *= rq->core->core_forceidle_count;
delta = div_u64(delta, rq->core->core_forceidle_occupation);
}

for_each_cpu(i, smt_mask) {
rq_i = cpu_rq(i);
p = rq_i->core_pick ?: rq_i->curr;

if (!p->core_cookie)
continue;

__schedstat_add(p->stats.core_forceidle_sum, delta);
}
}

void __sched_core_tick(struct rq *rq)
{
if (!rq->core->core_forceidle_count)
return;

if (rq != rq->core)
update_rq_clock(rq->core);

__sched_core_account_forceidle(rq);
}

#endif /* CONFIG_SCHEDSTATS */
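To make the SMT scaling in __sched_core_account_forceidle() above concrete, here is a standalone sketch of the charging arithmetic (plain C; integer division stands in for div_u64(), and the function name is invented for illustration):

#include <stdint.h>
#include <stdio.h>

static uint64_t forceidle_charge(uint64_t delta_ns,
				 unsigned int forceidle_count,
				 unsigned int occupation)
{
	if (!occupation)
		return 0;	/* can't be forced idle without a running task */

	if (forceidle_count > 1 || occupation > 1) {
		/* Total idle time across siblings, split per cookied task. */
		delta_ns *= forceidle_count;
		delta_ns /= occupation;
	}
	return delta_ns;
}

int main(void)
{
	/*
	 * SMT-4 core, one cookied task keeping three siblings forced idle
	 * for 1ms: that task is charged 3ms, the total capacity lost on
	 * the core during that window.
	 */
	printf("%llu ns\n",
	       (unsigned long long)forceidle_charge(1000000, 3, 1));
	return 0;
}

With two forced-idle siblings and two running cookied tasks, each task would be charged delta * 2 / 2, so the sum over the cookied tasks still matches the idle capacity actually lost on the core.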
