diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 57d6ea65f857b..03da2cecb5471 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -888,6 +888,7 @@ static struct kobject *scx_root_kobj;
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched_ext.h>
 
+static void process_ddsp_deferred_locals(struct rq *rq);
 static void scx_bpf_kick_cpu(s32 cpu, u64 flags);
 static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
 					     s64 exit_code,
@@ -1362,6 +1363,67 @@ static int ops_sanitize_err(const char *ops_name, s32 err)
 	return -EPROTO;
 }
 
+static void run_deferred(struct rq *rq)
+{
+	process_ddsp_deferred_locals(rq);
+}
+
+#ifdef CONFIG_SMP
+static void deferred_bal_cb_workfn(struct rq *rq)
+{
+	run_deferred(rq);
+}
+#endif
+
+static void deferred_irq_workfn(struct irq_work *irq_work)
+{
+	struct rq *rq = container_of(irq_work, struct rq, scx.deferred_irq_work);
+
+	raw_spin_rq_lock(rq);
+	run_deferred(rq);
+	raw_spin_rq_unlock(rq);
+}
+
+/**
+ * schedule_deferred - Schedule execution of deferred actions on an rq
+ * @rq: target rq
+ *
+ * Schedule execution of deferred actions on @rq. Must be called with @rq
+ * locked. Deferred actions are executed with @rq locked but unpinned, and thus
+ * can unlock @rq to e.g. migrate tasks to other rqs.
+ */
+static void schedule_deferred(struct rq *rq)
+{
+	lockdep_assert_rq_held(rq);
+
+#ifdef CONFIG_SMP
+	/*
+	 * If in the middle of waking up a task, task_woken_scx() will be called
+	 * afterwards which will then run the deferred actions, no need to
+	 * schedule anything.
+	 */
+	if (rq->scx.flags & SCX_RQ_IN_WAKEUP)
+		return;
+
+	/*
+	 * If in balance, the balance callbacks will be called before rq lock is
+	 * released. Schedule one.
+	 */
+	if (rq->scx.flags & SCX_RQ_IN_BALANCE) {
+		queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
+				       deferred_bal_cb_workfn);
+		return;
+	}
+#endif
+	/*
+	 * No scheduler hooks available. Queue an irq work. They are executed on
+	 * IRQ re-enable which may take a bit longer than the scheduler hooks.
+	 * The above WAKEUP and BALANCE paths should cover most of the cases and
+	 * the time to IRQ re-enable shouldn't be long.
+	 */
+	irq_work_queue(&rq->scx.deferred_irq_work);
+}
+
 /**
  * touch_core_sched - Update timestamp used for core-sched task ordering
  * @rq: rq to read clock from, must be locked
@@ -1577,7 +1639,13 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
 	bool is_local = dsq == &rq->scx.local_dsq;
 
 	if (!dsq) {
-		WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node));
+		/*
+		 * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals.
+		 * Unlinking is all that's needed to cancel.
+		 */
+		if (unlikely(!list_empty(&p->scx.dsq_list.node)))
+			list_del_init(&p->scx.dsq_list.node);
+
 		/*
 		 * When dispatching directly from the BPF scheduler to a local
 		 * DSQ, the task isn't associated with any DSQ but
@@ -1586,6 +1654,7 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
 		 */
 		if (p->scx.holding_cpu >= 0)
 			p->scx.holding_cpu = -1;
+
 		return;
 	}
 
@@ -1673,17 +1742,6 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task,
 		return;
 	}
 
-	/*
-	 * %SCX_DSQ_LOCAL_ON is not supported during direct dispatch because
-	 * dispatching to the local DSQ of a different CPU requires unlocking
-	 * the current rq which isn't allowed in the enqueue path. Use
-	 * ops.select_cpu() to be on the target CPU and then %SCX_DSQ_LOCAL.
-	 */
-	if (unlikely((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON)) {
-		scx_ops_error("SCX_DSQ_LOCAL_ON can't be used for direct-dispatch");
-		return;
-	}
-
 	WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID);
 	WARN_ON_ONCE(p->scx.ddsp_enq_flags);
 
@@ -1693,13 +1751,58 @@ static void direct_dispatch(struct task_struct *p, u64 enq_flags)
 {
+	struct rq *rq = task_rq(p);
 	struct scx_dispatch_q *dsq;
+	u64 dsq_id = p->scx.ddsp_dsq_id;
+
+	touch_core_sched_dispatch(rq, p);
+
+	p->scx.ddsp_enq_flags |= enq_flags;
+
+	/*
+	 * We are in the enqueue path with @rq locked and pinned, and thus can't
+	 * double lock a remote rq and enqueue to its local DSQ. For
+	 * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer
+	 * the enqueue so that it's executed when @rq can be unlocked.
+	 */
+	if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
+		s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
+		unsigned long opss;
 
-	touch_core_sched_dispatch(task_rq(p), p);
+		if (cpu == cpu_of(rq)) {
+			dsq_id = SCX_DSQ_LOCAL;
+			goto dispatch;
+		}
+
+		opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK;
+
+		switch (opss & SCX_OPSS_STATE_MASK) {
+		case SCX_OPSS_NONE:
+			break;
+		case SCX_OPSS_QUEUEING:
+			/*
+			 * As @p was never passed to the BPF side, _release is
+			 * not strictly necessary. Still do it for consistency.
+			 */
+			atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
+			break;
+		default:
+			WARN_ONCE(true, "sched_ext: %s[%d] has invalid ops state 0x%lx in direct_dispatch()",
+				  p->comm, p->pid, opss);
+			atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
+			break;
+		}
 
-	enq_flags |= (p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
-	dsq = find_dsq_for_dispatch(task_rq(p), p->scx.ddsp_dsq_id, p);
-	dispatch_enqueue(dsq, p, enq_flags);
+		WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
+		list_add_tail(&p->scx.dsq_list.node,
+			      &rq->scx.ddsp_deferred_locals);
+		schedule_deferred(rq);
+		return;
+	}
+
+dispatch:
+	dsq = find_dsq_for_dispatch(rq, dsq_id, p);
+	dispatch_enqueue(dsq, p, p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
 }
 
 static bool scx_rq_online(struct rq *rq)
@@ -2601,6 +2704,29 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
 	}
 }
 
+static void process_ddsp_deferred_locals(struct rq *rq)
+{
+	struct task_struct *p, *tmp;
+
+	lockdep_assert_rq_held(rq);
+
+	/*
+	 * Now that @rq can be unlocked, execute the deferred enqueueing of
+	 * tasks directly dispatched to the local DSQs of other CPUs. See
+	 * direct_dispatch().
+	 */
+	list_for_each_entry_safe(p, tmp, &rq->scx.ddsp_deferred_locals,
+				 scx.dsq_list.node) {
+		s32 ret;
+
+		list_del_init(&p->scx.dsq_list.node);
+
+		ret = dispatch_to_local_dsq(rq, p->scx.ddsp_dsq_id, p,
+					    p->scx.ddsp_enq_flags);
+		WARN_ON_ONCE(ret == DTL_NOT_LOCAL);
+	}
+}
+
 static void put_prev_task_scx(struct rq *rq, struct task_struct *p)
 {
 #ifndef CONFIG_SMP
@@ -3022,6 +3148,11 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
 	}
 }
 
+static void task_woken_scx(struct rq *rq, struct task_struct *p)
+{
+	run_deferred(rq);
+}
+
 static void set_cpus_allowed_scx(struct task_struct *p,
 				 struct affinity_context *ac)
 {
@@ -3538,8 +3669,6 @@ bool scx_can_stop_tick(struct rq *rq)
  *
  * - task_fork/dead: We need fork/dead notifications for all tasks regardless of
  *   their current sched_class. Call them directly from sched core instead.
-
- * - task_woken: Unnecessary.
  */
 DEFINE_SCHED_CLASS(ext) = {
 	.enqueue_task		= enqueue_task_scx,
@@ -3559,6 +3688,7 @@ DEFINE_SCHED_CLASS(ext) = {
 #ifdef CONFIG_SMP
 	.balance		= balance_scx,
 	.select_task_rq		= select_task_rq_scx,
+	.task_woken		= task_woken_scx,
 	.set_cpus_allowed	= set_cpus_allowed_scx,
 
 	.rq_online		= rq_online_scx,
@@ -5263,11 +5393,13 @@ void __init init_sched_ext_class(void)
 
 		init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
 		INIT_LIST_HEAD(&rq->scx.runnable_list);
+		INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
 
 		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL));
 		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL));
 		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL));
 		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL));
+		init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn);
 		init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);
 
 		if (cpu_online(cpu))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8a0e8052f6b03..be7be54484c08 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -746,6 +746,7 @@ enum scx_rq_flags {
 struct scx_rq {
 	struct scx_dispatch_q	local_dsq;
 	struct list_head	runnable_list;		/* runnable tasks on this rq */
+	struct list_head	ddsp_deferred_locals;	/* deferred ddsps from enq */
 	unsigned long		ops_qseq;
 	u64			extra_enq_flags;	/* see move_task_to_local_dsq() */
 	u32			nr_running;
@@ -757,6 +758,8 @@ struct scx_rq {
 	cpumask_var_t		cpus_to_preempt;
 	cpumask_var_t		cpus_to_wait;
 	unsigned long		pnt_seq;
+	struct balance_callback	deferred_bal_cb;
+	struct irq_work		deferred_irq_work;
 	struct irq_work		kick_cpus_irq_work;
 };
 #endif /* CONFIG_SCHED_CLASS_EXT */
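
Illustrative usage note (not part of the patch): with the change above, a BPF
scheduler may issue an SCX_DSQ_LOCAL_ON verdict straight from ops.enqueue();
the kernel parks the task on ddsp_deferred_locals and completes the remote
local-DSQ enqueue once the current rq can be unlocked. The sketch below is a
hypothetical, toy enqueue callback, assuming the usual tools/sched_ext BPF
helpers (scx/common.bpf.h providing BPF_STRUCT_OPS, scx_bpf_dispatch() and
SCX_SLICE_DFL); the fixed target of CPU 0 and the "localon" names are purely
for illustration.

#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

/*
 * Toy enqueue path: funnel any task allowed to run on CPU 0 directly into
 * CPU 0's local DSQ, and everything else into the global DSQ. Before this
 * patch, the SCX_DSQ_LOCAL_ON dispatch below would have been rejected with
 * scx_ops_error() when issued from ops.enqueue().
 */
void BPF_STRUCT_OPS(localon_enqueue, struct task_struct *p, u64 enq_flags)
{
	s32 cpu = 0;	/* arbitrary illustrative target CPU */

	if (bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL,
				 enq_flags);
	else
		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}

SEC(".struct_ops.link")
struct sched_ext_ops localon_ops = {
	.enqueue	= (void *)localon_enqueue,
	.name		= "localon",
};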