workqueue: Unbind kworkers before sending them to exit()
It has been reported that isolated CPUs can suffer from interference due to
per-CPU kworkers waking up just to die.

A surge of workqueue activity during initial setup of a latency-sensitive
application (refresh_vm_stats() being one of the culprits) can cause extra
per-CPU kworkers to be spawned. Then, said latency-sensitive task can be
running merrily on an isolated CPU only to be interrupted sometime later by
a kworker marked for death (cf. IDLE_WORKER_TIMEOUT, 5 minutes after last
kworker activity).
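
For reference, the timeout referred to here is defined in kernel/workqueue.c along these lines:

	IDLE_WORKER_TIMEOUT	= 300 * HZ,	/* keep idle ones for 5 mins */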

Prevent this by re-affining kworkers to wq_unbound_cpumask (which doesn't
contain isolated CPUs, cf. HK_TYPE_WQ) after marking them with WORKER_DIE and
before waking them up.
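
The re-affining relies on the unbind_worker() helper that earlier patches in
this series factored out of the hotplug path; its effect is approximately the
following sketch (not the verbatim kernel code):

	static void unbind_worker(struct worker *worker)
	{
		lockdep_assert_held(&wq_pool_attach_mutex);

		/* drop the per-CPU binding and fall back to the housekeeping mask */
		kthread_set_per_cpu(worker->task, -1);
		if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
			WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
		else
			WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
	}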

Changing the affinity does require a sleepable context; leverage the newly
introduced pool->idle_cull_work to get that.
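
The split mentioned here means the timer callback itself stays non-sleeping
and merely kicks the work item. A simplified sketch of that arrangement,
assuming the idle_worker_timeout()/idle_cull_work pairing introduced earlier
in this series (the expiry bookkeeping and timer re-arming are elided):

	static void idle_worker_timeout(struct timer_list *t)
	{
		struct worker_pool *pool = from_timer(pool, t, idle_timer);
		bool do_cull = false;

		raw_spin_lock_irq(&pool->lock);
		/* surplus-worker check only; re-arming for not-yet-expired workers elided */
		do_cull = too_many_workers(pool);
		raw_spin_unlock_irq(&pool->lock);

		/* the actual culling needs to sleep, so punt it to a work item */
		if (do_cull)
			queue_work(system_unbound_wq, &pool->idle_cull_work);
	}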

Remove dying workers from pool->workers and keep track of them in a
separate list. This intentionally prevents for_each_pool_worker() from
iterating over workers that are marked for death.
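
For reference, for_each_pool_worker() iterates pool->workers via worker->node,
so moving a dying worker's node onto pool->dying_workers takes it out of that
walk; the macro reads roughly as:

	#define for_each_pool_worker(worker, pool)				\
		list_for_each_entry((worker), &(pool)->workers, node)		\
			if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \
			else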

Rename destroy_worker() to set_worker_dying() to better reflect its effects
and its relationship with wake_dying_workers().
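
Taken together, the cull path ends up with the shape below. This is
idle_cull_fn() from the two hunks further down, stitched together, with the
idle-expiry check against IDLE_WORKER_TIMEOUT elided for brevity:

	static void idle_cull_fn(struct work_struct *work)
	{
		struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work);
		struct list_head cull_list;

		INIT_LIST_HEAD(&cull_list);
		mutex_lock(&wq_pool_attach_mutex);
		raw_spin_lock_irq(&pool->lock);

		while (too_many_workers(pool)) {
			/* idle_list is kept in LIFO order, the last entry is the oldest */
			struct worker *worker = list_entry(pool->idle_list.prev,
							   struct worker, entry);

			/* idle-expiry check elided in this condensation */
			set_worker_dying(worker, &cull_list);	/* tag + move onto cull_list */
		}

		raw_spin_unlock_irq(&pool->lock);
		wake_dying_workers(&cull_list);		/* unbind_worker() + wake_up_process() */
		mutex_unlock(&wq_pool_attach_mutex);
	}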

Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Valentin Schneider authored and Tejun Heo committed Jan 12, 2023
1 parent 9ab03be commit e02b931
Showing 1 changed file with 60 additions and 12 deletions.
kernel/workqueue.c: 72 changes (60 additions & 12 deletions)
@@ -179,6 +179,7 @@ struct worker_pool {
 
 	struct worker		*manager;	/* L: purely informational */
 	struct list_head	workers;	/* A: attached workers */
+	struct list_head	dying_workers;	/* A: workers about to die */
 	struct completion	*detach_completion; /* all workers detached */
 
 	struct ida		worker_ida;	/* worker IDs for task name */
@@ -1906,7 +1907,7 @@ static void worker_detach_from_pool(struct worker *worker)
 	list_del(&worker->node);
 	worker->pool = NULL;
 
-	if (list_empty(&pool->workers))
+	if (list_empty(&pool->workers) && list_empty(&pool->dying_workers))
 		detach_completion = pool->detach_completion;
 	mutex_unlock(&wq_pool_attach_mutex);
 
@@ -1995,21 +1996,44 @@ static void rebind_worker(struct worker *worker, struct worker_pool *pool)
 	WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask) < 0);
 }
 
+static void wake_dying_workers(struct list_head *cull_list)
+{
+	struct worker *worker, *tmp;
+
+	list_for_each_entry_safe(worker, tmp, cull_list, entry) {
+		list_del_init(&worker->entry);
+		unbind_worker(worker);
+		/*
+		 * If the worker was somehow already running, then it had to be
+		 * in pool->idle_list when set_worker_dying() happened or we
+		 * wouldn't have gotten here.
+		 *
+		 * Thus, the worker must either have observed the WORKER_DIE
+		 * flag, or have set its state to TASK_IDLE. Either way, the
+		 * below will be observed by the worker and is safe to do
+		 * outside of pool->lock.
+		 */
+		wake_up_process(worker->task);
+	}
+}
+
 /**
- * destroy_worker - destroy a workqueue worker
+ * set_worker_dying - Tag a worker for destruction
  * @worker: worker to be destroyed
+ * @list: transfer worker away from its pool->idle_list and into list
  *
- * Destroy @worker and adjust @pool stats accordingly. The worker should
- * be idle.
+ * Tag @worker for destruction and adjust @pool stats accordingly. The worker
+ * should be idle.
  *
  * CONTEXT:
  * raw_spin_lock_irq(pool->lock).
  */
-static void destroy_worker(struct worker *worker)
+static void set_worker_dying(struct worker *worker, struct list_head *list)
 {
 	struct worker_pool *pool = worker->pool;
 
 	lockdep_assert_held(&pool->lock);
+	lockdep_assert_held(&wq_pool_attach_mutex);
 
 	/* sanity check frenzy */
 	if (WARN_ON(worker->current_work) ||
@@ -2020,9 +2044,10 @@ static void destroy_worker(struct worker *worker)
 	pool->nr_workers--;
 	pool->nr_idle--;
 
-	list_del_init(&worker->entry);
 	worker->flags |= WORKER_DIE;
-	wake_up_process(worker->task);
+
+	list_move(&worker->entry, list);
+	list_move(&worker->node, &pool->dying_workers);
 }
 
 /**
@@ -2069,11 +2094,24 @@ static void idle_worker_timeout(struct timer_list *t)
  *
  * This goes through a pool's idle workers and gets rid of those that have been
  * idle for at least IDLE_WORKER_TIMEOUT seconds.
+ *
+ * We don't want to disturb isolated CPUs because of a pcpu kworker being
+ * culled, so this also resets worker affinity. This requires a sleepable
+ * context, hence the split between timer callback and work item.
  */
 static void idle_cull_fn(struct work_struct *work)
 {
 	struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work);
+	struct list_head cull_list;
 
+	INIT_LIST_HEAD(&cull_list);
+	/*
+	 * Grabbing wq_pool_attach_mutex here ensures an already-running worker
+	 * cannot proceed beyond worker_detach_from_pool() in its self-destruct
+	 * path. This is required as a previously-preempted worker could run after
+	 * set_worker_dying() has happened but before wake_dying_workers() did.
+	 */
+	mutex_lock(&wq_pool_attach_mutex);
 	raw_spin_lock_irq(&pool->lock);
 
 	while (too_many_workers(pool)) {
@@ -2088,10 +2126,12 @@ static void idle_cull_fn(struct work_struct *work)
 			break;
 		}
 
-		destroy_worker(worker);
+		set_worker_dying(worker, &cull_list);
 	}
 
 	raw_spin_unlock_irq(&pool->lock);
+	wake_dying_workers(&cull_list);
+	mutex_unlock(&wq_pool_attach_mutex);
 }
 
 static void send_mayday(struct work_struct *work)
@@ -2455,12 +2495,12 @@ static int worker_thread(void *__worker)
 	/* am I supposed to die? */
 	if (unlikely(worker->flags & WORKER_DIE)) {
 		raw_spin_unlock_irq(&pool->lock);
-		WARN_ON_ONCE(!list_empty(&worker->entry));
 		set_pf_worker(false);
 
 		set_task_comm(worker->task, "kworker/dying");
 		ida_free(&pool->worker_ida, worker->id);
 		worker_detach_from_pool(worker);
+		WARN_ON_ONCE(!list_empty(&worker->entry));
 		kfree(worker);
 		return 0;
 	}
@@ -3534,6 +3574,7 @@ static int init_worker_pool(struct worker_pool *pool)
 	timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
 
 	INIT_LIST_HEAD(&pool->workers);
+	INIT_LIST_HEAD(&pool->dying_workers);
 
 	ida_init(&pool->worker_ida);
 	INIT_HLIST_NODE(&pool->hash_node);
@@ -3622,8 +3663,11 @@ static void rcu_free_pool(struct rcu_head *rcu)
 static void put_unbound_pool(struct worker_pool *pool)
 {
 	DECLARE_COMPLETION_ONSTACK(detach_completion);
+	struct list_head cull_list;
 	struct worker *worker;
 
+	INIT_LIST_HEAD(&cull_list);
+
 	lockdep_assert_held(&wq_pool_mutex);
 
 	if (--pool->refcnt)
@@ -3656,21 +3700,25 @@ static void put_unbound_pool(struct worker_pool *pool)
 		rcuwait_wait_event(&manager_wait,
 				   !(pool->flags & POOL_MANAGER_ACTIVE),
 				   TASK_UNINTERRUPTIBLE);
+
+		mutex_lock(&wq_pool_attach_mutex);
 		raw_spin_lock_irq(&pool->lock);
 		if (!(pool->flags & POOL_MANAGER_ACTIVE)) {
 			pool->flags |= POOL_MANAGER_ACTIVE;
 			break;
 		}
 		raw_spin_unlock_irq(&pool->lock);
+		mutex_unlock(&wq_pool_attach_mutex);
 	}
 
 	while ((worker = first_idle_worker(pool)))
-		destroy_worker(worker);
+		set_worker_dying(worker, &cull_list);
 	WARN_ON(pool->nr_workers || pool->nr_idle);
 	raw_spin_unlock_irq(&pool->lock);
 
-	mutex_lock(&wq_pool_attach_mutex);
-	if (!list_empty(&pool->workers))
+	wake_dying_workers(&cull_list);
+
+	if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers))
 		pool->detach_completion = &detach_completion;
 	mutex_unlock(&wq_pool_attach_mutex);
 
