diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 259a4d5a4e8f4..843f2cdc280b9 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -275,6 +275,7 @@ static struct sysrq_key_op sysrq_showregs_op = { static void sysrq_handle_showstate(int key) { show_state(); + show_workqueue_state(); } static struct sysrq_key_op sysrq_showstate_op = { .handler = sysrq_handle_showstate, diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index f597846ff605c..deee212af8e09 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -454,6 +454,7 @@ extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); extern unsigned int work_busy(struct work_struct *work); extern __printf(1, 2) void set_worker_desc(const char *fmt, ...); extern void print_worker_info(const char *log_lvl, struct task_struct *task); +extern void show_workqueue_state(void); /** * queue_work - queue work on a workqueue diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 41ff75b478c60..586ad91300b0f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -159,6 +159,7 @@ struct worker_pool { /* see manage_workers() for details on the two manager mutexes */ struct mutex manager_arb; /* manager arbitration */ + struct worker *manager; /* L: purely informational */ struct mutex attach_mutex; /* attach/detach exclusion */ struct list_head workers; /* A: attached workers */ struct completion *detach_completion; /* all workers detached */ @@ -230,7 +231,7 @@ struct wq_device; */ struct workqueue_struct { struct list_head pwqs; /* WR: all pwqs of this wq */ - struct list_head list; /* PL: list of all workqueues */ + struct list_head list; /* PR: list of all workqueues */ struct mutex mutex; /* protects this wq */ int work_color; /* WQ: current work color */ @@ -257,6 +258,13 @@ struct workqueue_struct { #endif char name[WQ_NAME_LEN]; /* I: workqueue name */ + /* + * Destruction of workqueue_struct is sched-RCU protected to allow + * walking the workqueues list without grabbing wq_pool_mutex. + * This is used to dump all workqueues from sysrq. + */ + struct rcu_head rcu; + /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ @@ -288,7 +296,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ -static LIST_HEAD(workqueues); /* PL: list of all workqueues */ +static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? 
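The new rcu member and the PL to PR relabeling are the core of this change: writers still modify the workqueues list under wq_pool_mutex, but readers may now walk it under sched-RCU alone, which is what allows a sysrq handler to dump state at essentially any time. A minimal sketch of the resulting pattern, using this patch's names but illustrative rather than verbatim:

	/* writer: mutate under wq_pool_mutex with RCU-aware list ops */
	mutex_lock(&wq_pool_mutex);
	list_add_tail_rcu(&wq->list, &workqueues);
	mutex_unlock(&wq_pool_mutex);

	/* reader: no mutex needed, but may not sleep inside the section */
	rcu_read_lock_sched();
	list_for_each_entry_rcu(wq, &workqueues, list)
		pr_info("workqueue %s\n", wq->name);
	rcu_read_unlock_sched();

	/* teardown: unlink, then free only after a sched-RCU grace period */
	mutex_lock(&wq_pool_mutex);
	list_del_rcu(&wq->list);
	mutex_unlock(&wq_pool_mutex);
	call_rcu_sched(&wq->rcu, rcu_free_wq);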
*/ /* the per-cpu worker pools */ @@ -324,6 +332,7 @@ EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); static int worker_thread(void *__worker); static void copy_workqueue_attrs(struct workqueue_attrs *to, const struct workqueue_attrs *from); +static void workqueue_sysfs_unregister(struct workqueue_struct *wq); #define CREATE_TRACE_POINTS #include @@ -1911,9 +1920,11 @@ static bool manage_workers(struct worker *worker) */ if (!mutex_trylock(&pool->manager_arb)) return false; + pool->manager = worker; maybe_create_worker(pool); + pool->manager = NULL; mutex_unlock(&pool->manager_arb); return true; } @@ -2303,6 +2314,7 @@ static int rescuer_thread(void *__rescuer) struct wq_barrier { struct work_struct work; struct completion done; + struct task_struct *task; /* purely informational */ }; static void wq_barrier_func(struct work_struct *work) @@ -2351,6 +2363,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); init_completion(&barr->done); + barr->task = current; /* * If @target is currently being executed, schedule the @@ -2989,624 +3002,319 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew) } EXPORT_SYMBOL_GPL(execute_in_process_context); -#ifdef CONFIG_SYSFS -/* - * Workqueues with WQ_SYSFS flag set is visible to userland via - * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the - * following attributes. - * - * per_cpu RO bool : whether the workqueue is per-cpu or unbound - * max_active RW int : maximum number of in-flight work items - * - * Unbound workqueues have the following extra attributes. +/** + * free_workqueue_attrs - free a workqueue_attrs + * @attrs: workqueue_attrs to free * - * id RO int : the associated pool ID - * nice RW int : nice value of the workers - * cpumask RW mask : bitmask of allowed CPUs for the workers + * Undo alloc_workqueue_attrs(). */ -struct wq_device { - struct workqueue_struct *wq; - struct device dev; -}; - -static struct workqueue_struct *dev_to_wq(struct device *dev) +void free_workqueue_attrs(struct workqueue_attrs *attrs) { - struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); - - return wq_dev->wq; + if (attrs) { + free_cpumask_var(attrs->cpumask); + kfree(attrs); + } } -static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, - char *buf) +/** + * alloc_workqueue_attrs - allocate a workqueue_attrs + * @gfp_mask: allocation mask to use + * + * Allocate a new workqueue_attrs, initialize with default settings and + * return it. + * + * Return: The allocated new workqueue_attr on success. %NULL on failure. 
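The typical consumer of this alloc/free pair is short-lived: allocate, adjust, apply, free. A hedged sketch against a hypothetical unbound workqueue (apply_workqueue_attrs() copies the attrs, so freeing right away is safe, exactly as the sysfs store helpers later in this patch do):

	struct workqueue_attrs *attrs;
	int ret = -ENOMEM;

	attrs = alloc_workqueue_attrs(GFP_KERNEL);
	if (attrs) {
		attrs->nice = -5;		/* hypothetical tuning */
		ret = apply_workqueue_attrs(wq, attrs);
		free_workqueue_attrs(attrs);	/* attrs were copied */
	}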
+ */ +struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) { - struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; - return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); + attrs = kzalloc(sizeof(*attrs), gfp_mask); + if (!attrs) + goto fail; + if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask)) + goto fail; + + cpumask_copy(attrs->cpumask, cpu_possible_mask); + return attrs; +fail: + free_workqueue_attrs(attrs); + return NULL; } -static DEVICE_ATTR_RO(per_cpu); -static ssize_t max_active_show(struct device *dev, - struct device_attribute *attr, char *buf) +static void copy_workqueue_attrs(struct workqueue_attrs *to, + const struct workqueue_attrs *from) { - struct workqueue_struct *wq = dev_to_wq(dev); - - return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); + to->nice = from->nice; + cpumask_copy(to->cpumask, from->cpumask); + /* + * Unlike hash and equality test, this function doesn't ignore + * ->no_numa as it is used for both pool and wq attrs. Instead, + * get_unbound_pool() explicitly clears ->no_numa after copying. + */ + to->no_numa = from->no_numa; } -static ssize_t max_active_store(struct device *dev, - struct device_attribute *attr, const char *buf, - size_t count) +/* hash value of the content of @attr */ +static u32 wqattrs_hash(const struct workqueue_attrs *attrs) { - struct workqueue_struct *wq = dev_to_wq(dev); - int val; - - if (sscanf(buf, "%d", &val) != 1 || val <= 0) - return -EINVAL; + u32 hash = 0; - workqueue_set_max_active(wq, val); - return count; + hash = jhash_1word(attrs->nice, hash); + hash = jhash(cpumask_bits(attrs->cpumask), + BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); + return hash; } -static DEVICE_ATTR_RW(max_active); - -static struct attribute *wq_sysfs_attrs[] = { - &dev_attr_per_cpu.attr, - &dev_attr_max_active.attr, - NULL, -}; -ATTRIBUTE_GROUPS(wq_sysfs); -static ssize_t wq_pool_ids_show(struct device *dev, - struct device_attribute *attr, char *buf) +/* content equality test */ +static bool wqattrs_equal(const struct workqueue_attrs *a, + const struct workqueue_attrs *b) { - struct workqueue_struct *wq = dev_to_wq(dev); - const char *delim = ""; - int node, written = 0; - - rcu_read_lock_sched(); - for_each_node(node) { - written += scnprintf(buf + written, PAGE_SIZE - written, - "%s%d:%d", delim, node, - unbound_pwq_by_node(wq, node)->pool->id); - delim = " "; - } - written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); - rcu_read_unlock_sched(); - - return written; + if (a->nice != b->nice) + return false; + if (!cpumask_equal(a->cpumask, b->cpumask)) + return false; + return true; } -static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, - char *buf) +/** + * init_worker_pool - initialize a newly zalloc'd worker_pool + * @pool: worker_pool to initialize + * + * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs. + * + * Return: 0 on success, -errno on failure. Even on failure, all fields + * inside @pool proper are initialized and put_unbound_pool() can be called + * on @pool safely to release it. 
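Note the asymmetry that the copy_workqueue_attrs() comment spells out: ->no_numa is copied, but wqattrs_hash() and wqattrs_equal() both ignore it, so two attrs differing only in that field resolve to the same unbound pool. Roughly, assuming a and b are two allocated attrs:

	copy_workqueue_attrs(b, a);
	b->no_numa = !a->no_numa;	/* the only difference */

	WARN_ON(wqattrs_hash(a) != wqattrs_hash(b));	/* same bucket */
	WARN_ON(!wqattrs_equal(a, b));			/* same pool */

get_unbound_pool() then clears ->no_numa on the pool's own copy, since it is a workqueue property, not a pool one.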
+ */ +static int init_worker_pool(struct worker_pool *pool) { - struct workqueue_struct *wq = dev_to_wq(dev); - int written; + spin_lock_init(&pool->lock); + pool->id = -1; + pool->cpu = -1; + pool->node = NUMA_NO_NODE; + pool->flags |= POOL_DISASSOCIATED; + INIT_LIST_HEAD(&pool->worklist); + INIT_LIST_HEAD(&pool->idle_list); + hash_init(pool->busy_hash); - mutex_lock(&wq->mutex); - written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); - mutex_unlock(&wq->mutex); + init_timer_deferrable(&pool->idle_timer); + pool->idle_timer.function = idle_worker_timeout; + pool->idle_timer.data = (unsigned long)pool; - return written; -} + setup_timer(&pool->mayday_timer, pool_mayday_timeout, + (unsigned long)pool); -/* prepare workqueue_attrs for sysfs store operations */ -static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) -{ - struct workqueue_attrs *attrs; + mutex_init(&pool->manager_arb); + mutex_init(&pool->attach_mutex); + INIT_LIST_HEAD(&pool->workers); - attrs = alloc_workqueue_attrs(GFP_KERNEL); - if (!attrs) - return NULL; + ida_init(&pool->worker_ida); + INIT_HLIST_NODE(&pool->hash_node); + pool->refcnt = 1; - mutex_lock(&wq->mutex); - copy_workqueue_attrs(attrs, wq->unbound_attrs); - mutex_unlock(&wq->mutex); - return attrs; + /* shouldn't fail above this point */ + pool->attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!pool->attrs) + return -ENOMEM; + return 0; } -static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) +static void rcu_free_wq(struct rcu_head *rcu) { - struct workqueue_struct *wq = dev_to_wq(dev); - struct workqueue_attrs *attrs; - int ret; - - attrs = wq_sysfs_prep_attrs(wq); - if (!attrs) - return -ENOMEM; + struct workqueue_struct *wq = + container_of(rcu, struct workqueue_struct, rcu); - if (sscanf(buf, "%d", &attrs->nice) == 1 && - attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) - ret = apply_workqueue_attrs(wq, attrs); + if (!(wq->flags & WQ_UNBOUND)) + free_percpu(wq->cpu_pwqs); else - ret = -EINVAL; + free_workqueue_attrs(wq->unbound_attrs); - free_workqueue_attrs(attrs); - return ret ?: count; + kfree(wq->rescuer); + kfree(wq); } -static ssize_t wq_cpumask_show(struct device *dev, - struct device_attribute *attr, char *buf) +static void rcu_free_pool(struct rcu_head *rcu) { - struct workqueue_struct *wq = dev_to_wq(dev); - int written; + struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); - mutex_lock(&wq->mutex); - written = scnprintf(buf, PAGE_SIZE, "%*pb\n", - cpumask_pr_args(wq->unbound_attrs->cpumask)); - mutex_unlock(&wq->mutex); - return written; + ida_destroy(&pool->worker_ida); + free_workqueue_attrs(pool->attrs); + kfree(pool); } -static ssize_t wq_cpumask_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +/** + * put_unbound_pool - put a worker_pool + * @pool: worker_pool to put + * + * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU + * safe manner. get_unbound_pool() calls this function on its failure path + * and this function should be able to release pools which went through, + * successfully or not, init_worker_pool(). + * + * Should be called with wq_pool_mutex held. 
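A subtle consequence of rcu_free_wq() shows up in the destroy_workqueue() hunk further down: kthread_stop(wq->rescuer->task) still runs synchronously there, but the kfree(wq->rescuer) has moved here, one sched-RCU grace period later. That is what lets show_pwq() compare a busy worker against pwq->wq->rescuer and print "(RESCUER)" without racing against the free. Condensed:

	if (wq->rescuer)
		kthread_stop(wq->rescuer->task);
	/* wq->rescuer itself is now freed by rcu_free_wq(), so a
	 * concurrent sched-RCU dumper can still dereference it */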
+ */ +static void put_unbound_pool(struct worker_pool *pool) { - struct workqueue_struct *wq = dev_to_wq(dev); - struct workqueue_attrs *attrs; - int ret; + DECLARE_COMPLETION_ONSTACK(detach_completion); + struct worker *worker; - attrs = wq_sysfs_prep_attrs(wq); - if (!attrs) - return -ENOMEM; + lockdep_assert_held(&wq_pool_mutex); - ret = cpumask_parse(buf, attrs->cpumask); - if (!ret) - ret = apply_workqueue_attrs(wq, attrs); + if (--pool->refcnt) + return; - free_workqueue_attrs(attrs); - return ret ?: count; -} + /* sanity checks */ + if (WARN_ON(!(pool->cpu < 0)) || + WARN_ON(!list_empty(&pool->worklist))) + return; -static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - int written; - - mutex_lock(&wq->mutex); - written = scnprintf(buf, PAGE_SIZE, "%d\n", - !wq->unbound_attrs->no_numa); - mutex_unlock(&wq->mutex); - - return written; -} - -static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - struct workqueue_attrs *attrs; - int v, ret; - - attrs = wq_sysfs_prep_attrs(wq); - if (!attrs) - return -ENOMEM; + /* release id and unhash */ + if (pool->id >= 0) + idr_remove(&worker_pool_idr, pool->id); + hash_del(&pool->hash_node); - ret = -EINVAL; - if (sscanf(buf, "%d", &v) == 1) { - attrs->no_numa = !v; - ret = apply_workqueue_attrs(wq, attrs); - } + /* + * Become the manager and destroy all workers. Grabbing + * manager_arb prevents @pool's workers from blocking on + * attach_mutex. + */ + mutex_lock(&pool->manager_arb); - free_workqueue_attrs(attrs); - return ret ?: count; -} + spin_lock_irq(&pool->lock); + while ((worker = first_idle_worker(pool))) + destroy_worker(worker); + WARN_ON(pool->nr_workers || pool->nr_idle); + spin_unlock_irq(&pool->lock); -static struct device_attribute wq_sysfs_unbound_attrs[] = { - __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), - __ATTR(nice, 0644, wq_nice_show, wq_nice_store), - __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), - __ATTR(numa, 0644, wq_numa_show, wq_numa_store), - __ATTR_NULL, -}; + mutex_lock(&pool->attach_mutex); + if (!list_empty(&pool->workers)) + pool->detach_completion = &detach_completion; + mutex_unlock(&pool->attach_mutex); -static struct bus_type wq_subsys = { - .name = "workqueue", - .dev_groups = wq_sysfs_groups, -}; + if (pool->detach_completion) + wait_for_completion(pool->detach_completion); -static int __init wq_sysfs_init(void) -{ - return subsys_virtual_register(&wq_subsys, NULL); -} -core_initcall(wq_sysfs_init); + mutex_unlock(&pool->manager_arb); -static void wq_device_release(struct device *dev) -{ - struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); + /* shut down the timers */ + del_timer_sync(&pool->idle_timer); + del_timer_sync(&pool->mayday_timer); - kfree(wq_dev); + /* sched-RCU protected to allow dereferences from get_work_pool() */ + call_rcu_sched(&pool->rcu, rcu_free_pool); } /** - * workqueue_sysfs_register - make a workqueue visible in sysfs - * @wq: the workqueue to register + * get_unbound_pool - get a worker_pool with the specified attributes + * @attrs: the attributes of the worker_pool to get * - * Expose @wq in sysfs under /sys/bus/workqueue/devices. - * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set - * which is the preferred method. + * Obtain a worker_pool which has the same attributes as @attrs, bump the + * reference count and return it. 
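One detail worth calling out in put_unbound_pool(): pool->refcnt is a plain int rather than an atomic_t or kref. That is safe only because every get_unbound_pool()/put_unbound_pool() pair runs under wq_pool_mutex, which the lockdep assertion both documents and enforces. Callers follow this shape (see pwq_unbound_release_workfn() below):

	mutex_lock(&wq_pool_mutex);
	put_unbound_pool(pool);		/* may sleep waiting for all
					 * workers to detach */
	mutex_unlock(&wq_pool_mutex);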
If there already is a matching + * worker_pool, it will be used; otherwise, this function attempts to + * create a new one. * - * Workqueue user should use this function directly iff it wants to apply - * workqueue_attrs before making the workqueue visible in sysfs; otherwise, - * apply_workqueue_attrs() may race against userland updating the - * attributes. + * Should be called with wq_pool_mutex held. * - * Return: 0 on success, -errno on failure. + * Return: On success, a worker_pool with the same attributes as @attrs. + * On failure, %NULL. */ -int workqueue_sysfs_register(struct workqueue_struct *wq) +static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) { - struct wq_device *wq_dev; - int ret; + u32 hash = wqattrs_hash(attrs); + struct worker_pool *pool; + int node; - /* - * Adjusting max_active or creating new pwqs by applyting - * attributes breaks ordering guarantee. Disallow exposing ordered - * workqueues. - */ - if (WARN_ON(wq->flags & __WQ_ORDERED)) - return -EINVAL; + lockdep_assert_held(&wq_pool_mutex); - wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); - if (!wq_dev) - return -ENOMEM; + /* do we already have a matching pool? */ + hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { + if (wqattrs_equal(pool->attrs, attrs)) { + pool->refcnt++; + return pool; + } + } - wq_dev->wq = wq; - wq_dev->dev.bus = &wq_subsys; - wq_dev->dev.init_name = wq->name; - wq_dev->dev.release = wq_device_release; + /* nope, create a new one */ + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool || init_worker_pool(pool) < 0) + goto fail; + + lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ + copy_workqueue_attrs(pool->attrs, attrs); /* - * unbound_attrs are created separately. Suppress uevent until - * everything is ready. + * no_numa isn't a worker_pool attribute, always clear it. See + * 'struct workqueue_attrs' comments for detail. */ - dev_set_uevent_suppress(&wq_dev->dev, true); - - ret = device_register(&wq_dev->dev); - if (ret) { - kfree(wq_dev); - wq->wq_dev = NULL; - return ret; - } - - if (wq->flags & WQ_UNBOUND) { - struct device_attribute *attr; + pool->attrs->no_numa = false; - for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { - ret = device_create_file(&wq_dev->dev, attr); - if (ret) { - device_unregister(&wq_dev->dev); - wq->wq_dev = NULL; - return ret; + /* if cpumask is contained inside a NUMA node, we belong to that node */ + if (wq_numa_enabled) { + for_each_node(node) { + if (cpumask_subset(pool->attrs->cpumask, + wq_numa_possible_cpumask[node])) { + pool->node = node; + break; } } } - dev_set_uevent_suppress(&wq_dev->dev, false); - kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); - return 0; -} + if (worker_pool_assign_id(pool) < 0) + goto fail; -/** - * workqueue_sysfs_unregister - undo workqueue_sysfs_register() - * @wq: the workqueue to unregister - * - * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. 
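As the comment above says, the preferred route into sysfs is to pass WQ_SYSFS at creation and let alloc_workqueue() register the device; calling workqueue_sysfs_register() by hand is only for callers that must apply attrs before userspace can see, and race with, the attribute files. With a hypothetical name:

	wq = alloc_workqueue("my_unbound_wq", WQ_UNBOUND | WQ_SYSFS, 0);
	/* appears as /sys/bus/workqueue/devices/my_unbound_wq/ */

Ordered workqueues are rejected outright, since max_active or attrs changes from sysfs would break their ordering guarantee.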
- */ -static void workqueue_sysfs_unregister(struct workqueue_struct *wq) -{ - struct wq_device *wq_dev = wq->wq_dev; + /* create and start the initial worker */ + if (!create_worker(pool)) + goto fail; - if (!wq->wq_dev) - return; + /* install */ + hash_add(unbound_pool_hash, &pool->hash_node, hash); - wq->wq_dev = NULL; - device_unregister(&wq_dev->dev); + return pool; +fail: + if (pool) + put_unbound_pool(pool); + return NULL; } -#else /* CONFIG_SYSFS */ -static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } -#endif /* CONFIG_SYSFS */ -/** - * free_workqueue_attrs - free a workqueue_attrs - * @attrs: workqueue_attrs to free - * - * Undo alloc_workqueue_attrs(). - */ -void free_workqueue_attrs(struct workqueue_attrs *attrs) +static void rcu_free_pwq(struct rcu_head *rcu) { - if (attrs) { - free_cpumask_var(attrs->cpumask); - kfree(attrs); - } + kmem_cache_free(pwq_cache, + container_of(rcu, struct pool_workqueue, rcu)); } -/** - * alloc_workqueue_attrs - allocate a workqueue_attrs - * @gfp_mask: allocation mask to use - * - * Allocate a new workqueue_attrs, initialize with default settings and - * return it. - * - * Return: The allocated new workqueue_attr on success. %NULL on failure. +/* + * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt + * and needs to be destroyed. */ -struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) +static void pwq_unbound_release_workfn(struct work_struct *work) { - struct workqueue_attrs *attrs; + struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, + unbound_release_work); + struct workqueue_struct *wq = pwq->wq; + struct worker_pool *pool = pwq->pool; + bool is_last; - attrs = kzalloc(sizeof(*attrs), gfp_mask); - if (!attrs) - goto fail; - if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask)) - goto fail; + if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) + return; - cpumask_copy(attrs->cpumask, cpu_possible_mask); - return attrs; -fail: - free_workqueue_attrs(attrs); - return NULL; -} + mutex_lock(&wq->mutex); + list_del_rcu(&pwq->pwqs_node); + is_last = list_empty(&wq->pwqs); + mutex_unlock(&wq->mutex); + + mutex_lock(&wq_pool_mutex); + put_unbound_pool(pool); + mutex_unlock(&wq_pool_mutex); + + call_rcu_sched(&pwq->rcu, rcu_free_pwq); -static void copy_workqueue_attrs(struct workqueue_attrs *to, - const struct workqueue_attrs *from) -{ - to->nice = from->nice; - cpumask_copy(to->cpumask, from->cpumask); /* - * Unlike hash and equality test, this function doesn't ignore - * ->no_numa as it is used for both pool and wq attrs. Instead, - * get_unbound_pool() explicitly clears ->no_numa after copying. + * If we're the last pwq going away, @wq is already dead and no one + * is gonna access it anymore. Schedule RCU free. */ - to->no_numa = from->no_numa; -} - -/* hash value of the content of @attr */ -static u32 wqattrs_hash(const struct workqueue_attrs *attrs) -{ - u32 hash = 0; - - hash = jhash_1word(attrs->nice, hash); - hash = jhash(cpumask_bits(attrs->cpumask), - BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); - return hash; -} - -/* content equality test */ -static bool wqattrs_equal(const struct workqueue_attrs *a, - const struct workqueue_attrs *b) -{ - if (a->nice != b->nice) - return false; - if (!cpumask_equal(a->cpumask, b->cpumask)) - return false; - return true; -} - -/** - * init_worker_pool - initialize a newly zalloc'd worker_pool - * @pool: worker_pool to initialize - * - * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs. 
- * - * Return: 0 on success, -errno on failure. Even on failure, all fields - * inside @pool proper are initialized and put_unbound_pool() can be called - * on @pool safely to release it. - */ -static int init_worker_pool(struct worker_pool *pool) -{ - spin_lock_init(&pool->lock); - pool->id = -1; - pool->cpu = -1; - pool->node = NUMA_NO_NODE; - pool->flags |= POOL_DISASSOCIATED; - INIT_LIST_HEAD(&pool->worklist); - INIT_LIST_HEAD(&pool->idle_list); - hash_init(pool->busy_hash); - - init_timer_deferrable(&pool->idle_timer); - pool->idle_timer.function = idle_worker_timeout; - pool->idle_timer.data = (unsigned long)pool; - - setup_timer(&pool->mayday_timer, pool_mayday_timeout, - (unsigned long)pool); - - mutex_init(&pool->manager_arb); - mutex_init(&pool->attach_mutex); - INIT_LIST_HEAD(&pool->workers); - - ida_init(&pool->worker_ida); - INIT_HLIST_NODE(&pool->hash_node); - pool->refcnt = 1; - - /* shouldn't fail above this point */ - pool->attrs = alloc_workqueue_attrs(GFP_KERNEL); - if (!pool->attrs) - return -ENOMEM; - return 0; -} - -static void rcu_free_pool(struct rcu_head *rcu) -{ - struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); - - ida_destroy(&pool->worker_ida); - free_workqueue_attrs(pool->attrs); - kfree(pool); -} - -/** - * put_unbound_pool - put a worker_pool - * @pool: worker_pool to put - * - * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU - * safe manner. get_unbound_pool() calls this function on its failure path - * and this function should be able to release pools which went through, - * successfully or not, init_worker_pool(). - * - * Should be called with wq_pool_mutex held. - */ -static void put_unbound_pool(struct worker_pool *pool) -{ - DECLARE_COMPLETION_ONSTACK(detach_completion); - struct worker *worker; - - lockdep_assert_held(&wq_pool_mutex); - - if (--pool->refcnt) - return; - - /* sanity checks */ - if (WARN_ON(!(pool->cpu < 0)) || - WARN_ON(!list_empty(&pool->worklist))) - return; - - /* release id and unhash */ - if (pool->id >= 0) - idr_remove(&worker_pool_idr, pool->id); - hash_del(&pool->hash_node); - - /* - * Become the manager and destroy all workers. Grabbing - * manager_arb prevents @pool's workers from blocking on - * attach_mutex. - */ - mutex_lock(&pool->manager_arb); - - spin_lock_irq(&pool->lock); - while ((worker = first_idle_worker(pool))) - destroy_worker(worker); - WARN_ON(pool->nr_workers || pool->nr_idle); - spin_unlock_irq(&pool->lock); - - mutex_lock(&pool->attach_mutex); - if (!list_empty(&pool->workers)) - pool->detach_completion = &detach_completion; - mutex_unlock(&pool->attach_mutex); - - if (pool->detach_completion) - wait_for_completion(pool->detach_completion); - - mutex_unlock(&pool->manager_arb); - - /* shut down the timers */ - del_timer_sync(&pool->idle_timer); - del_timer_sync(&pool->mayday_timer); - - /* sched-RCU protected to allow dereferences from get_work_pool() */ - call_rcu_sched(&pool->rcu, rcu_free_pool); -} - -/** - * get_unbound_pool - get a worker_pool with the specified attributes - * @attrs: the attributes of the worker_pool to get - * - * Obtain a worker_pool which has the same attributes as @attrs, bump the - * reference count and return it. If there already is a matching - * worker_pool, it will be used; otherwise, this function attempts to - * create a new one. - * - * Should be called with wq_pool_mutex held. - * - * Return: On success, a worker_pool with the same attributes as @attrs. - * On failure, %NULL. 
- */ -static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) -{ - u32 hash = wqattrs_hash(attrs); - struct worker_pool *pool; - int node; - - lockdep_assert_held(&wq_pool_mutex); - - /* do we already have a matching pool? */ - hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { - if (wqattrs_equal(pool->attrs, attrs)) { - pool->refcnt++; - return pool; - } - } - - /* nope, create a new one */ - pool = kzalloc(sizeof(*pool), GFP_KERNEL); - if (!pool || init_worker_pool(pool) < 0) - goto fail; - - lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ - copy_workqueue_attrs(pool->attrs, attrs); - - /* - * no_numa isn't a worker_pool attribute, always clear it. See - * 'struct workqueue_attrs' comments for detail. - */ - pool->attrs->no_numa = false; - - /* if cpumask is contained inside a NUMA node, we belong to that node */ - if (wq_numa_enabled) { - for_each_node(node) { - if (cpumask_subset(pool->attrs->cpumask, - wq_numa_possible_cpumask[node])) { - pool->node = node; - break; - } - } - } - - if (worker_pool_assign_id(pool) < 0) - goto fail; - - /* create and start the initial worker */ - if (!create_worker(pool)) - goto fail; - - /* install */ - hash_add(unbound_pool_hash, &pool->hash_node, hash); - - return pool; -fail: - if (pool) - put_unbound_pool(pool); - return NULL; -} - -static void rcu_free_pwq(struct rcu_head *rcu) -{ - kmem_cache_free(pwq_cache, - container_of(rcu, struct pool_workqueue, rcu)); -} - -/* - * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt - * and needs to be destroyed. - */ -static void pwq_unbound_release_workfn(struct work_struct *work) -{ - struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, - unbound_release_work); - struct workqueue_struct *wq = pwq->wq; - struct worker_pool *pool = pwq->pool; - bool is_last; - - if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) - return; - - mutex_lock(&wq->mutex); - list_del_rcu(&pwq->pwqs_node); - is_last = list_empty(&wq->pwqs); - mutex_unlock(&wq->mutex); - - mutex_lock(&wq_pool_mutex); - put_unbound_pool(pool); - mutex_unlock(&wq_pool_mutex); - - call_rcu_sched(&pwq->rcu, rcu_free_pwq); - - /* - * If we're the last pwq going away, @wq is already dead and no one - * is gonna access it anymore. Free it. - */ - if (is_last) { - free_workqueue_attrs(wq->unbound_attrs); - kfree(wq); - } + if (is_last) + call_rcu_sched(&wq->rcu, rcu_free_wq); } /** @@ -4143,7 +3851,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, pwq_adjust_max_active(pwq); mutex_unlock(&wq->mutex); - list_add(&wq->list, &workqueues); + list_add_tail_rcu(&wq->list, &workqueues); mutex_unlock(&wq_pool_mutex); @@ -4199,24 +3907,20 @@ void destroy_workqueue(struct workqueue_struct *wq) * flushing is complete in case freeze races us. */ mutex_lock(&wq_pool_mutex); - list_del_init(&wq->list); + list_del_rcu(&wq->list); mutex_unlock(&wq_pool_mutex); workqueue_sysfs_unregister(wq); - if (wq->rescuer) { + if (wq->rescuer) kthread_stop(wq->rescuer->task); - kfree(wq->rescuer); - wq->rescuer = NULL; - } if (!(wq->flags & WQ_UNBOUND)) { /* * The base ref is never dropped on per-cpu pwqs. Directly - * free the pwqs and wq. + * schedule RCU free. */ - free_percpu(wq->cpu_pwqs); - kfree(wq); + call_rcu_sched(&wq->rcu, rcu_free_wq); } else { /* * We're the sole accessor of @wq at this point. Directly @@ -4437,13 +4141,173 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) } } -/* - * CPU hotplug. 
- * - * There are two challenges in supporting CPU hotplug. Firstly, there - * are a lot of assumptions on strong associations among work, pwq and - * pool which make migrating pending and scheduled works very - * difficult to implement without impacting hot paths. Secondly, +static void pr_cont_pool_info(struct worker_pool *pool) +{ + pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask); + if (pool->node != NUMA_NO_NODE) + pr_cont(" node=%d", pool->node); + pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice); +} + +static void pr_cont_work(bool comma, struct work_struct *work) +{ + if (work->func == wq_barrier_func) { + struct wq_barrier *barr; + + barr = container_of(work, struct wq_barrier, work); + + pr_cont("%s BAR(%d)", comma ? "," : "", + task_pid_nr(barr->task)); + } else { + pr_cont("%s %pf", comma ? "," : "", work->func); + } +} + +static void show_pwq(struct pool_workqueue *pwq) +{ + struct worker_pool *pool = pwq->pool; + struct work_struct *work; + struct worker *worker; + bool has_in_flight = false, has_pending = false; + int bkt; + + pr_info(" pwq %d:", pool->id); + pr_cont_pool_info(pool); + + pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active, + !list_empty(&pwq->mayday_node) ? " MAYDAY" : ""); + + hash_for_each(pool->busy_hash, bkt, worker, hentry) { + if (worker->current_pwq == pwq) { + has_in_flight = true; + break; + } + } + if (has_in_flight) { + bool comma = false; + + pr_info(" in-flight:"); + hash_for_each(pool->busy_hash, bkt, worker, hentry) { + if (worker->current_pwq != pwq) + continue; + + pr_cont("%s %d%s:%pf", comma ? "," : "", + task_pid_nr(worker->task), + worker == pwq->wq->rescuer ? "(RESCUER)" : "", + worker->current_func); + list_for_each_entry(work, &worker->scheduled, entry) + pr_cont_work(false, work); + comma = true; + } + pr_cont("\n"); + } + + list_for_each_entry(work, &pool->worklist, entry) { + if (get_work_pwq(work) == pwq) { + has_pending = true; + break; + } + } + if (has_pending) { + bool comma = false; + + pr_info(" pending:"); + list_for_each_entry(work, &pool->worklist, entry) { + if (get_work_pwq(work) != pwq) + continue; + + pr_cont_work(comma, work); + comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); + } + pr_cont("\n"); + } + + if (!list_empty(&pwq->delayed_works)) { + bool comma = false; + + pr_info(" delayed:"); + list_for_each_entry(work, &pwq->delayed_works, entry) { + pr_cont_work(comma, work); + comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); + } + pr_cont("\n"); + } +} + +/** + * show_workqueue_state - dump workqueue state + * + * Called from a sysrq handler and prints out all busy workqueues and + * pools. 
+ */ +void show_workqueue_state(void) +{ + struct workqueue_struct *wq; + struct worker_pool *pool; + unsigned long flags; + int pi; + + rcu_read_lock_sched(); + + pr_info("Showing busy workqueues and worker pools:\n"); + + list_for_each_entry_rcu(wq, &workqueues, list) { + struct pool_workqueue *pwq; + bool idle = true; + + for_each_pwq(pwq, wq) { + if (pwq->nr_active || !list_empty(&pwq->delayed_works)) { + idle = false; + break; + } + } + if (idle) + continue; + + pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); + + for_each_pwq(pwq, wq) { + spin_lock_irqsave(&pwq->pool->lock, flags); + if (pwq->nr_active || !list_empty(&pwq->delayed_works)) + show_pwq(pwq); + spin_unlock_irqrestore(&pwq->pool->lock, flags); + } + } + + for_each_pool(pool, pi) { + struct worker *worker; + bool first = true; + + spin_lock_irqsave(&pool->lock, flags); + if (pool->nr_workers == pool->nr_idle) + goto next_pool; + + pr_info("pool %d:", pool->id); + pr_cont_pool_info(pool); + pr_cont(" workers=%d", pool->nr_workers); + if (pool->manager) + pr_cont(" manager: %d", + task_pid_nr(pool->manager->task)); + list_for_each_entry(worker, &pool->idle_list, entry) { + pr_cont(" %s%d", first ? "idle: " : "", + task_pid_nr(worker->task)); + first = false; + } + pr_cont("\n"); + next_pool: + spin_unlock_irqrestore(&pool->lock, flags); + } + + rcu_read_unlock_sched(); +} + +/* + * CPU hotplug. + * + * There are two challenges in supporting CPU hotplug. Firstly, there + * are a lot of assumptions on strong associations among work, pwq and + * pool which make migrating pending and scheduled works very + * difficult to implement without impacting hot paths. Secondly, * worker pools serve mix of short, long and very long running works making * blocked draining impractical. * @@ -4637,202 +4501,519 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb, else if (pool->cpu < 0) restore_unbound_workers_cpumask(pool, cpu); - mutex_unlock(&pool->attach_mutex); - } + mutex_unlock(&pool->attach_mutex); + } + + /* update NUMA affinity of unbound workqueues */ + list_for_each_entry(wq, &workqueues, list) + wq_update_unbound_numa(wq, cpu, true); + + mutex_unlock(&wq_pool_mutex); + break; + } + return NOTIFY_OK; +} + +/* + * Workqueues should be brought down after normal priority CPU notifiers. + * This will be registered as low priority CPU notifier. 
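Putting the pr_info()/pr_cont() calls above together, a sysrq-t dump comes out roughly as below. The sample is entirely hypothetical (pids, pool ids and work functions invented); only the shape follows from the format strings:

	Showing busy workqueues and worker pools:
	workqueue events: flags=0x0
	  pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=1/256
	    in-flight: 19:some_work_fn
	    pending: another_work_fn, BAR(1034)
	pool 2: cpus=1 node=0 flags=0x0 nice=0 workers=2 manager: 18 idle: 17

A BAR(n) entry is a wq_barrier queued by flush_work() and friends; thanks to the new barr->task field it names the pid that is blocked waiting for the flush to complete.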
+ */ +static int workqueue_cpu_down_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int cpu = (unsigned long)hcpu; + struct work_struct unbind_work; + struct workqueue_struct *wq; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + /* unbinding per-cpu workers should happen on the local CPU */ + INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); + queue_work_on(cpu, system_highpri_wq, &unbind_work); + + /* update NUMA affinity of unbound workqueues */ + mutex_lock(&wq_pool_mutex); + list_for_each_entry(wq, &workqueues, list) + wq_update_unbound_numa(wq, cpu, false); + mutex_unlock(&wq_pool_mutex); + + /* wait for per-cpu unbinding to finish */ + flush_work(&unbind_work); + destroy_work_on_stack(&unbind_work); + break; + } + return NOTIFY_OK; +} + +#ifdef CONFIG_SMP + +struct work_for_cpu { + struct work_struct work; + long (*fn)(void *); + void *arg; + long ret; +}; + +static void work_for_cpu_fn(struct work_struct *work) +{ + struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work); + + wfc->ret = wfc->fn(wfc->arg); +} + +/** + * work_on_cpu - run a function in user context on a particular cpu + * @cpu: the cpu to run on + * @fn: the function to run + * @arg: the function arg + * + * It is up to the caller to ensure that the cpu doesn't go offline. + * The caller must not hold any locks which would prevent @fn from completing. + * + * Return: The value @fn returns. + */ +long work_on_cpu(int cpu, long (*fn)(void *), void *arg) +{ + struct work_for_cpu wfc = { .fn = fn, .arg = arg }; + + INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); + schedule_work_on(cpu, &wfc.work); + flush_work(&wfc.work); + destroy_work_on_stack(&wfc.work); + return wfc.ret; +} +EXPORT_SYMBOL_GPL(work_on_cpu); +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_FREEZER + +/** + * freeze_workqueues_begin - begin freezing workqueues + * + * Start freezing workqueues. After this function returns, all freezable + * workqueues will queue new works to their delayed_works list instead of + * pool->worklist. + * + * CONTEXT: + * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. + */ +void freeze_workqueues_begin(void) +{ + struct workqueue_struct *wq; + struct pool_workqueue *pwq; + + mutex_lock(&wq_pool_mutex); + + WARN_ON_ONCE(workqueue_freezing); + workqueue_freezing = true; + + list_for_each_entry(wq, &workqueues, list) { + mutex_lock(&wq->mutex); + for_each_pwq(pwq, wq) + pwq_adjust_max_active(pwq); + mutex_unlock(&wq->mutex); + } + + mutex_unlock(&wq_pool_mutex); +} + +/** + * freeze_workqueues_busy - are freezable workqueues still busy? + * + * Check whether freezing is complete. This function must be called + * between freeze_workqueues_begin() and thaw_workqueues(). + * + * CONTEXT: + * Grabs and releases wq_pool_mutex. + * + * Return: + * %true if some freezable workqueues are still busy. %false if freezing + * is complete. + */ +bool freeze_workqueues_busy(void) +{ + bool busy = false; + struct workqueue_struct *wq; + struct pool_workqueue *pwq; + + mutex_lock(&wq_pool_mutex); + + WARN_ON_ONCE(!workqueue_freezing); + + list_for_each_entry(wq, &workqueues, list) { + if (!(wq->flags & WQ_FREEZABLE)) + continue; + /* + * nr_active is monotonically decreasing. It's safe + * to peek without lock. 
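work_on_cpu(), moved verbatim in this hunk, is fully synchronous: it queues the function on the target CPU's per-cpu workqueue, flushes, and hands back the return value. A minimal hypothetical use:

	static long where_am_i(void *unused)
	{
		return smp_processor_id();	/* runs in a worker bound to @cpu */
	}

	/* caller is responsible for keeping CPU 2 online across the call */
	long ret = work_on_cpu(2, where_am_i, NULL);	/* ret == 2 */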
+ */ + rcu_read_lock_sched(); + for_each_pwq(pwq, wq) { + WARN_ON_ONCE(pwq->nr_active < 0); + if (pwq->nr_active) { + busy = true; + rcu_read_unlock_sched(); + goto out_unlock; + } + } + rcu_read_unlock_sched(); + } +out_unlock: + mutex_unlock(&wq_pool_mutex); + return busy; +} + +/** + * thaw_workqueues - thaw workqueues + * + * Thaw workqueues. Normal queueing is restored and all collected + * frozen works are transferred to their respective pool worklists. + * + * CONTEXT: + * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. + */ +void thaw_workqueues(void) +{ + struct workqueue_struct *wq; + struct pool_workqueue *pwq; + + mutex_lock(&wq_pool_mutex); + + if (!workqueue_freezing) + goto out_unlock; + + workqueue_freezing = false; + + /* restore max_active and repopulate worklist */ + list_for_each_entry(wq, &workqueues, list) { + mutex_lock(&wq->mutex); + for_each_pwq(pwq, wq) + pwq_adjust_max_active(pwq); + mutex_unlock(&wq->mutex); + } + +out_unlock: + mutex_unlock(&wq_pool_mutex); +} +#endif /* CONFIG_FREEZER */ + +#ifdef CONFIG_SYSFS +/* + * Workqueues with WQ_SYSFS flag set is visible to userland via + * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the + * following attributes. + * + * per_cpu RO bool : whether the workqueue is per-cpu or unbound + * max_active RW int : maximum number of in-flight work items + * + * Unbound workqueues have the following extra attributes. + * + * id RO int : the associated pool ID + * nice RW int : nice value of the workers + * cpumask RW mask : bitmask of allowed CPUs for the workers + */ +struct wq_device { + struct workqueue_struct *wq; + struct device dev; +}; + +static struct workqueue_struct *dev_to_wq(struct device *dev) +{ + struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); + + return wq_dev->wq; +} + +static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); +} +static DEVICE_ATTR_RO(per_cpu); + +static ssize_t max_active_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); +} + +static ssize_t max_active_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int val; + + if (sscanf(buf, "%d", &val) != 1 || val <= 0) + return -EINVAL; + + workqueue_set_max_active(wq, val); + return count; +} +static DEVICE_ATTR_RW(max_active); + +static struct attribute *wq_sysfs_attrs[] = { + &dev_attr_per_cpu.attr, + &dev_attr_max_active.attr, + NULL, +}; +ATTRIBUTE_GROUPS(wq_sysfs); + +static ssize_t wq_pool_ids_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + const char *delim = ""; + int node, written = 0; + + rcu_read_lock_sched(); + for_each_node(node) { + written += scnprintf(buf + written, PAGE_SIZE - written, + "%s%d:%d", delim, node, + unbound_pwq_by_node(wq, node)->pool->id); + delim = " "; + } + written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); + rcu_read_unlock_sched(); + + return written; +} + +static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%d\n", 
wq->unbound_attrs->nice); + mutex_unlock(&wq->mutex); + + return written; +} + +/* prepare workqueue_attrs for sysfs store operations */ +static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) +{ + struct workqueue_attrs *attrs; + + attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!attrs) + return NULL; + + mutex_lock(&wq->mutex); + copy_workqueue_attrs(attrs, wq->unbound_attrs); + mutex_unlock(&wq->mutex); + return attrs; +} + +static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + if (sscanf(buf, "%d", &attrs->nice) == 1 && + attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) + ret = apply_workqueue_attrs(wq, attrs); + else + ret = -EINVAL; + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static ssize_t wq_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%*pb\n", + cpumask_pr_args(wq->unbound_attrs->cpumask)); + mutex_unlock(&wq->mutex); + return written; +} + +static ssize_t wq_cpumask_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + ret = cpumask_parse(buf, attrs->cpumask); + if (!ret) + ret = apply_workqueue_attrs(wq, attrs); + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; - /* update NUMA affinity of unbound workqueues */ - list_for_each_entry(wq, &workqueues, list) - wq_update_unbound_numa(wq, cpu, true); + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%d\n", + !wq->unbound_attrs->no_numa); + mutex_unlock(&wq->mutex); - mutex_unlock(&wq_pool_mutex); - break; - } - return NOTIFY_OK; + return written; } -/* - * Workqueues should be brought down after normal priority CPU notifiers. - * This will be registered as low priority CPU notifier. 
- */ -static int workqueue_cpu_down_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) { - int cpu = (unsigned long)hcpu; - struct work_struct unbind_work; - struct workqueue_struct *wq; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_PREPARE: - /* unbinding per-cpu workers should happen on the local CPU */ - INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); - queue_work_on(cpu, system_highpri_wq, &unbind_work); + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int v, ret; - /* update NUMA affinity of unbound workqueues */ - mutex_lock(&wq_pool_mutex); - list_for_each_entry(wq, &workqueues, list) - wq_update_unbound_numa(wq, cpu, false); - mutex_unlock(&wq_pool_mutex); + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; - /* wait for per-cpu unbinding to finish */ - flush_work(&unbind_work); - destroy_work_on_stack(&unbind_work); - break; + ret = -EINVAL; + if (sscanf(buf, "%d", &v) == 1) { + attrs->no_numa = !v; + ret = apply_workqueue_attrs(wq, attrs); } - return NOTIFY_OK; + + free_workqueue_attrs(attrs); + return ret ?: count; } -#ifdef CONFIG_SMP +static struct device_attribute wq_sysfs_unbound_attrs[] = { + __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), + __ATTR(nice, 0644, wq_nice_show, wq_nice_store), + __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), + __ATTR(numa, 0644, wq_numa_show, wq_numa_store), + __ATTR_NULL, +}; -struct work_for_cpu { - struct work_struct work; - long (*fn)(void *); - void *arg; - long ret; +static struct bus_type wq_subsys = { + .name = "workqueue", + .dev_groups = wq_sysfs_groups, }; -static void work_for_cpu_fn(struct work_struct *work) +static int __init wq_sysfs_init(void) { - struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work); - - wfc->ret = wfc->fn(wfc->arg); + return subsys_virtual_register(&wq_subsys, NULL); } +core_initcall(wq_sysfs_init); -/** - * work_on_cpu - run a function in user context on a particular cpu - * @cpu: the cpu to run on - * @fn: the function to run - * @arg: the function arg - * - * It is up to the caller to ensure that the cpu doesn't go offline. - * The caller must not hold any locks which would prevent @fn from completing. - * - * Return: The value @fn returns. - */ -long work_on_cpu(int cpu, long (*fn)(void *), void *arg) +static void wq_device_release(struct device *dev) { - struct work_for_cpu wfc = { .fn = fn, .arg = arg }; + struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); - INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); - schedule_work_on(cpu, &wfc.work); - flush_work(&wfc.work); - destroy_work_on_stack(&wfc.work); - return wfc.ret; + kfree(wq_dev); } -EXPORT_SYMBOL_GPL(work_on_cpu); -#endif /* CONFIG_SMP */ - -#ifdef CONFIG_FREEZER /** - * freeze_workqueues_begin - begin freezing workqueues + * workqueue_sysfs_register - make a workqueue visible in sysfs + * @wq: the workqueue to register * - * Start freezing workqueues. After this function returns, all freezable - * workqueues will queue new works to their delayed_works list instead of - * pool->worklist. + * Expose @wq in sysfs under /sys/bus/workqueue/devices. + * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set + * which is the preferred method. * - * CONTEXT: - * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. 
+ * Workqueue user should use this function directly iff it wants to apply + * workqueue_attrs before making the workqueue visible in sysfs; otherwise, + * apply_workqueue_attrs() may race against userland updating the + * attributes. + * + * Return: 0 on success, -errno on failure. */ -void freeze_workqueues_begin(void) +int workqueue_sysfs_register(struct workqueue_struct *wq) { - struct workqueue_struct *wq; - struct pool_workqueue *pwq; - - mutex_lock(&wq_pool_mutex); + struct wq_device *wq_dev; + int ret; - WARN_ON_ONCE(workqueue_freezing); - workqueue_freezing = true; + /* + * Adjusting max_active or creating new pwqs by applyting + * attributes breaks ordering guarantee. Disallow exposing ordered + * workqueues. + */ + if (WARN_ON(wq->flags & __WQ_ORDERED)) + return -EINVAL; - list_for_each_entry(wq, &workqueues, list) { - mutex_lock(&wq->mutex); - for_each_pwq(pwq, wq) - pwq_adjust_max_active(pwq); - mutex_unlock(&wq->mutex); - } + wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); + if (!wq_dev) + return -ENOMEM; - mutex_unlock(&wq_pool_mutex); -} + wq_dev->wq = wq; + wq_dev->dev.bus = &wq_subsys; + wq_dev->dev.init_name = wq->name; + wq_dev->dev.release = wq_device_release; -/** - * freeze_workqueues_busy - are freezable workqueues still busy? - * - * Check whether freezing is complete. This function must be called - * between freeze_workqueues_begin() and thaw_workqueues(). - * - * CONTEXT: - * Grabs and releases wq_pool_mutex. - * - * Return: - * %true if some freezable workqueues are still busy. %false if freezing - * is complete. - */ -bool freeze_workqueues_busy(void) -{ - bool busy = false; - struct workqueue_struct *wq; - struct pool_workqueue *pwq; + /* + * unbound_attrs are created separately. Suppress uevent until + * everything is ready. + */ + dev_set_uevent_suppress(&wq_dev->dev, true); - mutex_lock(&wq_pool_mutex); + ret = device_register(&wq_dev->dev); + if (ret) { + kfree(wq_dev); + wq->wq_dev = NULL; + return ret; + } - WARN_ON_ONCE(!workqueue_freezing); + if (wq->flags & WQ_UNBOUND) { + struct device_attribute *attr; - list_for_each_entry(wq, &workqueues, list) { - if (!(wq->flags & WQ_FREEZABLE)) - continue; - /* - * nr_active is monotonically decreasing. It's safe - * to peek without lock. - */ - rcu_read_lock_sched(); - for_each_pwq(pwq, wq) { - WARN_ON_ONCE(pwq->nr_active < 0); - if (pwq->nr_active) { - busy = true; - rcu_read_unlock_sched(); - goto out_unlock; + for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { + ret = device_create_file(&wq_dev->dev, attr); + if (ret) { + device_unregister(&wq_dev->dev); + wq->wq_dev = NULL; + return ret; } } - rcu_read_unlock_sched(); } -out_unlock: - mutex_unlock(&wq_pool_mutex); - return busy; + + dev_set_uevent_suppress(&wq_dev->dev, false); + kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); + return 0; } /** - * thaw_workqueues - thaw workqueues - * - * Thaw workqueues. Normal queueing is restored and all collected - * frozen works are transferred to their respective pool worklists. + * workqueue_sysfs_unregister - undo workqueue_sysfs_register() + * @wq: the workqueue to unregister * - * CONTEXT: - * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. + * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. 
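From userland the relocated attribute files behave as the block comment at the top of this #ifdef describes: reads return plain integers or hex bitmaps, and stores funnel through apply_workqueue_attrs(). Against a hypothetical WQ_SYSFS workqueue named foo (note that cpumask_parse() takes a hex mask, matching the %*pb output format):

	cat /sys/bus/workqueue/devices/foo/per_cpu	# 0 for unbound
	echo -5 > /sys/bus/workqueue/devices/foo/nice	# MIN_NICE..MAX_NICE
	echo f > /sys/bus/workqueue/devices/foo/cpumask	# CPUs 0-3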
*/ -void thaw_workqueues(void) +static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { - struct workqueue_struct *wq; - struct pool_workqueue *pwq; - - mutex_lock(&wq_pool_mutex); - - if (!workqueue_freezing) - goto out_unlock; - - workqueue_freezing = false; + struct wq_device *wq_dev = wq->wq_dev; - /* restore max_active and repopulate worklist */ - list_for_each_entry(wq, &workqueues, list) { - mutex_lock(&wq->mutex); - for_each_pwq(pwq, wq) - pwq_adjust_max_active(pwq); - mutex_unlock(&wq->mutex); - } + if (!wq->wq_dev) + return; -out_unlock: - mutex_unlock(&wq_pool_mutex); + wq->wq_dev = NULL; + device_unregister(&wq_dev->dev); } -#endif /* CONFIG_FREEZER */ +#else /* CONFIG_SYSFS */ +static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } +#endif /* CONFIG_SYSFS */ static void __init wq_numa_init(void) { diff --git a/mm/percpu.c b/mm/percpu.c index 73c97a5f4495c..dfd02484e8de1 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1310,7 +1310,7 @@ bool is_kernel_percpu_address(unsigned long addr) * and, from the second one, the backing allocator (currently either vm or * km) provides translation. * - * The addr can be tranlated simply without checking if it falls into the + * The addr can be translated simply without checking if it falls into the * first chunk. But the current code reflects better how percpu allocator * actually works, and the verification can discover both bugs in percpu * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current @@ -1762,7 +1762,7 @@ early_param("percpu_alloc", percpu_alloc_setup); * and other parameters considering needed percpu size, allocation * atom size and distances between CPUs. * - * Groups are always mutliples of atom size and CPUs which are of + * Groups are always multiples of atom size and CPUs which are of * LOCAL_DISTANCE both ways are grouped together and share space for * units in the same group. The returned configuration is guaranteed * to have CPUs on different nodes on different groups and >=75% usage