diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6cfa6e3996cf7..aa453f9202d89 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5113,6 +5113,11 @@ rcupdate.rcu_cpu_stall_timeout to be used (after conversion from seconds to milliseconds). + rcupdate.rcu_exp_stall_task_details= [KNL] + Print stack dumps of any tasks blocking the + current expedited RCU grace period during an + expedited RCU CPU stall warning. + rcupdate.rcu_expedited= [KNL] Use expedited grace-period primitives, for example, synchronize_rcu_expedited() instead diff --git a/drivers/base/core.c b/drivers/base/core.c index a3e14143ec0cf..bb36aca8d1b7a 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -181,7 +181,6 @@ void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode) } EXPORT_SYMBOL_GPL(fw_devlink_purge_absent_suppliers); -#ifdef CONFIG_SRCU static DEFINE_MUTEX(device_links_lock); DEFINE_STATIC_SRCU(device_links_srcu); @@ -220,47 +219,6 @@ static void device_link_remove_from_lists(struct device_link *link) list_del_rcu(&link->s_node); list_del_rcu(&link->c_node); } -#else /* !CONFIG_SRCU */ -static DECLARE_RWSEM(device_links_lock); - -static inline void device_links_write_lock(void) -{ - down_write(&device_links_lock); -} - -static inline void device_links_write_unlock(void) -{ - up_write(&device_links_lock); -} - -int device_links_read_lock(void) -{ - down_read(&device_links_lock); - return 0; -} - -void device_links_read_unlock(int not_used) -{ - up_read(&device_links_lock); -} - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -int device_links_read_lock_held(void) -{ - return lockdep_is_held(&device_links_lock); -} -#endif - -static inline void device_link_synchronize_removal(void) -{ -} - -static void device_link_remove_from_lists(struct device_link *link) -{ - list_del(&link->s_node); - list_del(&link->c_node); -} -#endif /* !CONFIG_SRCU */ static bool device_is_ancestor(struct device *dev, struct device *target) { diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index 5fdf269a822e5..2bf5123e48279 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only menuconfig DAX tristate "DAX: direct access to differentiated memory" - select SRCU default m if NVDIMM_DAX if DAX diff --git a/drivers/hwtracing/stm/Kconfig b/drivers/hwtracing/stm/Kconfig index aad594fe79cc5..eda6b11d40a1f 100644 --- a/drivers/hwtracing/stm/Kconfig +++ b/drivers/hwtracing/stm/Kconfig @@ -2,7 +2,6 @@ config STM tristate "System Trace Module devices" select CONFIGFS_FS - select SRCU help A System Trace Module (STM) is a device exporting data in System Trace Protocol (STP) format as defined by MIPI STP standards. diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 998a5cfdbc4e9..5f1e2593fad7e 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -6,7 +6,6 @@ menuconfig MD bool "Multiple devices driver support (RAID and LVM)" depends on BLOCK - select SRCU help Support multiple physical spindles through a single logical device. Required for RAID and logical volume management. 
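With SRCU now built into every kernel (hence the "select SRCU" removals above), the driver core's device-links read side always takes the SRCU path kept in drivers/base/core.c; the rwsem fallback is gone. As a reference only, here is a minimal sketch of that read-side bracket. The walk_suppliers() helper and the dev_dbg() message are illustrative and not part of this series; device_links_read_lock(), device_links_read_unlock() and device_links_read_lock_held() are internal to the driver core rather than a driver-facing API:

	#include <linux/device.h>
	#include <linux/rculist.h>

	/* Walk a device's suppliers under the device-links SRCU read lock. */
	static void walk_suppliers(struct device *dev)
	{
		struct device_link *link;
		int idx;

		idx = device_links_read_lock();	/* srcu_read_lock(&device_links_srcu) */
		list_for_each_entry_rcu(link, &dev->links.suppliers, c_node,
					device_links_read_lock_held())
			dev_dbg(dev, "supplier: %s\n", dev_name(link->supplier));
		device_links_read_unlock(idx);	/* srcu_read_unlock(..., idx) */
	}
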
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 9e63b8c43f3e2..12910338ea1a0 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -334,7 +334,6 @@ config NETCONSOLE_DYNAMIC config NETPOLL def_bool NETCONSOLE - select SRCU config NET_POLL_CONTROLLER def_bool NETPOLL diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig index 1569d9a3ada0b..b09cdc59bfd02 100644 --- a/drivers/pci/controller/Kconfig +++ b/drivers/pci/controller/Kconfig @@ -258,7 +258,7 @@ config PCIE_MEDIATEK_GEN3 MediaTek SoCs. config VMD - depends on PCI_MSI && X86_64 && SRCU && !UML + depends on PCI_MSI && X86_64 && !UML tristate "Intel Volume Management Device Driver" help Adds support for the Intel Volume Management Device (VMD). VMD is a diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 183e5c4aed348..37b6bab90c835 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -17,7 +17,6 @@ config BTRFS_FS select FS_IOMAP select RAID6_PQ select XOR_BLOCKS - select SRCU depends on PAGE_SIZE_LESS_THAN_256KB help diff --git a/fs/locks.c b/fs/locks.c index 8f01bee177159..1909a9de242c8 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1889,7 +1889,6 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp, } EXPORT_SYMBOL(generic_setlease); -#if IS_ENABLED(CONFIG_SRCU) /* * Kernel subsystems can register to be notified on any attempt to set * a new lease with the lease_notifier_chain. This is used by (e.g.) nfsd @@ -1923,30 +1922,6 @@ void lease_unregister_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(lease_unregister_notifier); -#else /* !IS_ENABLED(CONFIG_SRCU) */ -static inline void -lease_notifier_chain_init(void) -{ -} - -static inline void -setlease_notifier(long arg, struct file_lock *lease) -{ -} - -int lease_register_notifier(struct notifier_block *nb) -{ - return 0; -} -EXPORT_SYMBOL_GPL(lease_register_notifier); - -void lease_unregister_notifier(struct notifier_block *nb) -{ -} -EXPORT_SYMBOL_GPL(lease_unregister_notifier); - -#endif /* IS_ENABLED(CONFIG_SRCU) */ - /** * vfs_setlease - sets a lease on an open file * @filp: file pointer diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig index c020d26ba223e..c6c72c90fd253 100644 --- a/fs/notify/Kconfig +++ b/fs/notify/Kconfig @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only config FSNOTIFY def_bool n - select SRCU source "fs/notify/dnotify/Kconfig" source "fs/notify/inotify/Kconfig" diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig index b59cd172b5f97..d5a85a8062d05 100644 --- a/fs/quota/Kconfig +++ b/fs/quota/Kconfig @@ -6,7 +6,6 @@ config QUOTA bool "Quota support" select QUOTACTL - select SRCU help If you say Y here, you will be able to set per user limits for disk usage (also called disk quotas). 
Currently, it works for the diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index d8afdb8784c1c..ba4c00dd8005a 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -139,7 +139,7 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, if (last) { n->next = last->next; n->pprev = &last->next; - rcu_assign_pointer(hlist_next_rcu(last), n); + rcu_assign_pointer(hlist_nulls_next_rcu(last), n); } else { hlist_nulls_add_head_rcu(n, h); } diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 03abf883a281b..094321c17e48a 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -238,6 +238,7 @@ void synchronize_rcu_tasks_rude(void); #define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false) void exit_tasks_rcu_start(void); +void exit_tasks_rcu_stop(void); void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ #define rcu_tasks_classic_qs(t, preempt) do { } while (0) @@ -246,6 +247,7 @@ void exit_tasks_rcu_finish(void); #define call_rcu_tasks call_rcu #define synchronize_rcu_tasks synchronize_rcu static inline void exit_tasks_rcu_start(void) { } +static inline void exit_tasks_rcu_stop(void) { } static inline void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ @@ -374,11 +376,18 @@ static inline int debug_lockdep_rcu_enabled(void) * RCU_LOCKDEP_WARN - emit lockdep splat if specified condition is met * @c: condition to check * @s: informative message + * + * This checks debug_lockdep_rcu_enabled() before checking (c) to + * prevent early boot splats due to lockdep not yet being initialized, + * and rechecks it after checking (c) to prevent false-positive splats + * due to races with lockdep being disabled. See commit 3066820034b5dd + * ("rcu: Reject RCU_LOCKDEP_WARN() false positives") for more detail. */ #define RCU_LOCKDEP_WARN(c, s) \ do { \ static bool __section(".data.unlikely") __warned; \ - if ((c) && debug_lockdep_rcu_enabled() && !__warned) { \ + if (debug_lockdep_rcu_enabled() && (c) && \ + debug_lockdep_rcu_enabled() && !__warned) { \ __warned = true; \ lockdep_rcu_suspicious(__FILE__, __LINE__, s); \ } \ @@ -1004,6 +1013,9 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define kvfree_rcu(...) KVFREE_GET_MACRO(__VA_ARGS__, \ kvfree_rcu_arg_2, kvfree_rcu_arg_1)(__VA_ARGS__) +#define kvfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) +#define kfree_rcu_mightsleep(ptr) kvfree_rcu_mightsleep(ptr) + #define KVFREE_GET_MACRO(_1, _2, NAME, ...) 
NAME #define kvfree_rcu_arg_2(ptr, rhf) \ do { \ @@ -1011,8 +1023,7 @@ do { \ \ if (___p) { \ BUILD_BUG_ON(!__is_kvfree_rcu_offset(offsetof(typeof(*(ptr)), rhf))); \ - kvfree_call_rcu(&((___p)->rhf), (rcu_callback_t)(unsigned long) \ - (offsetof(typeof(*(ptr)), rhf))); \ + kvfree_call_rcu(&((___p)->rhf), (void *) (___p)); \ } \ } while (0) @@ -1021,7 +1032,7 @@ do { \ typeof(ptr) ___p = (ptr); \ \ if (___p) \ - kvfree_call_rcu(NULL, (rcu_callback_t) (___p)); \ + kvfree_call_rcu(NULL, (void *) (___p)); \ } while (0) /* diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 68f9070aa1110..7f17acf29dda7 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -98,25 +98,25 @@ static inline void synchronize_rcu_expedited(void) */ extern void kvfree(const void *addr); -static inline void __kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +static inline void __kvfree_call_rcu(struct rcu_head *head, void *ptr) { if (head) { - call_rcu(head, func); + call_rcu(head, (rcu_callback_t) ((void *) head - ptr)); return; } // kvfree_rcu(one_arg) call. might_sleep(); synchronize_rcu(); - kvfree((void *) func); + kvfree(ptr); } #ifdef CONFIG_KASAN_GENERIC -void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func); +void kvfree_call_rcu(struct rcu_head *head, void *ptr); #else -static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +static inline void kvfree_call_rcu(struct rcu_head *head, void *ptr) { - __kvfree_call_rcu(head, func); + __kvfree_call_rcu(head, ptr); } #endif diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 4003bf6cfa1c2..56bccb5a8fdea 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -33,7 +33,7 @@ static inline void rcu_virt_note_context_switch(void) } void synchronize_rcu_expedited(void); -void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func); +void kvfree_call_rcu(struct rcu_head *head, void *ptr); void rcu_barrier(void); bool rcu_eqs_special_set(int cpu); diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 9b9d0bbf1d3cf..74796cd7e7a9d 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -214,6 +214,34 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) return retval; } +/** + * srcu_down_read - register a new reader for an SRCU-protected structure. + * @ssp: srcu_struct in which to register the new reader. + * + * Enter a semaphore-like SRCU read-side critical section. Note that + * SRCU read-side critical sections may be nested. However, it is + * illegal to call anything that waits on an SRCU grace period for the + * same srcu_struct, whether directly or indirectly. Please note that + * one way to indirectly wait on an SRCU grace period is to acquire + * a mutex that is held elsewhere while calling synchronize_srcu() or + * synchronize_srcu_expedited(). But if you want lockdep to help you + * keep this stuff straight, you should instead use srcu_read_lock(). + * + * The semaphore-like nature of srcu_down_read() means that the matching + * srcu_up_read() can be invoked from some other context, for example, + * from some other task or from an irq handler. However, neither + * srcu_down_read() nor srcu_up_read() may be invoked from an NMI handler. + * + * Calls to srcu_down_read() may be nested, similar to the manner in + * which calls to down_read() may be nested. 
+ */ +static inline int srcu_down_read(struct srcu_struct *ssp) __acquires(ssp) +{ + WARN_ON_ONCE(in_nmi()); + srcu_check_nmi_safety(ssp, false); + return __srcu_read_lock(ssp); +} + /** * srcu_read_unlock - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. @@ -254,6 +282,23 @@ srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp) __srcu_read_unlock(ssp, idx); } +/** + * srcu_up_read - unregister a old reader from an SRCU-protected structure. + * @ssp: srcu_struct in which to unregister the old reader. + * @idx: return value from corresponding srcu_read_lock(). + * + * Exit an SRCU read-side critical section, but not necessarily from + * the same context as the maching srcu_down_read(). + */ +static inline void srcu_up_read(struct srcu_struct *ssp, int idx) + __releases(ssp) +{ + WARN_ON_ONCE(idx & ~0x1); + WARN_ON_ONCE(in_nmi()); + srcu_check_nmi_safety(ssp, false); + __srcu_read_unlock(ssp, idx); +} + /** * smp_mb__after_srcu_read_unlock - ensure full ordering after srcu_read_unlock * diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index c689a81752c9a..558057b517b74 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -49,7 +49,7 @@ struct srcu_data { struct srcu_node { spinlock_t __private lock; unsigned long srcu_have_cbs[4]; /* GP seq for children having CBs, but only */ - /* if greater than ->srcu_gq_seq. */ + /* if greater than ->srcu_gp_seq. */ unsigned long srcu_data_have_cbs[4]; /* Which srcu_data structs have CBs for given GP? */ unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ struct srcu_node *srcu_parent; /* Next up in tree. */ diff --git a/init/Kconfig b/init/Kconfig index 7e5c3ddc341de..af511c726d695 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1865,7 +1865,6 @@ config PERF_EVENTS default y if PROFILING depends on HAVE_PERF_EVENTS select IRQ_WORK - select SRCU help Enable kernel support for various performance events provided by software and hardware. diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 9c2fb613a55d2..f04b1978899dd 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -46,6 +46,9 @@ torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); +torture_param(int, rt_boost, 2, + "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types."); +torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens."); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); @@ -127,15 +130,50 @@ static void torture_lock_busted_write_unlock(int tid __maybe_unused) /* BUGGY, do not use in real life!!! */ } -static void torture_boost_dummy(struct torture_random_state *trsp) +static void __torture_rt_boost(struct torture_random_state *trsp) { - /* Only rtmutexes care about priority */ + const unsigned int factor = rt_boost_factor; + + if (!rt_task(current)) { + /* + * Boost priority once every rt_boost_factor operations. When + * the task tries to take the lock, the rtmutex it will account + * for the new priority, and do any corresponding pi-dance. 
+ */ + if (trsp && !(torture_random(trsp) % + (cxt.nrealwriters_stress * factor))) { + sched_set_fifo(current); + } else /* common case, do nothing */ + return; + } else { + /* + * The task will remain boosted for another 10 * rt_boost_factor + * operations, then restored back to its original prio, and so + * forth. + * + * When @trsp is nil, we want to force-reset the task for + * stopping the kthread. + */ + if (!trsp || !(torture_random(trsp) % + (cxt.nrealwriters_stress * factor * 2))) { + sched_set_normal(current, 0); + } else /* common case, do nothing */ + return; + } +} + +static void torture_rt_boost(struct torture_random_state *trsp) +{ + if (rt_boost != 2) + return; + + __torture_rt_boost(trsp); } static struct lock_torture_ops lock_busted_ops = { .writelock = torture_lock_busted_write_lock, .write_delay = torture_lock_busted_write_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_lock_busted_write_unlock, .readlock = NULL, .read_delay = NULL, @@ -179,7 +217,7 @@ __releases(torture_spinlock) static struct lock_torture_ops spin_lock_ops = { .writelock = torture_spin_lock_write_lock, .write_delay = torture_spin_lock_write_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_spin_lock_write_unlock, .readlock = NULL, .read_delay = NULL, @@ -206,7 +244,7 @@ __releases(torture_spinlock) static struct lock_torture_ops spin_lock_irq_ops = { .writelock = torture_spin_lock_write_lock_irq, .write_delay = torture_spin_lock_write_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_lock_spin_write_unlock_irq, .readlock = NULL, .read_delay = NULL, @@ -275,7 +313,7 @@ __releases(torture_rwlock) static struct lock_torture_ops rw_lock_ops = { .writelock = torture_rwlock_write_lock, .write_delay = torture_rwlock_write_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_rwlock_write_unlock, .readlock = torture_rwlock_read_lock, .read_delay = torture_rwlock_read_delay, @@ -318,7 +356,7 @@ __releases(torture_rwlock) static struct lock_torture_ops rw_lock_irq_ops = { .writelock = torture_rwlock_write_lock_irq, .write_delay = torture_rwlock_write_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_rwlock_write_unlock_irq, .readlock = torture_rwlock_read_lock_irq, .read_delay = torture_rwlock_read_delay, @@ -358,7 +396,7 @@ __releases(torture_mutex) static struct lock_torture_ops mutex_lock_ops = { .writelock = torture_mutex_lock, .write_delay = torture_mutex_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_mutex_unlock, .readlock = NULL, .read_delay = NULL, @@ -456,7 +494,7 @@ static struct lock_torture_ops ww_mutex_lock_ops = { .exit = torture_ww_mutex_exit, .writelock = torture_ww_mutex_lock, .write_delay = torture_mutex_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_ww_mutex_unlock, .readlock = NULL, .read_delay = NULL, @@ -474,37 +512,6 @@ __acquires(torture_rtmutex) return 0; } -static void torture_rtmutex_boost(struct torture_random_state *trsp) -{ - const unsigned int factor = 50000; /* yes, quite arbitrary */ - - if (!rt_task(current)) { - /* - * Boost priority once every ~50k operations. When the - * task tries to take the lock, the rtmutex it will account - * for the new priority, and do any corresponding pi-dance. 
- */ - if (trsp && !(torture_random(trsp) % - (cxt.nrealwriters_stress * factor))) { - sched_set_fifo(current); - } else /* common case, do nothing */ - return; - } else { - /* - * The task will remain boosted for another ~500k operations, - * then restored back to its original prio, and so forth. - * - * When @trsp is nil, we want to force-reset the task for - * stopping the kthread. - */ - if (!trsp || !(torture_random(trsp) % - (cxt.nrealwriters_stress * factor * 2))) { - sched_set_normal(current, 0); - } else /* common case, do nothing */ - return; - } -} - static void torture_rtmutex_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; @@ -530,10 +537,18 @@ __releases(torture_rtmutex) rt_mutex_unlock(&torture_rtmutex); } +static void torture_rt_boost_rtmutex(struct torture_random_state *trsp) +{ + if (!rt_boost) + return; + + __torture_rt_boost(trsp); +} + static struct lock_torture_ops rtmutex_lock_ops = { .writelock = torture_rtmutex_lock, .write_delay = torture_rtmutex_delay, - .task_boost = torture_rtmutex_boost, + .task_boost = torture_rt_boost_rtmutex, .writeunlock = torture_rtmutex_unlock, .readlock = NULL, .read_delay = NULL, @@ -600,7 +615,7 @@ __releases(torture_rwsem) static struct lock_torture_ops rwsem_lock_ops = { .writelock = torture_rwsem_down_write, .write_delay = torture_rwsem_write_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_rwsem_up_write, .readlock = torture_rwsem_down_read, .read_delay = torture_rwsem_read_delay, @@ -652,7 +667,7 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = { .exit = torture_percpu_rwsem_exit, .writelock = torture_percpu_rwsem_down_write, .write_delay = torture_rwsem_write_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_percpu_rwsem_up_write, .readlock = torture_percpu_rwsem_down_read, .read_delay = torture_rwsem_read_delay, diff --git a/kernel/notifier.c b/kernel/notifier.c index ab75637fd904f..d353e4b5402d7 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -456,7 +456,6 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh, } EXPORT_SYMBOL_GPL(raw_notifier_call_chain); -#ifdef CONFIG_SRCU /* * SRCU notifier chain routines. Registration and unregistration * use a mutex, and call_chain is synchronized by SRCU (no locks). @@ -573,8 +572,6 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh) } EXPORT_SYMBOL_GPL(srcu_init_notifier_head); -#endif /* CONFIG_SRCU */ - static ATOMIC_NOTIFIER_HEAD(die_chain); int notrace notify_die(enum die_val val, const char *str, diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index f4f8cb0435b45..fc21c5d5fd5de 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -244,7 +244,24 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) set_current_state(TASK_INTERRUPTIBLE); if (pid_ns->pid_allocated == init_pids) break; + /* + * Release tasks_rcu_exit_srcu to avoid following deadlock: + * + * 1) TASK A unshare(CLONE_NEWPID) + * 2) TASK A fork() twice -> TASK B (child reaper for new ns) + * and TASK C + * 3) TASK B exits, kills TASK C, waits for TASK A to reap it + * 4) TASK A calls synchronize_rcu_tasks() + * -> synchronize_srcu(tasks_rcu_exit_srcu) + * 5) *DEADLOCK* + * + * It is considered safe to release tasks_rcu_exit_srcu here + * because we assume the current task can not be concurrently + * reaped at this point. 
+ */ + exit_tasks_rcu_stop(); schedule(); + exit_tasks_rcu_start(); } __set_current_state(TASK_RUNNING); diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index c5aa934de59b0..95a0038c92187 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -224,6 +224,7 @@ extern int rcu_cpu_stall_ftrace_dump; extern int rcu_cpu_stall_suppress; extern int rcu_cpu_stall_timeout; extern int rcu_exp_cpu_stall_timeout; +extern bool rcu_exp_stall_task_details __read_mostly; int rcu_jiffies_till_stall_check(void); int rcu_exp_jiffies_till_stall_check(void); @@ -447,14 +448,20 @@ do { \ /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ static inline bool rcu_gp_is_normal(void) { return true; } static inline bool rcu_gp_is_expedited(void) { return false; } +static inline bool rcu_async_should_hurry(void) { return false; } static inline void rcu_expedite_gp(void) { } static inline void rcu_unexpedite_gp(void) { } +static inline void rcu_async_hurry(void) { } +static inline void rcu_async_relax(void) { } static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } #else /* #ifdef CONFIG_TINY_RCU */ bool rcu_gp_is_normal(void); /* Internal RCU use. */ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ +bool rcu_async_should_hurry(void); /* Internal RCU use. */ void rcu_expedite_gp(void); void rcu_unexpedite_gp(void); +void rcu_async_hurry(void); +void rcu_async_relax(void); void rcupdate_announce_bootup_oddness(void); #ifdef CONFIG_TASKS_RCU_GENERIC void show_rcu_tasks_gp_kthreads(void); diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index c54ea2b6a36bc..f71fac422c8f6 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -89,7 +89,7 @@ static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v) } /* Get the length of a segment of the rcu_segcblist structure. */ -static long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg) +long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg) { return READ_ONCE(rsclp->seglen[seg]); } diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 431cee212467d..4fe877f5f6540 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -15,6 +15,8 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) return READ_ONCE(rclp->len); } +long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg); + /* Return number of callbacks in segmented callback list by summing seglen. 
*/ long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 634df26a2c27c..8e6c023212cb3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -399,7 +399,7 @@ static int torture_readlock_not_held(void) return rcu_read_lock_bh_held() || rcu_read_lock_sched_held(); } -static int rcu_torture_read_lock(void) __acquires(RCU) +static int rcu_torture_read_lock(void) { rcu_read_lock(); return 0; @@ -441,7 +441,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) } } -static void rcu_torture_read_unlock(int idx) __releases(RCU) +static void rcu_torture_read_unlock(int idx) { rcu_read_unlock(); } @@ -625,7 +625,7 @@ static struct srcu_struct srcu_ctld; static struct srcu_struct *srcu_ctlp = &srcu_ctl; static struct rcu_torture_ops srcud_ops; -static int srcu_torture_read_lock(void) __acquires(srcu_ctlp) +static int srcu_torture_read_lock(void) { if (cur_ops == &srcud_ops) return srcu_read_lock_nmisafe(srcu_ctlp); @@ -652,7 +652,7 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) } } -static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp) +static void srcu_torture_read_unlock(int idx) { if (cur_ops == &srcud_ops) srcu_read_unlock_nmisafe(srcu_ctlp, idx); @@ -814,13 +814,13 @@ static void synchronize_rcu_trivial(void) } } -static int rcu_torture_read_lock_trivial(void) __acquires(RCU) +static int rcu_torture_read_lock_trivial(void) { preempt_disable(); return 0; } -static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU) +static void rcu_torture_read_unlock_trivial(int idx) { preempt_enable(); } diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 435c884c02b5c..afa3e1a2f6902 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -76,6 +76,8 @@ torture_param(int, verbose_batched, 0, "Batch verbose debugging printk()s"); // Wait until there are multiple CPUs before starting test. torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0, "Holdoff time before test start (s)"); +// Number of typesafe_lookup structures, that is, the degree of concurrency. +torture_param(long, lookup_instances, 0, "Number of typesafe_lookup structures."); // Number of loops per experiment, all readers execute operations concurrently. torture_param(long, loops, 10000, "Number of loops per experiment."); // Number of readers, with -1 defaulting to about 75% of the CPUs. @@ -124,7 +126,7 @@ static int exp_idx; // Operations vector for selecting different types of tests. 
struct ref_scale_ops { - void (*init)(void); + bool (*init)(void); void (*cleanup)(void); void (*readsection)(const int nloops); void (*delaysection)(const int nloops, const int udl, const int ndl); @@ -162,8 +164,9 @@ static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl } } -static void rcu_sync_scale_init(void) +static bool rcu_sync_scale_init(void) { + return true; } static struct ref_scale_ops rcu_ops = { @@ -315,9 +318,10 @@ static struct ref_scale_ops refcnt_ops = { // Definitions for rwlock static rwlock_t test_rwlock; -static void ref_rwlock_init(void) +static bool ref_rwlock_init(void) { rwlock_init(&test_rwlock); + return true; } static void ref_rwlock_section(const int nloops) @@ -351,9 +355,10 @@ static struct ref_scale_ops rwlock_ops = { // Definitions for rwsem static struct rw_semaphore test_rwsem; -static void ref_rwsem_init(void) +static bool ref_rwsem_init(void) { init_rwsem(&test_rwsem); + return true; } static void ref_rwsem_section(const int nloops) @@ -523,6 +528,237 @@ static struct ref_scale_ops clock_ops = { .name = "clock" }; +//////////////////////////////////////////////////////////////////////// +// +// Methods leveraging SLAB_TYPESAFE_BY_RCU. +// + +// Item to look up in a typesafe manner. Array of pointers to these. +struct refscale_typesafe { + atomic_t rts_refctr; // Used by all flavors + spinlock_t rts_lock; + seqlock_t rts_seqlock; + unsigned int a; + unsigned int b; +}; + +static struct kmem_cache *typesafe_kmem_cachep; +static struct refscale_typesafe **rtsarray; +static long rtsarray_size; +static DEFINE_TORTURE_RANDOM_PERCPU(refscale_rand); +static bool (*rts_acquire)(struct refscale_typesafe *rtsp, unsigned int *start); +static bool (*rts_release)(struct refscale_typesafe *rtsp, unsigned int start); + +// Conditionally acquire an explicit in-structure reference count. +static bool typesafe_ref_acquire(struct refscale_typesafe *rtsp, unsigned int *start) +{ + return atomic_inc_not_zero(&rtsp->rts_refctr); +} + +// Unconditionally release an explicit in-structure reference count. +static bool typesafe_ref_release(struct refscale_typesafe *rtsp, unsigned int start) +{ + if (!atomic_dec_return(&rtsp->rts_refctr)) { + WRITE_ONCE(rtsp->a, rtsp->a + 1); + kmem_cache_free(typesafe_kmem_cachep, rtsp); + } + return true; +} + +// Unconditionally acquire an explicit in-structure spinlock. +static bool typesafe_lock_acquire(struct refscale_typesafe *rtsp, unsigned int *start) +{ + spin_lock(&rtsp->rts_lock); + return true; +} + +// Unconditionally release an explicit in-structure spinlock. +static bool typesafe_lock_release(struct refscale_typesafe *rtsp, unsigned int start) +{ + spin_unlock(&rtsp->rts_lock); + return true; +} + +// Unconditionally acquire an explicit in-structure sequence lock. +static bool typesafe_seqlock_acquire(struct refscale_typesafe *rtsp, unsigned int *start) +{ + *start = read_seqbegin(&rtsp->rts_seqlock); + return true; +} + +// Conditionally release an explicit in-structure sequence lock. Return +// true if this release was successful, that is, if no retry is required. +static bool typesafe_seqlock_release(struct refscale_typesafe *rtsp, unsigned int start) +{ + return !read_seqretry(&rtsp->rts_seqlock, start); +} + +// Do a read-side critical section with the specified delay in +// microseconds and nanoseconds inserted so as to increase probability +// of failure. 
+static void typesafe_delay_section(const int nloops, const int udl, const int ndl) +{ + unsigned int a; + unsigned int b; + int i; + long idx; + struct refscale_typesafe *rtsp; + unsigned int start; + + for (i = nloops; i >= 0; i--) { + preempt_disable(); + idx = torture_random(this_cpu_ptr(&refscale_rand)) % rtsarray_size; + preempt_enable(); +retry: + rcu_read_lock(); + rtsp = rcu_dereference(rtsarray[idx]); + a = READ_ONCE(rtsp->a); + if (!rts_acquire(rtsp, &start)) { + rcu_read_unlock(); + goto retry; + } + if (a != READ_ONCE(rtsp->a)) { + (void)rts_release(rtsp, start); + rcu_read_unlock(); + goto retry; + } + un_delay(udl, ndl); + // Remember, seqlock read-side release can fail. + if (!rts_release(rtsp, start)) { + rcu_read_unlock(); + goto retry; + } + b = READ_ONCE(rtsp->a); + WARN_ONCE(a != b, "Re-read of ->a changed from %u to %u.\n", a, b); + b = rtsp->b; + rcu_read_unlock(); + WARN_ON_ONCE(a * a != b); + } +} + +// Because the acquisition and release methods are expensive, there +// is no point in optimizing away the un_delay() function's two checks. +// Thus simply define typesafe_read_section() as a simple wrapper around +// typesafe_delay_section(). +static void typesafe_read_section(const int nloops) +{ + typesafe_delay_section(nloops, 0, 0); +} + +// Allocate and initialize one refscale_typesafe structure. +static struct refscale_typesafe *typesafe_alloc_one(void) +{ + struct refscale_typesafe *rtsp; + + rtsp = kmem_cache_alloc(typesafe_kmem_cachep, GFP_KERNEL); + if (!rtsp) + return NULL; + atomic_set(&rtsp->rts_refctr, 1); + WRITE_ONCE(rtsp->a, rtsp->a + 1); + WRITE_ONCE(rtsp->b, rtsp->a * rtsp->a); + return rtsp; +} + +// Slab-allocator constructor for refscale_typesafe structures created +// out of a new slab of system memory. +static void refscale_typesafe_ctor(void *rtsp_in) +{ + struct refscale_typesafe *rtsp = rtsp_in; + + spin_lock_init(&rtsp->rts_lock); + seqlock_init(&rtsp->rts_seqlock); + preempt_disable(); + rtsp->a = torture_random(this_cpu_ptr(&refscale_rand)); + preempt_enable(); +} + +static struct ref_scale_ops typesafe_ref_ops; +static struct ref_scale_ops typesafe_lock_ops; +static struct ref_scale_ops typesafe_seqlock_ops; + +// Initialize for a typesafe test. +static bool typesafe_init(void) +{ + long idx; + long si = lookup_instances; + + typesafe_kmem_cachep = kmem_cache_create("refscale_typesafe", + sizeof(struct refscale_typesafe), sizeof(void *), + SLAB_TYPESAFE_BY_RCU, refscale_typesafe_ctor); + if (!typesafe_kmem_cachep) + return false; + if (si < 0) + si = -si * nr_cpu_ids; + else if (si == 0) + si = nr_cpu_ids; + rtsarray_size = si; + rtsarray = kcalloc(si, sizeof(*rtsarray), GFP_KERNEL); + if (!rtsarray) + return false; + for (idx = 0; idx < rtsarray_size; idx++) { + rtsarray[idx] = typesafe_alloc_one(); + if (!rtsarray[idx]) + return false; + } + if (cur_ops == &typesafe_ref_ops) { + rts_acquire = typesafe_ref_acquire; + rts_release = typesafe_ref_release; + } else if (cur_ops == &typesafe_lock_ops) { + rts_acquire = typesafe_lock_acquire; + rts_release = typesafe_lock_release; + } else if (cur_ops == &typesafe_seqlock_ops) { + rts_acquire = typesafe_seqlock_acquire; + rts_release = typesafe_seqlock_release; + } else { + WARN_ON_ONCE(1); + return false; + } + return true; +} + +// Clean up after a typesafe test. 
+static void typesafe_cleanup(void) +{ + long idx; + + if (rtsarray) { + for (idx = 0; idx < rtsarray_size; idx++) + kmem_cache_free(typesafe_kmem_cachep, rtsarray[idx]); + kfree(rtsarray); + rtsarray = NULL; + rtsarray_size = 0; + } + kmem_cache_destroy(typesafe_kmem_cachep); + typesafe_kmem_cachep = NULL; + rts_acquire = NULL; + rts_release = NULL; +} + +// The typesafe_init() function distinguishes these structures by address. +static struct ref_scale_ops typesafe_ref_ops = { + .init = typesafe_init, + .cleanup = typesafe_cleanup, + .readsection = typesafe_read_section, + .delaysection = typesafe_delay_section, + .name = "typesafe_ref" +}; + +static struct ref_scale_ops typesafe_lock_ops = { + .init = typesafe_init, + .cleanup = typesafe_cleanup, + .readsection = typesafe_read_section, + .delaysection = typesafe_delay_section, + .name = "typesafe_lock" +}; + +static struct ref_scale_ops typesafe_seqlock_ops = { + .init = typesafe_init, + .cleanup = typesafe_cleanup, + .readsection = typesafe_read_section, + .delaysection = typesafe_delay_section, + .name = "typesafe_seqlock" +}; + static void rcu_scale_one_reader(void) { if (readdelay <= 0) @@ -812,6 +1048,7 @@ ref_scale_init(void) static struct ref_scale_ops *scale_ops[] = { &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, + &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, }; if (!torture_init_begin(scale_type, verbose)) @@ -833,7 +1070,10 @@ ref_scale_init(void) goto unwind; } if (cur_ops->init) - cur_ops->init(); + if (!cur_ops->init()) { + firsterr = -EUCLEAN; + goto unwind; + } ref_scale_print_module_parms(cur_ops, "Start of test"); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index ca4b5dcec675b..ab4ee58af84bf 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -154,7 +154,7 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) */ static inline bool srcu_invl_snp_seq(unsigned long s) { - return rcu_seq_state(s) == SRCU_SNP_INIT_SEQ; + return s == SRCU_SNP_INIT_SEQ; } /* @@ -469,24 +469,59 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) /* * If the locks are the same as the unlocks, then there must have - * been no readers on this index at some time in between. This does - * not mean that there are no more readers, as one could have read - * the current index but not have incremented the lock counter yet. + * been no readers on this index at some point in this function. + * But there might be more readers, as a task might have read + * the current ->srcu_idx but not yet have incremented its CPU's + * ->srcu_lock_count[idx] counter. In fact, it is possible + * that most of the tasks have been preempted between fetching + * ->srcu_idx and incrementing ->srcu_lock_count[idx]. And there + * could be almost (ULONG_MAX / sizeof(struct task_struct)) tasks + * in a system whose address space was fully populated with memory. + * Call this quantity Nt. * - * So suppose that the updater is preempted here for so long - * that more than ULONG_MAX non-nested readers come and go in - * the meantime. It turns out that this cannot result in overflow - * because if a reader modifies its unlock count after we read it - * above, then that reader's next load of ->srcu_idx is guaranteed - * to get the new value, which will cause it to operate on the - * other bank of counters, where it cannot contribute to the - * overflow of these counters. 
This means that there is a maximum - * of 2*NR_CPUS increments, which cannot overflow given current - * systems, especially not on 64-bit systems. + * So suppose that the updater is preempted at this point in the + * code for a long time. That now-preempted updater has already + * flipped ->srcu_idx (possibly during the preceding grace period), + * done an smp_mb() (again, possibly during the preceding grace + * period), and summed up the ->srcu_unlock_count[idx] counters. + * How many times can a given one of the aforementioned Nt tasks + * increment the old ->srcu_idx value's ->srcu_lock_count[idx] + * counter, in the absence of nesting? * - * OK, how about nesting? This does impose a limit on nesting - * of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient, - * especially on 64-bit systems. + * It can clearly do so once, given that it has already fetched + * the old value of ->srcu_idx and is just about to use that value + * to index its increment of ->srcu_lock_count[idx]. But as soon as + * it leaves that SRCU read-side critical section, it will increment + * ->srcu_unlock_count[idx], which must follow the updater's above + * read from that same value. Thus, as soon the reading task does + * an smp_mb() and a later fetch from ->srcu_idx, that task will be + * guaranteed to get the new index. Except that the increment of + * ->srcu_unlock_count[idx] in __srcu_read_unlock() is after the + * smp_mb(), and the fetch from ->srcu_idx in __srcu_read_lock() + * is before the smp_mb(). Thus, that task might not see the new + * value of ->srcu_idx until the -second- __srcu_read_lock(), + * which in turn means that this task might well increment + * ->srcu_lock_count[idx] for the old value of ->srcu_idx twice, + * not just once. + * + * However, it is important to note that a given smp_mb() takes + * effect not just for the task executing it, but also for any + * later task running on that same CPU. + * + * That is, there can be almost Nt + Nc further increments of + * ->srcu_lock_count[idx] for the old index, where Nc is the number + * of CPUs. But this is OK because the size of the task_struct + * structure limits the value of Nt and current systems limit Nc + * to a few thousand. + * + * OK, but what about nesting? This does impose a limit on + * nesting of half of the size of the task_struct structure + * (measured in bytes), which should be sufficient. A late 2022 + * TREE01 rcutorture run reported this size to be no less than + * 9408 bytes, allowing up to 4704 levels of nesting, which is + * comfortably beyond excessive. Especially on 64-bit systems, + * which are unlikely to be configured with an address space fully + * populated with memory, at least not anytime soon. */ return srcu_readers_lock_idx(ssp, idx) == unlocks; } @@ -726,7 +761,7 @@ static void srcu_gp_start(struct srcu_struct *ssp) int state; if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) - sdp = per_cpu_ptr(ssp->sda, 0); + sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id()); else sdp = this_cpu_ptr(ssp->sda); lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); @@ -837,7 +872,8 @@ static void srcu_gp_end(struct srcu_struct *ssp) /* Initiate callback invocation as needed. 
*/ ss_state = smp_load_acquire(&ssp->srcu_size_state); if (ss_state < SRCU_SIZE_WAIT_BARRIER) { - srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay); + srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, get_boot_cpu_id()), + cbdelay); } else { idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); srcu_for_each_node_breadth_first(ssp, snp) { @@ -914,7 +950,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp if (snp) for (; snp != NULL; snp = snp->srcu_parent) { sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp); - if (rcu_seq_done(&ssp->srcu_gp_seq, s) || + if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) || (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s))) return; spin_lock_irqsave_rcu_node(snp, flags); @@ -941,6 +977,9 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp * * Note that this function also does the work of srcu_funnel_exp_start(), * in some cases by directly invoking it. + * + * The srcu read lock should be hold around this function. And s is a seq snap + * after holding that lock. */ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, unsigned long s, bool do_norm) @@ -961,7 +1000,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, if (snp_leaf) /* Each pass through the loop does one level of the srcu_node tree. */ for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) { - if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != snp_leaf) + if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) && snp != snp_leaf) return; /* GP already done and CBs recorded. */ spin_lock_irqsave_rcu_node(snp, flags); snp_seq = snp->srcu_have_cbs[idx]; @@ -998,8 +1037,8 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); - /* If grace period not already done and none in progress, start it. */ - if (!rcu_seq_done(&ssp->srcu_gp_seq, s) && + /* If grace period not already in progress, start it. */ + if (!WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) && rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) { WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); srcu_gp_start(ssp); @@ -1059,10 +1098,11 @@ static void srcu_flip(struct srcu_struct *ssp) /* * Ensure that if the updater misses an __srcu_read_unlock() - * increment, that task's next __srcu_read_lock() will see the - * above counter update. Note that both this memory barrier - * and the one in srcu_readers_active_idx_check() provide the - * guarantee for __srcu_read_lock(). + * increment, that task's __srcu_read_lock() following its next + * __srcu_read_lock() or __srcu_read_unlock() will see the above + * counter update. Note that both this memory barrier and the + * one in srcu_readers_active_idx_check() provide the guarantee + * for __srcu_read_lock(). */ smp_mb(); /* D */ /* Pairs with C. 
*/ } @@ -1161,7 +1201,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, idx = __srcu_read_lock_nmisafe(ssp); ss_state = smp_load_acquire(&ssp->srcu_size_state); if (ss_state < SRCU_SIZE_WAIT_CALL) - sdp = per_cpu_ptr(ssp->sda, 0); + sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id()); else sdp = raw_cpu_ptr(ssp->sda); spin_lock_irqsave_sdp_contention(sdp, &flags); @@ -1497,7 +1537,7 @@ void srcu_barrier(struct srcu_struct *ssp) idx = __srcu_read_lock_nmisafe(ssp); if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) - srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0)); + srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, get_boot_cpu_id())); else for_each_possible_cpu(cpu) srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu)); diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index fe9840d90e960..bfb5e1549f2b2 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -384,6 +384,7 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) { int cpu; unsigned long flags; + bool gpdone = poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq); long n; long ncbs = 0; long ncbsnz = 0; @@ -425,21 +426,23 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(nr_cpu_ids)); smp_store_release(&rtp->percpu_enqueue_lim, 1); rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu(); + gpdone = false; pr_info("Starting switch %s to CPU-0 callback queuing.\n", rtp->name); } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); } - if (rcu_task_cb_adjust && !ncbsnz && - poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq)) { + if (rcu_task_cb_adjust && !ncbsnz && gpdone) { raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rtp->percpu_enqueue_lim < rtp->percpu_dequeue_lim) { WRITE_ONCE(rtp->percpu_dequeue_lim, 1); pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name); } - for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) { - struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + if (rtp->percpu_dequeue_lim == 1) { + for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); - WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist)); + WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist)); + } } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); } @@ -560,8 +563,9 @@ static int __noreturn rcu_tasks_kthread(void *arg) static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) { /* Complain if the scheduler has not started. */ - WARN_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, - "synchronize_rcu_tasks called too soon"); + if (WARN_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, + "synchronize_%s() called too soon", rtp->name)) + return; // If the grace-period kthread is running, use it. if (READ_ONCE(rtp->kthread_ptr)) { @@ -827,11 +831,21 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop) static void rcu_tasks_postscan(struct list_head *hop) { /* - * Wait for tasks that are in the process of exiting. This - * does only part of the job, ensuring that all tasks that were - * previously exiting reach the point where they have disabled - * preemption, allowing the later synchronize_rcu() to finish - * the job. + * Exiting tasks may escape the tasklist scan. Those are vulnerable + * until their final schedule() with TASK_DEAD state. 
To cope with + * this, divide the fragile exit path part in two intersecting + * read side critical sections: + * + * 1) An _SRCU_ read side starting before calling exit_notify(), + * which may remove the task from the tasklist, and ending after + * the final preempt_disable() call in do_exit(). + * + * 2) An _RCU_ read side starting with the final preempt_disable() + * call in do_exit() and ending with the final call to schedule() + * with TASK_DEAD state. + * + * This handles the part 1). And postgp will handle part 2) with a + * call to synchronize_rcu(). */ synchronize_srcu(&tasks_rcu_exit_srcu); } @@ -898,7 +912,10 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp) * * In addition, this synchronize_rcu() waits for exiting tasks * to complete their final preempt_disable() region of execution, - * cleaning up after the synchronize_srcu() above. + * cleaning up after synchronize_srcu(&tasks_rcu_exit_srcu), + * enforcing the whole region before tasklist removal until + * the final schedule() with TASK_DEAD state to be an RCU TASKS + * read side critical section. */ synchronize_rcu(); } @@ -988,27 +1005,42 @@ void show_rcu_tasks_classic_gp_kthread(void) EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread); #endif // !defined(CONFIG_TINY_RCU) -/* Do the srcu_read_lock() for the above synchronize_srcu(). */ +/* + * Contribute to protect against tasklist scan blind spot while the + * task is exiting and may be removed from the tasklist. See + * corresponding synchronize_srcu() for further details. + */ void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) { - preempt_disable(); current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); - preempt_enable(); } -/* Do the srcu_read_unlock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) +/* + * Contribute to protect against tasklist scan blind spot while the + * task is exiting and may be removed from the tasklist. See + * corresponding synchronize_srcu() for further details. + */ +void exit_tasks_rcu_stop(void) __releases(&tasks_rcu_exit_srcu) { struct task_struct *t = current; - preempt_disable(); __srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx); - preempt_enable(); - exit_tasks_rcu_finish_trace(t); +} + +/* + * Contribute to protect against tasklist scan blind spot while the + * task is exiting and may be removed from the tasklist. See + * corresponding synchronize_srcu() for further details. + */ +void exit_tasks_rcu_finish(void) +{ + exit_tasks_rcu_stop(); + exit_tasks_rcu_finish_trace(current); } #else /* #ifdef CONFIG_TASKS_RCU */ void exit_tasks_rcu_start(void) { } +void exit_tasks_rcu_stop(void) { } void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } #endif /* #else #ifdef CONFIG_TASKS_RCU */ @@ -1036,9 +1068,6 @@ static void rcu_tasks_be_rude(struct work_struct *work) // Wait for one rude RCU-tasks grace period. static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp) { - if (num_online_cpus() <= 1) - return; // Fastpath for only one CPU. 
- rtp->n_ipis += cpumask_weight(cpu_online_mask); schedule_on_each_cpu(rcu_tasks_be_rude); } @@ -1815,23 +1844,21 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp) static void rcu_tasks_initiate_self_tests(void) { - unsigned long j = jiffies; - pr_info("Running RCU-tasks wait API self tests\n"); #ifdef CONFIG_TASKS_RCU - tests[0].runstart = j; + tests[0].runstart = jiffies; synchronize_rcu_tasks(); call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback); #endif #ifdef CONFIG_TASKS_RUDE_RCU - tests[1].runstart = j; + tests[1].runstart = jiffies; synchronize_rcu_tasks_rude(); call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback); #endif #ifdef CONFIG_TASKS_TRACE_RCU - tests[2].runstart = j; + tests[2].runstart = jiffies; synchronize_rcu_tasks_trace(); call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback); #endif diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 72913ce21258b..42f7589e51e09 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -246,15 +246,12 @@ bool poll_state_synchronize_rcu(unsigned long oldstate) EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); #ifdef CONFIG_KASAN_GENERIC -void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +void kvfree_call_rcu(struct rcu_head *head, void *ptr) { - if (head) { - void *ptr = (void *) head - (unsigned long) func; - + if (head) kasan_record_aux_stack_noalloc(ptr); - } - __kvfree_call_rcu(head, func); + __kvfree_call_rcu(head, ptr); } EXPORT_SYMBOL_GPL(kvfree_call_rcu); #endif diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cf34a961821ad..739219a78b847 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -144,14 +144,16 @@ static int rcu_scheduler_fully_active __read_mostly; static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, unsigned long gps, unsigned long flags); -static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); -static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); static void rcu_report_exp_rdp(struct rcu_data *rdp); static void sync_sched_exp_online_cleanup(int cpu); static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp); static bool rcu_rdp_is_offloaded(struct rcu_data *rdp); +static bool rcu_rdp_cpu_online(struct rcu_data *rdp); +static bool rcu_init_invoked(void); +static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); +static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); /* * rcuc/rcub/rcuop kthread realtime priority. The "rcuop" @@ -214,27 +216,6 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio); */ #define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays for debugging. */ -/* - * Compute the mask of online CPUs for the specified rcu_node structure. - * This will not be stable unless the rcu_node structure's ->lock is - * held, but the bit corresponding to the current CPU will be stable - * in most contexts. - */ -static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) -{ - return READ_ONCE(rnp->qsmaskinitnext); -} - -/* - * Is the CPU corresponding to the specified rcu_data structure online - * from RCU's perspective? This perspective is given by that structure's - * ->qsmaskinitnext field rather than by the global cpu_online_mask. - */ -static bool rcu_rdp_cpu_online(struct rcu_data *rdp) -{ - return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode)); -} - /* * Return true if an RCU grace period is in progress. 
The READ_ONCE()s * permit this function to be invoked without holding the root rcu_node @@ -734,46 +715,6 @@ void rcu_request_urgent_qs_task(struct task_struct *t) smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true); } -#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) - -/* - * Is the current CPU online as far as RCU is concerned? - * - * Disable preemption to avoid false positives that could otherwise - * happen due to the current CPU number being sampled, this task being - * preempted, its old CPU being taken offline, resuming on some other CPU, - * then determining that its old CPU is now offline. - * - * Disable checking if in an NMI handler because we cannot safely - * report errors from NMI handlers anyway. In addition, it is OK to use - * RCU on an offline processor during initial boot, hence the check for - * rcu_scheduler_fully_active. - */ -bool rcu_lockdep_current_cpu_online(void) -{ - struct rcu_data *rdp; - bool ret = false; - - if (in_nmi() || !rcu_scheduler_fully_active) - return true; - preempt_disable_notrace(); - rdp = this_cpu_ptr(&rcu_data); - /* - * Strictly, we care here about the case where the current CPU is - * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask - * not being up to date. So arch_spin_is_locked() might have a - * false positive if it's held by some *other* CPU, but that's - * OK because that just means a false *negative* on the warning. - */ - if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock)) - ret = true; - preempt_enable_notrace(); - return ret; -} -EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); - -#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ - /* * When trying to report a quiescent state on behalf of some other CPU, * it is our responsibility to check for and handle potential overflow @@ -1350,13 +1291,6 @@ static void rcu_strict_gp_boundary(void *unused) invoke_rcu_core(); } -// Has rcu_init() been invoked? This is used (for example) to determine -// whether spinlocks may be acquired safely. -static bool rcu_init_invoked(void) -{ - return !!rcu_state.n_online_cpus; -} - // Make the polled API aware of the beginning of a grace period. static void rcu_poll_gp_seq_start(unsigned long *snap) { @@ -2091,92 +2025,6 @@ rcu_check_quiescent_state(struct rcu_data *rdp) rcu_report_qs_rdp(rdp); } -/* - * Near the end of the offline process. Trace the fact that this CPU - * is going offline. - */ -int rcutree_dying_cpu(unsigned int cpu) -{ - bool blkd; - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - struct rcu_node *rnp = rdp->mynode; - - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return 0; - - blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask); - trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), - blkd ? TPS("cpuofl-bgp") : TPS("cpuofl")); - return 0; -} - -/* - * All CPUs for the specified rcu_node structure have gone offline, - * and all tasks that were preempted within an RCU read-side critical - * section while running on one of those CPUs have since exited their RCU - * read-side critical section. Some other CPU is reporting this fact with - * the specified rcu_node structure's ->lock held and interrupts disabled. - * This function therefore goes up the tree of rcu_node structures, - * clearing the corresponding bits in the ->qsmaskinit fields. Note that - * the leaf rcu_node structure's ->qsmaskinit field has already been - * updated. 
- * - * This function does check that the specified rcu_node structure has - * all CPUs offline and no blocked tasks, so it is OK to invoke it - * prematurely. That said, invoking it after the fact will cost you - * a needless lock acquisition. So once it has done its work, don't - * invoke it again. - */ -static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) -{ - long mask; - struct rcu_node *rnp = rnp_leaf; - - raw_lockdep_assert_held_rcu_node(rnp_leaf); - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || - WARN_ON_ONCE(rnp_leaf->qsmaskinit) || - WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf))) - return; - for (;;) { - mask = rnp->grpmask; - rnp = rnp->parent; - if (!rnp) - break; - raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - rnp->qsmaskinit &= ~mask; - /* Between grace periods, so better already be zero! */ - WARN_ON_ONCE(rnp->qsmask); - if (rnp->qsmaskinit) { - raw_spin_unlock_rcu_node(rnp); - /* irqs remain disabled. */ - return; - } - raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ - } -} - -/* - * The CPU has been completely removed, and some other CPU is reporting - * this fact from process context. Do the remainder of the cleanup. - * There can only be one CPU hotplug operation at a time, so no need for - * explicit locking. - */ -int rcutree_dead_cpu(unsigned int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ - - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return 0; - - WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); - /* Adjust any no-longer-needed kthreads. */ - rcu_boost_kthread_setaffinity(rnp, -1); - // Stop-machine done, so allow nohz_full to disable tick. - tick_dep_clear(TICK_DEP_BIT_RCU); - return 0; -} - /* * Invoke any RCU callbacks that have made it to the end of their grace * period. Throttle as specified by rdp->blimit. @@ -2209,7 +2057,7 @@ static void rcu_do_batch(struct rcu_data *rdp) */ rcu_nocb_lock_irqsave(rdp, flags); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); - pending = rcu_segcblist_n_cbs(&rdp->cblist); + pending = rcu_segcblist_get_seglen(&rdp->cblist, RCU_DONE_TAIL); div = READ_ONCE(rcu_divisor); div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; bl = max(rdp->blimit, pending >> div); @@ -2727,10 +2575,11 @@ static void check_cb_ovld(struct rcu_data *rdp) } static void -__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy) +__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) { static atomic_t doublefrees; unsigned long flags; + bool lazy; struct rcu_data *rdp; bool was_alldone; @@ -2755,6 +2604,7 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy) kasan_record_aux_stack_noalloc(head); local_irq_save(flags); rdp = this_cpu_ptr(&rcu_data); + lazy = lazy_in && !rcu_async_should_hurry(); /* Add the callback to our list. */ if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) { @@ -2876,13 +2726,15 @@ EXPORT_SYMBOL_GPL(call_rcu); /** * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers + * @list: List node. 
All blocks are linked between each other + * @gp_snap: Snapshot of RCU state for objects placed to this bulk * @nr_records: Number of active pointers in the array - * @next: Next bulk object in the block chain * @records: Array of the kvfree_rcu() pointers */ struct kvfree_rcu_bulk_data { + struct list_head list; + unsigned long gp_snap; unsigned long nr_records; - struct kvfree_rcu_bulk_data *next; void *records[]; }; @@ -2898,26 +2750,28 @@ struct kvfree_rcu_bulk_data { * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period * @head_free: List of kfree_rcu() objects waiting for a grace period - * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period + * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period * @krcp: Pointer to @kfree_rcu_cpu structure */ struct kfree_rcu_cpu_work { struct rcu_work rcu_work; struct rcu_head *head_free; - struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS]; + struct list_head bulk_head_free[FREE_N_CHANNELS]; struct kfree_rcu_cpu *krcp; }; /** * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period * @head: List of kfree_rcu() objects not yet waiting for a grace period - * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period + * @head_gp_snap: Snapshot of RCU state for objects placed to "@head" + * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES * @initialized: The @rcu_work fields have been initialized - * @count: Number of objects for which GP not started + * @head_count: Number of objects in rcu_head singular list + * @bulk_count: Number of objects in bulk-list * @bkvcache: * A simple cache list that contains objects for reuse purpose. * In order to save some per-cpu space the list is singular. @@ -2935,13 +2789,20 @@ struct kfree_rcu_cpu_work { * the interactions with the slab allocators. */ struct kfree_rcu_cpu { + // Objects queued on a linked list + // through their rcu_head structures. struct rcu_head *head; - struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS]; + unsigned long head_gp_snap; + atomic_t head_count; + + // Objects queued on a bulk-list. + struct list_head bulk_head[FREE_N_CHANNELS]; + atomic_t bulk_count[FREE_N_CHANNELS]; + struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; raw_spinlock_t lock; struct delayed_work monitor_work; bool initialized; - int count; struct delayed_work page_cache_work; atomic_t backoff_page_cache_fill; @@ -3029,29 +2890,87 @@ drain_page_cache(struct kfree_rcu_cpu *krcp) return freed; } +static void +kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp, + struct kvfree_rcu_bulk_data *bnode, int idx) +{ + unsigned long flags; + int i; + + debug_rcu_bhead_unqueue(bnode); + + rcu_lock_acquire(&rcu_callback_map); + if (idx == 0) { // kmalloc() / kfree(). + trace_rcu_invoke_kfree_bulk_callback( + rcu_state.name, bnode->nr_records, + bnode->records); + + kfree_bulk(bnode->nr_records, bnode->records); + } else { // vmalloc() / vfree(). 
+ for (i = 0; i < bnode->nr_records; i++) { + trace_rcu_invoke_kvfree_callback( + rcu_state.name, bnode->records[i], 0); + + vfree(bnode->records[i]); + } + } + rcu_lock_release(&rcu_callback_map); + + raw_spin_lock_irqsave(&krcp->lock, flags); + if (put_cached_bnode(krcp, bnode)) + bnode = NULL; + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + if (bnode) + free_page((unsigned long) bnode); + + cond_resched_tasks_rcu_qs(); +} + +static void +kvfree_rcu_list(struct rcu_head *head) +{ + struct rcu_head *next; + + for (; head; head = next) { + void *ptr = (void *) head->func; + unsigned long offset = (void *) head - ptr; + + next = head->next; + debug_rcu_head_unqueue((struct rcu_head *)ptr); + rcu_lock_acquire(&rcu_callback_map); + trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset); + + if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) + kvfree(ptr); + + rcu_lock_release(&rcu_callback_map); + cond_resched_tasks_rcu_qs(); + } +} + /* * This function is invoked in workqueue context after a grace period. - * It frees all the objects queued on ->bkvhead_free or ->head_free. + * It frees all the objects queued on ->bulk_head_free or ->head_free. */ static void kfree_rcu_work(struct work_struct *work) { unsigned long flags; - struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext; - struct rcu_head *head, *next; + struct kvfree_rcu_bulk_data *bnode, *n; + struct list_head bulk_head[FREE_N_CHANNELS]; + struct rcu_head *head; struct kfree_rcu_cpu *krcp; struct kfree_rcu_cpu_work *krwp; - int i, j; + int i; krwp = container_of(to_rcu_work(work), - struct kfree_rcu_cpu_work, rcu_work); + struct kfree_rcu_cpu_work, rcu_work); krcp = krwp->krcp; raw_spin_lock_irqsave(&krcp->lock, flags); // Channels 1 and 2. - for (i = 0; i < FREE_N_CHANNELS; i++) { - bkvhead[i] = krwp->bkvhead_free[i]; - krwp->bkvhead_free[i] = NULL; - } + for (i = 0; i < FREE_N_CHANNELS; i++) + list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]); // Channel 3. head = krwp->head_free; @@ -3060,39 +2979,9 @@ static void kfree_rcu_work(struct work_struct *work) // Handle the first two channels. for (i = 0; i < FREE_N_CHANNELS; i++) { - for (; bkvhead[i]; bkvhead[i] = bnext) { - bnext = bkvhead[i]->next; - debug_rcu_bhead_unqueue(bkvhead[i]); - - rcu_lock_acquire(&rcu_callback_map); - if (i == 0) { // kmalloc() / kfree(). - trace_rcu_invoke_kfree_bulk_callback( - rcu_state.name, bkvhead[i]->nr_records, - bkvhead[i]->records); - - kfree_bulk(bkvhead[i]->nr_records, - bkvhead[i]->records); - } else { // vmalloc() / vfree(). - for (j = 0; j < bkvhead[i]->nr_records; j++) { - trace_rcu_invoke_kvfree_callback( - rcu_state.name, - bkvhead[i]->records[j], 0); - - vfree(bkvhead[i]->records[j]); - } - } - rcu_lock_release(&rcu_callback_map); - - raw_spin_lock_irqsave(&krcp->lock, flags); - if (put_cached_bnode(krcp, bkvhead[i])) - bkvhead[i] = NULL; - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - if (bkvhead[i]) - free_page((unsigned long) bkvhead[i]); - - cond_resched_tasks_rcu_qs(); - } + // Start from the tail page, so a GP is likely passed for it. + list_for_each_entry_safe(bnode, n, &bulk_head[i], list) + kvfree_rcu_bulk(krcp, bnode, i); } /* @@ -3102,21 +2991,7 @@ static void kfree_rcu_work(struct work_struct *work) * queued on a linked list through their rcu_head structures. * This list is named "Channel 3". 
*/ - for (; head; head = next) { - unsigned long offset = (unsigned long)head->func; - void *ptr = (void *)head - offset; - - next = head->next; - debug_rcu_head_unqueue((struct rcu_head *)ptr); - rcu_lock_acquire(&rcu_callback_map); - trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset); - - if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) - kvfree(ptr); - - rcu_lock_release(&rcu_callback_map); - cond_resched_tasks_rcu_qs(); - } + kvfree_rcu_list(head); } static bool @@ -3125,10 +3000,21 @@ need_offload_krc(struct kfree_rcu_cpu *krcp) int i; for (i = 0; i < FREE_N_CHANNELS; i++) - if (krcp->bkvhead[i]) + if (!list_empty(&krcp->bulk_head[i])) return true; - return !!krcp->head; + return !!READ_ONCE(krcp->head); +} + +static int krc_count(struct kfree_rcu_cpu *krcp) +{ + int sum = atomic_read(&krcp->head_count); + int i; + + for (i = 0; i < FREE_N_CHANNELS; i++) + sum += atomic_read(&krcp->bulk_count[i]); + + return sum; } static void @@ -3136,7 +3022,7 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) { long delay, delay_left; - delay = READ_ONCE(krcp->count) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES; + delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES; if (delayed_work_pending(&krcp->monitor_work)) { delay_left = krcp->monitor_work.timer.expires - jiffies; if (delay < delay_left) @@ -3146,6 +3032,44 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) queue_delayed_work(system_wq, &krcp->monitor_work, delay); } +static void +kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp) +{ + struct list_head bulk_ready[FREE_N_CHANNELS]; + struct kvfree_rcu_bulk_data *bnode, *n; + struct rcu_head *head_ready = NULL; + unsigned long flags; + int i; + + raw_spin_lock_irqsave(&krcp->lock, flags); + for (i = 0; i < FREE_N_CHANNELS; i++) { + INIT_LIST_HEAD(&bulk_ready[i]); + + list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) { + if (!poll_state_synchronize_rcu(bnode->gp_snap)) + break; + + atomic_sub(bnode->nr_records, &krcp->bulk_count[i]); + list_move(&bnode->list, &bulk_ready[i]); + } + } + + if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) { + head_ready = krcp->head; + atomic_set(&krcp->head_count, 0); + WRITE_ONCE(krcp->head, NULL); + } + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + for (i = 0; i < FREE_N_CHANNELS; i++) { + list_for_each_entry_safe(bnode, n, &bulk_ready[i], list) + kvfree_rcu_bulk(krcp, bnode, i); + } + + if (head_ready) + kvfree_rcu_list(head_ready); +} + /* * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. */ @@ -3156,26 +3080,31 @@ static void kfree_rcu_monitor(struct work_struct *work) unsigned long flags; int i, j; + // Drain ready for reclaim. + kvfree_rcu_drain_ready(krcp); + raw_spin_lock_irqsave(&krcp->lock, flags); // Attempt to start a new batch. for (i = 0; i < KFREE_N_BATCHES; i++) { struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]); - // Try to detach bkvhead or head and attach it over any + // Try to detach bulk_head or head and attach it over any // available corresponding free channel. It can be that // a previous RCU batch is in progress, it means that // immediately to queue another one is not possible so // in that case the monitor work is rearmed. 
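For reference, the kvfree_rcu_drain_ready() logic added above relies on the polled grace-period cookie pattern: record a cookie with get_state_synchronize_rcu() when an object is retired, and later reclaim it without blocking only if poll_state_synchronize_rcu() reports that a full grace period has already elapsed. A minimal sketch of that pattern, independent of the kvfree_rcu() machinery (struct deferred_obj, queue_deferred(), and drain_deferred() are illustrative names only, not part of this patch):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct deferred_obj {
	void *ptr;
	unsigned long gp_snap;	/* cookie from get_state_synchronize_rcu() */
};

/* Queue side: take the grace-period cookie at the time the object is retired. */
static void queue_deferred(struct deferred_obj *d, void *ptr)
{
	d->ptr = ptr;
	d->gp_snap = get_state_synchronize_rcu();
}

/* Reclaim side: free only if a grace period has already elapsed. */
static bool drain_deferred(struct deferred_obj *d)
{
	if (!poll_state_synchronize_rcu(d->gp_snap))
		return false;	/* not yet safe; retry later, as the monitor work does */

	kfree(d->ptr);
	d->ptr = NULL;
	return true;
}
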
- if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) || - (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) || - (krcp->head && !krwp->head_free)) { + if ((!list_empty(&krcp->bulk_head[0]) && list_empty(&krwp->bulk_head_free[0])) || + (!list_empty(&krcp->bulk_head[1]) && list_empty(&krwp->bulk_head_free[1])) || + (READ_ONCE(krcp->head) && !krwp->head_free)) { + // Channel 1 corresponds to the SLAB-pointer bulk path. // Channel 2 corresponds to vmalloc-pointer bulk path. for (j = 0; j < FREE_N_CHANNELS; j++) { - if (!krwp->bkvhead_free[j]) { - krwp->bkvhead_free[j] = krcp->bkvhead[j]; - krcp->bkvhead[j] = NULL; + if (list_empty(&krwp->bulk_head_free[j])) { + atomic_set(&krcp->bulk_count[j], 0); + list_replace_init(&krcp->bulk_head[j], + &krwp->bulk_head_free[j]); } } @@ -3183,11 +3112,10 @@ static void kfree_rcu_monitor(struct work_struct *work) // objects queued on the linked list. if (!krwp->head_free) { krwp->head_free = krcp->head; - krcp->head = NULL; + atomic_set(&krcp->head_count, 0); + WRITE_ONCE(krcp->head, NULL); } - WRITE_ONCE(krcp->count, 0); - // One work is per one batch, so there are three // "free channels", the batch can handle. It can // be that the work is in the pending state when @@ -3197,6 +3125,8 @@ static void kfree_rcu_monitor(struct work_struct *work) } } + raw_spin_unlock_irqrestore(&krcp->lock, flags); + // If there is nothing to detach, it means that our job is // successfully done here. In case of having at least one // of the channels that is still busy we should rearm the @@ -3204,8 +3134,6 @@ static void kfree_rcu_monitor(struct work_struct *work) // still in progress. if (need_offload_krc(krcp)) schedule_delayed_monitor_work(krcp); - - raw_spin_unlock_irqrestore(&krcp->lock, flags); } static enum hrtimer_restart @@ -3288,10 +3216,11 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, return false; idx = !!is_vmalloc_addr(ptr); + bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx], + struct kvfree_rcu_bulk_data, list); /* Check if a new block is required. */ - if (!(*krcp)->bkvhead[idx] || - (*krcp)->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) { + if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) { bnode = get_cached_bnode(*krcp); if (!bnode && can_alloc) { krc_this_cpu_unlock(*krcp, *flags); @@ -3315,17 +3244,15 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, if (!bnode) return false; - /* Initialize the new block. */ + // Initialize the new block and attach it. bnode->nr_records = 0; - bnode->next = (*krcp)->bkvhead[idx]; - - /* Attach it to the head. */ - (*krcp)->bkvhead[idx] = bnode; + list_add(&bnode->list, &(*krcp)->bulk_head[idx]); } - /* Finally insert. */ - (*krcp)->bkvhead[idx]->records - [(*krcp)->bkvhead[idx]->nr_records++] = ptr; + // Finally insert and update the GP for this page. + bnode->records[bnode->nr_records++] = ptr; + bnode->gp_snap = get_state_synchronize_rcu(); + atomic_inc(&(*krcp)->bulk_count[idx]); return true; } @@ -3342,26 +3269,21 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, * be free'd in workqueue context. This allows us to: batch requests together to * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load. 
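As a caller-side illustration of the interface reworked below, here is a hedged sketch of the two kvfree_rcu() forms that end up in kvfree_call_rcu(); struct foo and example_free() are illustrative names only, and both pointers are assumed to come from the kmalloc()/vmalloc() families. The double-argument form passes an embedded rcu_head and may be used wherever call_rcu() may be used, while the head-less single-argument form may block and is therefore limited to sleepable context, as the comment moved below spells out.

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int data;
	struct rcu_head rh;	/* embedded rcu_head for the two-argument form */
};

static void example_free(struct foo *fp, char *buf)
{
	/* Two-argument form: rcu_head embedded in the object, never sleeps. */
	kvfree_rcu(fp, rh);

	/* Head-less single-argument form: may sleep, so sleepable context only. */
	kvfree_rcu(buf);
}
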
*/ -void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +void kvfree_call_rcu(struct rcu_head *head, void *ptr) { unsigned long flags; struct kfree_rcu_cpu *krcp; bool success; - void *ptr; - if (head) { - ptr = (void *) head - (unsigned long) func; - } else { - /* - * Please note there is a limitation for the head-less - * variant, that is why there is a clear rule for such - * objects: it can be used from might_sleep() context - * only. For other places please embed an rcu_head to - * your data. - */ + /* + * Please note there is a limitation for the head-less + * variant, that is why there is a clear rule for such + * objects: it can be used from might_sleep() context + * only. For other places please embed an rcu_head to + * your data. + */ + if (!head) might_sleep(); - ptr = (unsigned long *) func; - } // Queue the object but don't yet schedule the batch. if (debug_rcu_head_queue(ptr)) { @@ -3382,14 +3304,16 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) // Inline if kvfree_rcu(one_arg) call. goto unlock_return; - head->func = func; + head->func = ptr; head->next = krcp->head; - krcp->head = head; + WRITE_ONCE(krcp->head, head); + atomic_inc(&krcp->head_count); + + // Take a snapshot for this krcp. + krcp->head_gp_snap = get_state_synchronize_rcu(); success = true; } - WRITE_ONCE(krcp->count, krcp->count + 1); - // Set timer to drain after KFREE_DRAIN_JIFFIES. if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) schedule_delayed_monitor_work(krcp); @@ -3420,7 +3344,7 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - count += READ_ONCE(krcp->count); + count += krc_count(krcp); count += READ_ONCE(krcp->nr_bkv_objs); atomic_set(&krcp->backoff_page_cache_fill, 1); } @@ -3437,7 +3361,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) int count; struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - count = krcp->count; + count = krc_count(krcp); count += drain_page_cache(krcp); kfree_rcu_monitor(&krcp->monitor_work.work); @@ -3461,15 +3385,12 @@ static struct shrinker kfree_rcu_shrinker = { void __init kfree_rcu_scheduler_running(void) { int cpu; - unsigned long flags; for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - raw_spin_lock_irqsave(&krcp->lock, flags); if (need_offload_krc(krcp)) schedule_delayed_monitor_work(krcp); - raw_spin_unlock_irqrestore(&krcp->lock, flags); } } @@ -3485,9 +3406,10 @@ void __init kfree_rcu_scheduler_running(void) */ static int rcu_blocking_is_gp(void) { - if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) + if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) { + might_sleep(); return false; - might_sleep(); /* Check for RCU read-side critical section. */ + } return true; } @@ -3711,7 +3633,9 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full); * If @false is returned, it is the caller's responsibility to invoke this * function later on until it does return @true. Alternatively, the caller * can explicitly wait for a grace period, for example, by passing @oldstate - * to cond_synchronize_rcu() or by directly invoking synchronize_rcu(). + * to either cond_synchronize_rcu() or cond_synchronize_rcu_expedited() + * on the one hand or by directly invoking either synchronize_rcu() or + * synchronize_rcu_expedited() on the other. * * Yes, this function does not take counter wrap into account. * But counter wrap is harmless. 
If the counter wraps, we have waited for @@ -3722,6 +3646,12 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full); * completed. Alternatively, they can use get_completed_synchronize_rcu() * to get a guaranteed-completed grace-period state. * + * In addition, because oldstate compresses the grace-period state for + * both normal and expedited grace periods into a single unsigned long, + * it can miss a grace period when synchronize_rcu() runs concurrently + * with synchronize_rcu_expedited(). If this is unacceptable, please + * instead use the _full() variant of these polling APIs. + * * This function provides the same memory-ordering guarantees that * would be provided by a synchronize_rcu() that was invoked at the call * to the function that provided @oldstate, and that returned at the end @@ -4079,6 +4009,155 @@ void rcu_barrier(void) } EXPORT_SYMBOL_GPL(rcu_barrier); +/* + * Compute the mask of online CPUs for the specified rcu_node structure. + * This will not be stable unless the rcu_node structure's ->lock is + * held, but the bit corresponding to the current CPU will be stable + * in most contexts. + */ +static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) +{ + return READ_ONCE(rnp->qsmaskinitnext); +} + +/* + * Is the CPU corresponding to the specified rcu_data structure online + * from RCU's perspective? This perspective is given by that structure's + * ->qsmaskinitnext field rather than by the global cpu_online_mask. + */ +static bool rcu_rdp_cpu_online(struct rcu_data *rdp) +{ + return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode)); +} + +#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) + +/* + * Is the current CPU online as far as RCU is concerned? + * + * Disable preemption to avoid false positives that could otherwise + * happen due to the current CPU number being sampled, this task being + * preempted, its old CPU being taken offline, resuming on some other CPU, + * then determining that its old CPU is now offline. + * + * Disable checking if in an NMI handler because we cannot safely + * report errors from NMI handlers anyway. In addition, it is OK to use + * RCU on an offline processor during initial boot, hence the check for + * rcu_scheduler_fully_active. + */ +bool rcu_lockdep_current_cpu_online(void) +{ + struct rcu_data *rdp; + bool ret = false; + + if (in_nmi() || !rcu_scheduler_fully_active) + return true; + preempt_disable_notrace(); + rdp = this_cpu_ptr(&rcu_data); + /* + * Strictly, we care here about the case where the current CPU is + * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask + * not being up to date. So arch_spin_is_locked() might have a + * false positive if it's held by some *other* CPU, but that's + * OK because that just means a false *negative* on the warning. + */ + if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock)) + ret = true; + preempt_enable_notrace(); + return ret; +} +EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); + +#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ + +// Has rcu_init() been invoked? This is used (for example) to determine +// whether spinlocks may be acquired safely. +static bool rcu_init_invoked(void) +{ + return !!rcu_state.n_online_cpus; +} + +/* + * Near the end of the offline process. Trace the fact that this CPU + * is going offline. 
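A short sketch of the full-state alternative recommended by the updated poll_state_synchronize_rcu() comment above, for callers that cannot tolerate the compressed cookie missing a grace period when normal and expedited grace periods run concurrently; the snap variable and function names are illustrative only, not from this patch.

#include <linux/rcupdate.h>

static struct rcu_gp_oldstate snap;	/* illustrative full-state cookie */

static void take_full_snapshot(void)
{
	get_state_synchronize_rcu_full(&snap);
}

static void wait_for_snapshot(void)
{
	/* Block only if the recorded grace period has not yet completed. */
	if (!poll_state_synchronize_rcu_full(&snap))
		synchronize_rcu();
}
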
+ */ +int rcutree_dying_cpu(unsigned int cpu) +{ + bool blkd; + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + struct rcu_node *rnp = rdp->mynode; + + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return 0; + + blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask); + trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), + blkd ? TPS("cpuofl-bgp") : TPS("cpuofl")); + return 0; +} + +/* + * All CPUs for the specified rcu_node structure have gone offline, + * and all tasks that were preempted within an RCU read-side critical + * section while running on one of those CPUs have since exited their RCU + * read-side critical section. Some other CPU is reporting this fact with + * the specified rcu_node structure's ->lock held and interrupts disabled. + * This function therefore goes up the tree of rcu_node structures, + * clearing the corresponding bits in the ->qsmaskinit fields. Note that + * the leaf rcu_node structure's ->qsmaskinit field has already been + * updated. + * + * This function does check that the specified rcu_node structure has + * all CPUs offline and no blocked tasks, so it is OK to invoke it + * prematurely. That said, invoking it after the fact will cost you + * a needless lock acquisition. So once it has done its work, don't + * invoke it again. + */ +static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) +{ + long mask; + struct rcu_node *rnp = rnp_leaf; + + raw_lockdep_assert_held_rcu_node(rnp_leaf); + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || + WARN_ON_ONCE(rnp_leaf->qsmaskinit) || + WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf))) + return; + for (;;) { + mask = rnp->grpmask; + rnp = rnp->parent; + if (!rnp) + break; + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ + rnp->qsmaskinit &= ~mask; + /* Between grace periods, so better already be zero! */ + WARN_ON_ONCE(rnp->qsmask); + if (rnp->qsmaskinit) { + raw_spin_unlock_rcu_node(rnp); + /* irqs remain disabled. */ + return; + } + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ + } +} + +/* + * The CPU has been completely removed, and some other CPU is reporting + * this fact from process context. Do the remainder of the cleanup. + * There can only be one CPU hotplug operation at a time, so no need for + * explicit locking. + */ +int rcutree_dead_cpu(unsigned int cpu) +{ + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return 0; + + WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); + // Stop-machine done, so allow nohz_full to disable tick. + tick_dep_clear(TICK_DEP_BIT_RCU); + return 0; +} + /* * Propagate ->qsinitmask bits up the rcu_node tree to account for the * first CPU in a given leaf rcu_node structure coming online. The caller @@ -4408,11 +4487,13 @@ static int rcu_pm_notify(struct notifier_block *self, switch (action) { case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: + rcu_async_hurry(); rcu_expedite_gp(); break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: rcu_unexpedite_gp(); + rcu_async_relax(); break; default: break; @@ -4766,7 +4847,7 @@ struct workqueue_struct *rcu_gp_wq; static void __init kfree_rcu_batch_init(void) { int cpu; - int i; + int i, j; /* Clamp it to [0:100] seconds interval. 
*/ if (rcu_delay_page_cache_fill_msec < 0 || @@ -4786,8 +4867,14 @@ static void __init kfree_rcu_batch_init(void) for (i = 0; i < KFREE_N_BATCHES; i++) { INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); krcp->krw_arr[i].krcp = krcp; + + for (j = 0; j < FREE_N_CHANNELS; j++) + INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]); } + for (i = 0; i < FREE_N_CHANNELS; i++) + INIT_LIST_HEAD(&krcp->bulk_head[i]); + INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func); krcp->initialized = true; @@ -4838,6 +4925,8 @@ void __init rcu_init(void) // Kick-start any polled grace periods that started early. if (!(per_cpu_ptr(&rcu_data, cpu)->mynode->exp_seq_poll_rq & 0x1)) (void)start_poll_synchronize_rcu_expedited(); + + rcu_test_sync_prims(); } #include "tree_stall.h" diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index ed6c3cce28f23..249c2967d9e6c 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -11,6 +11,7 @@ static void rcu_exp_handler(void *unused); static int rcu_print_task_exp_stall(struct rcu_node *rnp); +static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp); /* * Record the start of an expedited grace period. @@ -667,8 +668,11 @@ static void synchronize_rcu_expedited_wait(void) mask = leaf_node_cpu_bit(rnp, cpu); if (!(READ_ONCE(rnp->expmask) & mask)) continue; + preempt_disable(); // For smp_processor_id() in dump_cpu_task(). dump_cpu_task(cpu); + preempt_enable(); } + rcu_exp_print_detail_task_stall_rnp(rnp); } jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3; panic_on_rcu_stall(); @@ -811,6 +815,36 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) return ndetected; } +/* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, dumping the stack of each that is blocking the current + * expedited grace period. + */ +static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp) +{ + unsigned long flags; + struct task_struct *t; + + if (!rcu_exp_stall_task_details) + return; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (!READ_ONCE(rnp->exp_tasks)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + t = list_entry(rnp->exp_tasks->prev, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + /* + * We could be printing a lot while holding a spinlock. + * Avoid triggering hard lockup. + */ + touch_nmi_watchdog(); + sched_show_task(t); + } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +} + #else /* #ifdef CONFIG_PREEMPT_RCU */ /* Request an expedited quiescent state. */ @@ -883,6 +917,15 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) return 0; } +/* + * Because preemptible RCU does not exist, we never have to print out + * tasks blocked within RCU read-side critical sections that are blocking + * the current expedited grace period. 
+ */ +static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp) +{ +} + #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /** diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f5e6a2f95a2a0..624ff75e08ab1 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -144,8 +144,45 @@ bool rcu_gp_is_normal(void) } EXPORT_SYMBOL_GPL(rcu_gp_is_normal); -static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1); +static atomic_t rcu_async_hurry_nesting = ATOMIC_INIT(1); +/* + * Should call_rcu() callbacks be processed with urgency or are + * they OK being executed with arbitrary delays? + */ +bool rcu_async_should_hurry(void) +{ + return !IS_ENABLED(CONFIG_RCU_LAZY) || + atomic_read(&rcu_async_hurry_nesting); +} +EXPORT_SYMBOL_GPL(rcu_async_should_hurry); + +/** + * rcu_async_hurry - Make future async RCU callbacks not lazy. + * + * After a call to this function, future calls to call_rcu() + * will be processed in a timely fashion. + */ +void rcu_async_hurry(void) +{ + if (IS_ENABLED(CONFIG_RCU_LAZY)) + atomic_inc(&rcu_async_hurry_nesting); +} +EXPORT_SYMBOL_GPL(rcu_async_hurry); +/** + * rcu_async_relax - Make future async RCU callbacks lazy. + * + * After a call to this function, future calls to call_rcu() + * will be processed in a lazy fashion. + */ +void rcu_async_relax(void) +{ + if (IS_ENABLED(CONFIG_RCU_LAZY)) + atomic_dec(&rcu_async_hurry_nesting); +} +EXPORT_SYMBOL_GPL(rcu_async_relax); + +static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1); /* * Should normal grace-period primitives be expedited? Intended for * use within RCU. Note that this function takes the rcu_expedited @@ -195,6 +232,7 @@ static bool rcu_boot_ended __read_mostly; void rcu_end_inkernel_boot(void) { rcu_unexpedite_gp(); + rcu_async_relax(); if (rcu_normal_after_boot) WRITE_ONCE(rcu_normal, 1); rcu_boot_ended = true; @@ -220,6 +258,7 @@ void rcu_test_sync_prims(void) { if (!IS_ENABLED(CONFIG_PROVE_RCU)) return; + pr_info("Running RCU synchronous self tests\n"); synchronize_rcu(); synchronize_rcu_expedited(); } @@ -508,6 +547,8 @@ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_timeout, int, 0644); int rcu_exp_cpu_stall_timeout __read_mostly = CONFIG_RCU_EXP_CPU_STALL_TIMEOUT; module_param(rcu_exp_cpu_stall_timeout, int, 0644); +bool rcu_exp_stall_task_details __read_mostly; +module_param(rcu_exp_stall_task_details, bool, 0644); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ // Suppress boot-time RCU CPU stall warnings and rcutorture writer stall @@ -555,9 +596,12 @@ struct early_boot_kfree_rcu { static void early_boot_test_call_rcu(void) { static struct rcu_head head; + int idx; static struct rcu_head shead; struct early_boot_kfree_rcu *rhp; + idx = srcu_down_read(&early_srcu); + srcu_up_read(&early_srcu, idx); call_rcu(&head, test_callback); early_srcu_cookie = start_poll_synchronize_srcu(&early_srcu); call_srcu(&early_srcu, &shead, test_callback); @@ -586,6 +630,7 @@ static int rcu_verify_early_boot_tests(void) early_boot_test_counter++; srcu_barrier(&early_srcu); WARN_ON_ONCE(!poll_state_synchronize_srcu(&early_srcu, early_srcu_cookie)); + cleanup_srcu_struct(&early_srcu); } if (rcu_self_test_counter != early_boot_test_counter) { WARN_ON(1); diff --git a/kernel/torture.c b/kernel/torture.c index 789aeb0e1159c..1a0519b836ac9 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -450,7 +450,7 @@ unsigned long torture_random(struct torture_random_state *trsp) { if (--trsp->trs_count < 0) { - trsp->trs_state += (unsigned long)local_clock(); + 
trsp->trs_state += (unsigned long)local_clock() + raw_smp_processor_id(); trsp->trs_count = TORTURE_RANDOM_REFRESH; } trsp->trs_state = trsp->trs_state * TORTURE_RANDOM_MULT + @@ -915,7 +915,7 @@ void torture_kthread_stopping(char *title) VERBOSE_TOROUT_STRING(buf); while (!kthread_should_stop()) { torture_shutdown_absorb(title); - schedule_timeout_uninterruptible(1); + schedule_timeout_uninterruptible(HZ / 20); } } EXPORT_SYMBOL_GPL(torture_kthread_stopping); diff --git a/tools/testing/selftests/rcutorture/bin/configcheck.sh b/tools/testing/selftests/rcutorture/bin/configcheck.sh index 83fac1852ab23..b92dfeb7fbbfe 100755 --- a/tools/testing/selftests/rcutorture/bin/configcheck.sh +++ b/tools/testing/selftests/rcutorture/bin/configcheck.sh @@ -10,10 +10,9 @@ T="`mktemp -d ${TMPDIR-/tmp}/configcheck.sh.XXXXXX`" trap 'rm -rf $T' 0 -cat $1 > $T/.config +sed -e 's/"//g' < $1 > $T/.config -cat $2 | sed -e 's/\(.*\)=n/# \1 is not set/' -e 's/^#CHECK#//' | -grep -v '^CONFIG_INITRAMFS_SOURCE' | +sed -e 's/"//g' -e 's/\(.*\)=n/# \1 is not set/' -e 's/^#CHECK#//' < $2 | awk ' { print "if grep -q \"" $0 "\" < '"$T/.config"'"; diff --git a/tools/testing/selftests/rcutorture/bin/console-badness.sh b/tools/testing/selftests/rcutorture/bin/console-badness.sh index 69f8a5958cefc..aad51e7c0183d 100755 --- a/tools/testing/selftests/rcutorture/bin/console-badness.sh +++ b/tools/testing/selftests/rcutorture/bin/console-badness.sh @@ -10,7 +10,7 @@ # # Authors: Paul E. McKenney -egrep 'Badness|WARNING:|Warn|BUG|===========|BUG: KCSAN:|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state|rcu_.*kthread starved for|!!!' | +grep -E 'Badness|WARNING:|Warn|BUG|===========|BUG: KCSAN:|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state|rcu_.*kthread starved for|!!!' | grep -v 'ODEBUG: ' | grep -v 'This means that this is a DEBUG kernel and it is' | grep -v 'Warning: unable to open an initial console' | diff --git a/tools/testing/selftests/rcutorture/bin/kvm-build.sh b/tools/testing/selftests/rcutorture/bin/kvm-build.sh index e28a82851f7c4..11f8d232b0ee7 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-build.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-build.sh @@ -44,10 +44,10 @@ fi ncpus="`getconf _NPROCESSORS_ONLN`" make -j$((2 * ncpus)) $TORTURE_KMAKE_ARG > $resdir/Make.out 2>&1 retval=$? -if test $retval -ne 0 || grep "rcu[^/]*": < $resdir/Make.out | egrep -q "Stop|Error|error:|warning:" || egrep -q "Stop|Error|error:" < $resdir/Make.out +if test $retval -ne 0 || grep "rcu[^/]*": < $resdir/Make.out | grep -E -q "Stop|Error|error:|warning:" || grep -E -q "Stop|Error|error:" < $resdir/Make.out then echo Kernel build error - egrep "Stop|Error|error:|warning:" < $resdir/Make.out + grep -E "Stop|Error|error:|warning:" < $resdir/Make.out echo Run aborted. 
exit 3 fi diff --git a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh index 88983cba79563..28981007465bd 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh @@ -32,11 +32,11 @@ for i in ${rundir}/*/Make.out do scenariodir="`dirname $i`" scenariobasedir="`echo ${scenariodir} | sed -e 's/\.[0-9]*$//'`" - if egrep -q "error:|warning:|^ld: .*undefined reference to" < $i + if grep -E -q "error:|warning:|^ld: .*undefined reference to" < $i then - egrep "error:|warning:|^ld: .*undefined reference to" < $i > $i.diags + grep -E "error:|warning:|^ld: .*undefined reference to" < $i > $i.diags files="$files $i.diags $i" - elif ! test -f ${scenariobasedir}/vmlinux && ! test -f "${rundir}/re-run" + elif ! test -f ${scenariobasedir}/vmlinux && ! test -f ${scenariobasedir}/vmlinux.xz && ! test -f "${rundir}/re-run" then echo No ${scenariobasedir}/vmlinux file > $i.diags files="$files $i.diags $i" diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 7710b1e1cddab..62f3b0f56e4d7 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -186,7 +186,7 @@ do fi ;; --kconfig|--kconfigs) - checkarg --kconfig "(Kconfig options)" $# "$2" '^CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\)\( CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\)\)*$' '^error$' + checkarg --kconfig "(Kconfig options)" $# "$2" '^CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\|"[^"]*"\)\( CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\|"[^"]*"\)\)*$' '^error$' TORTURE_KCONFIG_ARG="`echo "$TORTURE_KCONFIG_ARG $2" | sed -e 's/^ *//' -e 's/ *$//'`" shift ;; @@ -585,7 +585,7 @@ awk < $T/cfgcpu.pack \ echo kvm-end-run-stats.sh "$resdir/$ds" "$starttime" >> $T/script # Extract the tests and their batches from the script. -egrep 'Start batch|Starting build\.' $T/script | grep -v ">>" | +grep -E 'Start batch|Starting build\.' $T/script | grep -v ">>" | sed -e 's/:.*$//' -e 's/^echo //' -e 's/-ovf//' | awk ' /^----Start/ { @@ -622,7 +622,7 @@ then elif test "$dryrun" = sched then # Extract the test run schedule from the script. - egrep 'Start batch|Starting build\.' $T/script | grep -v ">>" | + grep -E 'Start batch|Starting build\.' $T/script | grep -v ">>" | sed -e 's/:.*$//' -e 's/^echo //' nbuilds="`grep 'Starting build\.' $T/script | grep -v ">>" | sed -e 's/:.*$//' -e 's/^echo //' | diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index 822eb037a0573..9ab0f6bc172cd 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -65,7 +65,7 @@ then fi grep --binary-files=text 'torture:.*ver:' $file | - egrep --binary-files=text -v '\(null\)|rtc: 000000000* ' | + grep -E --binary-files=text -v '\(null\)|rtc: 000000000* ' | sed -e 's/^(initramfs)[^]]*] //' -e 's/^\[[^]]*] //' | sed -e 's/^.*ver: //' | awk ' @@ -128,17 +128,17 @@ then then summary="$summary Badness: $n_badness" fi - n_warn=`grep -v 'Warning: unable to open an initial console' $file | grep -v 'Warning: Failed to add ttynull console. No stdin, stdout, and stderr for the init process' | egrep -c 'WARNING:|Warn'` + n_warn=`grep -v 'Warning: unable to open an initial console' $file | grep -v 'Warning: Failed to add ttynull console. 
No stdin, stdout, and stderr for the init process' | grep -E -c 'WARNING:|Warn'` if test "$n_warn" -ne 0 then summary="$summary Warnings: $n_warn" fi - n_bugs=`egrep -c '\bBUG|Oops:' $file` + n_bugs=`grep -E -c '\bBUG|Oops:' $file` if test "$n_bugs" -ne 0 then summary="$summary Bugs: $n_bugs" fi - n_kcsan=`egrep -c 'BUG: KCSAN: ' $file` + n_kcsan=`grep -E -c 'BUG: KCSAN: ' $file` if test "$n_kcsan" -ne 0 then if test "$n_bugs" = "$n_kcsan" @@ -158,7 +158,7 @@ then then summary="$summary lockdep: $n_badness" fi - n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state' $file` + n_stalls=`grep -E -c 'detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state' $file` if test "$n_stalls" -ne 0 then summary="$summary Stalls: $n_stalls"