From bee9182d955227f01ff3b80c4cb6acca9bb40b11 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 19 Jul 2015 23:48:20 +0200 Subject: [PATCH 1/8] introduce __sb_writers_{acquired,release}() helpers Preparation to hide the sb->s_writers internals from xfs and btrfs. Add 2 trivial define's they can use rather than play with ->s_writers directly. No changes in btrfs/transaction.o and xfs/xfs_aops.o. Signed-off-by: Oleg Nesterov Reviewed-by: Jan Kara --- fs/btrfs/transaction.c | 8 ++------ fs/xfs/xfs_aops.c | 6 ++---- include/linux/fs.h | 5 +++++ 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f5021fcb154e3..a8ab8f5ef38e1 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1638,9 +1638,7 @@ static void do_async_commit(struct work_struct *work) * Tell lockdep about it. */ if (ac->newtrans->type & __TRANS_FREEZABLE) - rwsem_acquire_read( - &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); + __sb_writers_acquired(ac->root->fs_info->sb, SB_FREEZE_FS); current->journal_info = ac->newtrans; @@ -1679,9 +1677,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, * async commit thread will be the one to unlock it. */ if (ac->newtrans->type & __TRANS_FREEZABLE) - rwsem_release( - &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], - 1, _THIS_IP_); + __sb_writers_release(root->fs_info->sb, SB_FREEZE_FS); schedule_work(&ac->work); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3859f5e27a4dc..9bbb3507376a7 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -119,8 +119,7 @@ xfs_setfilesize_trans_alloc( * We may pass freeze protection with a transaction. So tell lockdep * we released it. */ - rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], - 1, _THIS_IP_); + __sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS); /* * We hand off the transaction to the completion thread now, so * clear the flag here. @@ -171,8 +170,7 @@ xfs_setfilesize_ioend( * Similarly for freeze protection. */ current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); - rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); + __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 84b783f277f76..acb7cad84edd2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1391,6 +1391,11 @@ extern struct timespec current_fs_time(struct super_block *sb); void __sb_end_write(struct super_block *sb, int level); int __sb_start_write(struct super_block *sb, int level, bool wait); +#define __sb_writers_acquired(sb, lev) \ + rwsem_acquire_read(&(sb)->s_writers.lock_map[(lev)-1], 0, 1, _THIS_IP_) +#define __sb_writers_release(sb, lev) \ + rwsem_release(&(sb)->s_writers.lock_map[(lev)-1], 1, _THIS_IP_) + /** * sb_end_write - drop write access to a superblock * @sb: the super we wrote to From f4b554af9931585174d4913b482eacab75858964 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 20 Jul 2015 00:50:55 +0200 Subject: [PATCH 2/8] fix the broken lockdep logic in __sb_start_write() 1. wait_event(frozen < level) without rwsem_acquire_read() is just wrong from lockdep perspective. If we are going to deadlock because the caller is buggy, lockdep can't detect this problem. 2. __sb_start_write() can race with thaw_super() + freeze_super(), and after "goto retry" the 2nd acquire_freeze_lock() is wrong. 3. The "tell lockdep we are doing trylock" hack doesn't look nice. I think this is correct, but this logic should be more explicit. Yes, the recursive read_lock() is fine if we hold the lock on a higher level. But we do not need to fool lockdep. If we can not deadlock in this case then try-lock must not fail and we can use use wait == F throughout this code. Note: as Dave Chinner explains, the "trylock" hack and the fat comment can be probably removed. But this needs a separate change and it will be trivial: just kill __sb_start_write() and rename do_sb_start_write() back to __sb_start_write(). Signed-off-by: Oleg Nesterov Reviewed-by: Jan Kara --- fs/super.c | 73 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/fs/super.c b/fs/super.c index b61372354f2bd..24a76bcd62a59 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1158,38 +1158,11 @@ void __sb_end_write(struct super_block *sb, int level) } EXPORT_SYMBOL(__sb_end_write); -#ifdef CONFIG_LOCKDEP -/* - * We want lockdep to tell us about possible deadlocks with freezing but - * it's it bit tricky to properly instrument it. Getting a freeze protection - * works as getting a read lock but there are subtle problems. XFS for example - * gets freeze protection on internal level twice in some cases, which is OK - * only because we already hold a freeze protection also on higher level. Due - * to these cases we have to tell lockdep we are doing trylock when we - * already hold a freeze protection for a higher freeze level. - */ -static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock, +static int do_sb_start_write(struct super_block *sb, int level, bool wait, unsigned long ip) { - int i; - - if (!trylock) { - for (i = 0; i < level - 1; i++) - if (lock_is_held(&sb->s_writers.lock_map[i])) { - trylock = true; - break; - } - } - rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip); -} -#endif - -/* - * This is an internal function, please use sb_start_{write,pagefault,intwrite} - * instead. - */ -int __sb_start_write(struct super_block *sb, int level, bool wait) -{ + if (wait) + rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 0, ip); retry: if (unlikely(sb->s_writers.frozen >= level)) { if (!wait) @@ -1198,9 +1171,6 @@ int __sb_start_write(struct super_block *sb, int level, bool wait) sb->s_writers.frozen < level); } -#ifdef CONFIG_LOCKDEP - acquire_freeze_lock(sb, level, !wait, _RET_IP_); -#endif percpu_counter_inc(&sb->s_writers.counter[level-1]); /* * Make sure counter is updated before we check for frozen. @@ -1211,8 +1181,45 @@ int __sb_start_write(struct super_block *sb, int level, bool wait) __sb_end_write(sb, level); goto retry; } + + if (!wait) + rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 1, ip); return 1; } + +/* + * This is an internal function, please use sb_start_{write,pagefault,intwrite} + * instead. + */ +int __sb_start_write(struct super_block *sb, int level, bool wait) +{ + bool force_trylock = false; + int ret; + +#ifdef CONFIG_LOCKDEP + /* + * We want lockdep to tell us about possible deadlocks with freezing + * but it's it bit tricky to properly instrument it. Getting a freeze + * protection works as getting a read lock but there are subtle + * problems. XFS for example gets freeze protection on internal level + * twice in some cases, which is OK only because we already hold a + * freeze protection also on higher level. Due to these cases we have + * to use wait == F (trylock mode) which must not fail. + */ + if (wait) { + int i; + + for (i = 0; i < level - 1; i++) + if (lock_is_held(&sb->s_writers.lock_map[i])) { + force_trylock = true; + break; + } + } +#endif + ret = do_sb_start_write(sb, level, wait && !force_trylock, _RET_IP_); + WARN_ON(force_trylock & !ret); + return ret; +} EXPORT_SYMBOL(__sb_start_write); /** From 0e28e01f1e73015d8e1b8fa1cda071d0bd9a2600 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 11 Aug 2015 16:28:29 +0200 Subject: [PATCH 3/8] document rwsem_release() in sb_wait_write() Not only we need to avoid the warning from lockdep_sys_exit(), the caller of freeze_super() can never release this lock. Another thread can do this, so there is another reason for rwsem_release(). Plus the comment should explain why we have to fool lockdep. Signed-off-by: Oleg Nesterov Reviewed-by: Jan Kara --- fs/super.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/super.c b/fs/super.c index 24a76bcd62a59..8aa3cbc571d18 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1236,11 +1236,17 @@ static void sb_wait_write(struct super_block *sb, int level) { s64 writers; + rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_); /* - * We just cycle-through lockdep here so that it does not complain - * about returning with lock to userspace + * We are going to return to userspace and forget about this lock, the + * ownership goes to the caller of thaw_super() which does unlock. + * + * FIXME: we should do this before return from freeze_super() after we + * called sync_filesystem(sb) and s_op->freeze_fs(sb), and thaw_super() + * should re-acquire these locks before s_op->unfreeze_fs(sb). However + * this leads to lockdep false-positives, so currently we do the early + * release right after acquire. */ - rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_); rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_); do { From 9287f6925ad9d8fb8c6283066b4f77fd87f123a9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 17:45:57 +0200 Subject: [PATCH 4/8] percpu-rwsem: introduce percpu_down_read_trylock() Add percpu_down_read_trylock(), it will have the user soon. Signed-off-by: Oleg Nesterov --- include/linux/percpu-rwsem.h | 1 + kernel/locking/percpu-rwsem.c | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 3e88c9a7d57f7..16c30cd335013 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -16,6 +16,7 @@ struct percpu_rw_semaphore { }; extern void percpu_down_read(struct percpu_rw_semaphore *); +extern int percpu_down_read_trylock(struct percpu_rw_semaphore *); extern void percpu_up_read(struct percpu_rw_semaphore *); extern void percpu_down_write(struct percpu_rw_semaphore *); diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 652a8ee8efe95..f325672548673 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -88,6 +88,19 @@ void percpu_down_read(struct percpu_rw_semaphore *brw) __up_read(&brw->rw_sem); } +int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) +{ + if (unlikely(!update_fast_ctr(brw, +1))) { + if (!__down_read_trylock(&brw->rw_sem)) + return 0; + atomic_inc(&brw->slow_read_ctr); + __up_read(&brw->rw_sem); + } + + rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_); + return 1; +} + void percpu_up_read(struct percpu_rw_semaphore *brw) { rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); From 55cc156505f2e43fa45dbd4bfe8f9c9d848ca44c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 20:26:44 +0200 Subject: [PATCH 5/8] percpu-rwsem: introduce percpu_rwsem_release() and percpu_rwsem_acquire() Add percpu_rwsem_release() and percpu_rwsem_acquire() for the users which need to return to userspace with percpu-rwsem lock held and/or pass the ownership to another thread. TODO: change percpu_rwsem_release() to use rwsem_clear_owner(). We can either fold kernel/locking/rwsem.h into include/linux/rwsem.h, or add the non-inline percpu_rwsem_clear_owner(). Signed-off-by: Oleg Nesterov --- include/linux/percpu-rwsem.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 16c30cd335013..834c4e52cb2d1 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -32,4 +32,23 @@ extern void percpu_free_rwsem(struct percpu_rw_semaphore *); __percpu_init_rwsem(brw, #brw, &rwsem_key); \ }) + +#define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem) + +static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem, + bool read, unsigned long ip) +{ + lock_release(&sem->rw_sem.dep_map, 1, ip); +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER + if (!read) + sem->rw_sem.owner = NULL; +#endif +} + +static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem, + bool read, unsigned long ip) +{ + lock_acquire(&sem->rw_sem.dep_map, 0, 1, read, 1, NULL, ip); +} + #endif From bf3eac84c42da7017610abc8cfba64921ea92c76 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 11 Aug 2015 17:26:29 +0200 Subject: [PATCH 6/8] percpu-rwsem: kill CONFIG_PERCPU_RWSEM Remove CONFIG_PERCPU_RWSEM, the next patch adds the unconditional user of percpu_rw_semaphore. Signed-off-by: Oleg Nesterov --- arch/Kconfig | 1 - init/Kconfig | 1 - kernel/locking/Makefile | 3 +-- lib/Kconfig | 3 --- 4 files changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 8a8ea7110de84..8f3638674e05a 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -87,7 +87,6 @@ config KPROBES_ON_FTRACE config UPROBES def_bool n - select PERCPU_RWSEM help Uprobes is the user-space counterpart to kprobes: they enable instrumentation applications (such as 'perf probe') diff --git a/init/Kconfig b/init/Kconfig index af09b4fb43d29..288c0122c2a53 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -925,7 +925,6 @@ config NUMA_BALANCING_DEFAULT_ENABLED menuconfig CGROUPS bool "Control Group support" select KERNFS - select PERCPU_RWSEM help This option adds support for grouping sets of processes together, for use with process control subsystems such as Cpusets, CFS, memory diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 7dd5c9918e4c2..4c6a97e1a8493 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o rwsem.o +obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) @@ -25,6 +25,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o -obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o diff --git a/lib/Kconfig b/lib/Kconfig index 3a2ef67db6c72..f6aa03dc15760 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -53,9 +53,6 @@ config GENERIC_IO config STMP_DEVICE bool -config PERCPU_RWSEM - bool - config ARCH_USE_CMPXCHG_LOCKREF bool From 853b39a7c82826b8413048feec7bf08e98ce7a84 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 22 Jul 2015 20:21:13 +0200 Subject: [PATCH 7/8] shift percpu_counter_destroy() into destroy_super_work() Of course, this patch is ugly as hell. It will be (partially) reverted later. We add it to ensure that other WIP changes in percpu_rw_semaphore won't break fs/super.c. We do not even need this change right now, percpu_free_rwsem() is fine in atomic context. But we are going to change this, it will be might_sleep() after we merge the rcu_sync() patches. And even after that we do not really need destroy_super_work(), we will kill it in any case. Instead, destroy_super_rcu() should just check that rss->cb_state == CB_IDLE and do call_rcu() again in the (very unlikely) case this is not true. So this is just the temporary kludge which helps us to avoid the conflicts with the changes which will be (hopefully) routed via rcu tree. Signed-off-by: Oleg Nesterov Reviewed-by: Jan Kara --- fs/super.c | 23 +++++++++++++++++++---- include/linux/fs.h | 3 ++- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/fs/super.c b/fs/super.c index 8aa3cbc571d18..c937bd7b4d33b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -135,6 +135,24 @@ static unsigned long super_cache_count(struct shrinker *shrink, return total_objects; } +static void destroy_super_work(struct work_struct *work) +{ + struct super_block *s = container_of(work, struct super_block, + destroy_work); + int i; + + for (i = 0; i < SB_FREEZE_LEVELS; i++) + percpu_counter_destroy(&s->s_writers.counter[i]); + kfree(s); +} + +static void destroy_super_rcu(struct rcu_head *head) +{ + struct super_block *s = container_of(head, struct super_block, rcu); + INIT_WORK(&s->destroy_work, destroy_super_work); + schedule_work(&s->destroy_work); +} + /** * destroy_super - frees a superblock * @s: superblock to free @@ -143,16 +161,13 @@ static unsigned long super_cache_count(struct shrinker *shrink, */ static void destroy_super(struct super_block *s) { - int i; list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); - for (i = 0; i < SB_FREEZE_LEVELS; i++) - percpu_counter_destroy(&s->s_writers.counter[i]); security_sb_free(s); WARN_ON(!list_empty(&s->s_mounts)); kfree(s->s_subtype); kfree(s->s_options); - kfree_rcu(s, rcu); + call_rcu(&s->rcu, destroy_super_rcu); } /** diff --git a/include/linux/fs.h b/include/linux/fs.h index acb7cad84edd2..4bed78966c6bb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -1375,7 +1376,7 @@ struct super_block { struct list_lru s_dentry_lru ____cacheline_aligned_in_smp; struct list_lru s_inode_lru ____cacheline_aligned_in_smp; struct rcu_head rcu; - + struct work_struct destroy_work; /* * Indicates how deep in a filesystem stack this SB is */ From 8129ed29644bf56ed17ec1bbbeed5c568b43d6a0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 11 Aug 2015 17:05:04 +0200 Subject: [PATCH 8/8] change sb_writers to use percpu_rw_semaphore We can remove everything from struct sb_writers except frozen and add the array of percpu_rw_semaphore's instead. This patch doesn't remove sb_writers->wait_unfrozen yet, we keep it for get_super_thawed(). We will probably remove it later. This change tries to address the following problems: - Firstly, __sb_start_write() looks simply buggy. It does __sb_end_write() if it sees ->frozen, but if it migrates to another CPU before percpu_counter_dec(), sb_wait_write() can wrongly succeed if there is another task which holds the same "semaphore": sb_wait_write() can miss the result of the previous percpu_counter_inc() but see the result of this percpu_counter_dec(). - As Dave Hansen reports, it is suboptimal. The trivial microbenchmark that writes to a tmpfs file in a loop runs 12% faster if we change this code to rely on RCU and kill the memory barriers. - This code doesn't look simple. It would be better to rely on the generic locking code. According to Dave, this change adds the same performance improvement. Note: with this change both freeze_super() and thaw_super() will do synchronize_sched_expedited() 3 times. This is just ugly. But: - This will be "fixed" by the rcu_sync changes we are going to merge. After that freeze_super()->percpu_down_write() will use synchronize_sched(), and thaw_super() won't use synchronize() at all. This doesn't need any changes in fs/super.c. - Once we merge rcu_sync changes, we can also change super.c so that all wb_write->rw_sem's will share the single ->rss in struct sb_writes, then freeze_super() will need only one synchronize_sched(). Signed-off-by: Oleg Nesterov Reviewed-by: Jan Kara --- fs/super.c | 111 ++++++++++++--------------------------------- include/linux/fs.h | 19 +++----- 2 files changed, 36 insertions(+), 94 deletions(-) diff --git a/fs/super.c b/fs/super.c index c937bd7b4d33b..767b1e10f6ad3 100644 --- a/fs/super.c +++ b/fs/super.c @@ -142,7 +142,7 @@ static void destroy_super_work(struct work_struct *work) int i; for (i = 0; i < SB_FREEZE_LEVELS; i++) - percpu_counter_destroy(&s->s_writers.counter[i]); + percpu_free_rwsem(&s->s_writers.rw_sem[i]); kfree(s); } @@ -193,13 +193,11 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) goto fail; for (i = 0; i < SB_FREEZE_LEVELS; i++) { - if (percpu_counter_init(&s->s_writers.counter[i], 0, - GFP_KERNEL) < 0) + if (__percpu_init_rwsem(&s->s_writers.rw_sem[i], + sb_writers_name[i], + &type->s_writers_key[i])) goto fail; - lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], - &type->s_writers_key[i], 0); } - init_waitqueue_head(&s->s_writers.wait); init_waitqueue_head(&s->s_writers.wait_unfrozen); s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; @@ -1161,47 +1159,10 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data) */ void __sb_end_write(struct super_block *sb, int level) { - percpu_counter_dec(&sb->s_writers.counter[level-1]); - /* - * Make sure s_writers are updated before we wake up waiters in - * freeze_super(). - */ - smp_mb(); - if (waitqueue_active(&sb->s_writers.wait)) - wake_up(&sb->s_writers.wait); - rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_); + percpu_up_read(sb->s_writers.rw_sem + level-1); } EXPORT_SYMBOL(__sb_end_write); -static int do_sb_start_write(struct super_block *sb, int level, bool wait, - unsigned long ip) -{ - if (wait) - rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 0, ip); -retry: - if (unlikely(sb->s_writers.frozen >= level)) { - if (!wait) - return 0; - wait_event(sb->s_writers.wait_unfrozen, - sb->s_writers.frozen < level); - } - - percpu_counter_inc(&sb->s_writers.counter[level-1]); - /* - * Make sure counter is updated before we check for frozen. - * freeze_super() first sets frozen and then checks the counter. - */ - smp_mb(); - if (unlikely(sb->s_writers.frozen >= level)) { - __sb_end_write(sb, level); - goto retry; - } - - if (!wait) - rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 1, ip); - return 1; -} - /* * This is an internal function, please use sb_start_{write,pagefault,intwrite} * instead. @@ -1209,7 +1170,7 @@ static int do_sb_start_write(struct super_block *sb, int level, bool wait, int __sb_start_write(struct super_block *sb, int level, bool wait) { bool force_trylock = false; - int ret; + int ret = 1; #ifdef CONFIG_LOCKDEP /* @@ -1225,13 +1186,17 @@ int __sb_start_write(struct super_block *sb, int level, bool wait) int i; for (i = 0; i < level - 1; i++) - if (lock_is_held(&sb->s_writers.lock_map[i])) { + if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) { force_trylock = true; break; } } #endif - ret = do_sb_start_write(sb, level, wait && !force_trylock, _RET_IP_); + if (wait && !force_trylock) + percpu_down_read(sb->s_writers.rw_sem + level-1); + else + ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1); + WARN_ON(force_trylock & !ret); return ret; } @@ -1243,15 +1208,11 @@ EXPORT_SYMBOL(__sb_start_write); * @level: type of writers we wait for (normal vs page fault) * * This function waits until there are no writers of given type to given file - * system. Caller of this function should make sure there can be no new writers - * of type @level before calling this function. Otherwise this function can - * livelock. + * system. */ static void sb_wait_write(struct super_block *sb, int level) { - s64 writers; - - rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_); + percpu_down_write(sb->s_writers.rw_sem + level-1); /* * We are going to return to userspace and forget about this lock, the * ownership goes to the caller of thaw_super() which does unlock. @@ -1262,24 +1223,18 @@ static void sb_wait_write(struct super_block *sb, int level) * this leads to lockdep false-positives, so currently we do the early * release right after acquire. */ - rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_); - - do { - DEFINE_WAIT(wait); + percpu_rwsem_release(sb->s_writers.rw_sem + level-1, 0, _THIS_IP_); +} - /* - * We use a barrier in prepare_to_wait() to separate setting - * of frozen and checking of the counter - */ - prepare_to_wait(&sb->s_writers.wait, &wait, - TASK_UNINTERRUPTIBLE); +static void sb_freeze_unlock(struct super_block *sb) +{ + int level; - writers = percpu_counter_sum(&sb->s_writers.counter[level-1]); - if (writers) - schedule(); + for (level = 0; level < SB_FREEZE_LEVELS; ++level) + percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); - finish_wait(&sb->s_writers.wait, &wait); - } while (writers); + for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) + percpu_up_write(sb->s_writers.rw_sem + level); } /** @@ -1338,20 +1293,14 @@ int freeze_super(struct super_block *sb) return 0; } - /* From now on, no new normal writers can start */ sb->s_writers.frozen = SB_FREEZE_WRITE; - smp_wmb(); - /* Release s_umount to preserve sb_start_write -> s_umount ordering */ up_write(&sb->s_umount); - sb_wait_write(sb, SB_FREEZE_WRITE); + down_write(&sb->s_umount); /* Now we go and block page faults... */ - down_write(&sb->s_umount); sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; - smp_wmb(); - sb_wait_write(sb, SB_FREEZE_PAGEFAULT); /* All writers are done so after syncing there won't be dirty data */ @@ -1359,7 +1308,6 @@ int freeze_super(struct super_block *sb) /* Now wait for internal filesystem counter */ sb->s_writers.frozen = SB_FREEZE_FS; - smp_wmb(); sb_wait_write(sb, SB_FREEZE_FS); if (sb->s_op->freeze_fs) { @@ -1368,7 +1316,7 @@ int freeze_super(struct super_block *sb) printk(KERN_ERR "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; - smp_wmb(); + sb_freeze_unlock(sb); wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return ret; @@ -1400,8 +1348,10 @@ int thaw_super(struct super_block *sb) return -EINVAL; } - if (sb->s_flags & MS_RDONLY) + if (sb->s_flags & MS_RDONLY) { + sb->s_writers.frozen = SB_UNFROZEN; goto out; + } if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); @@ -1413,12 +1363,11 @@ int thaw_super(struct super_block *sb) } } -out: sb->s_writers.frozen = SB_UNFROZEN; - smp_wmb(); + sb_freeze_unlock(sb); +out: wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); - return 0; } EXPORT_SYMBOL(thaw_super); diff --git a/include/linux/fs.h b/include/linux/fs.h index 4bed78966c6bb..ce356f66cc2a4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1,7 +1,6 @@ #ifndef _LINUX_FS_H #define _LINUX_FS_H - #include #include #include @@ -31,6 +30,7 @@ #include #include #include +#include #include #include @@ -1275,16 +1275,9 @@ enum { #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) struct sb_writers { - /* Counters for counting writers at each level */ - struct percpu_counter counter[SB_FREEZE_LEVELS]; - wait_queue_head_t wait; /* queue for waiting for - writers / faults to finish */ - int frozen; /* Is sb frozen? */ - wait_queue_head_t wait_unfrozen; /* queue for waiting for - sb to be thawed */ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map lock_map[SB_FREEZE_LEVELS]; -#endif + int frozen; /* Is sb frozen? */ + wait_queue_head_t wait_unfrozen; /* for get_super_thawed() */ + struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; struct super_block { @@ -1393,9 +1386,9 @@ void __sb_end_write(struct super_block *sb, int level); int __sb_start_write(struct super_block *sb, int level, bool wait); #define __sb_writers_acquired(sb, lev) \ - rwsem_acquire_read(&(sb)->s_writers.lock_map[(lev)-1], 0, 1, _THIS_IP_) + percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) #define __sb_writers_release(sb, lev) \ - rwsem_release(&(sb)->s_writers.lock_map[(lev)-1], 1, _THIS_IP_) + percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) /** * sb_end_write - drop write access to a superblock