Skip to content

Commit

Permalink
[PATCH] r/o bind mounts: track numbers of writers to mounts
Browse files Browse the repository at this point in the history
This is the real meat of the entire series.  It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can be
hundreds of cpus doing open()/close() on files on the same mnt at
the same time.  Even an atomic_t in the mnt has massive scalaing
problems because the cacheline gets so terribly contended.

This uses a statically-allocated percpu variable.  All want/drop
operations are local to a cpu as long that cpu operates on the same
mount, and there are no writer count imbalances.  Writer count
imbalances happen when a write is taken on one cpu, and released
on another, like when an open/close pair is performed on two

Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we determine
if there are any outstanding writers to the mount.

I've written a little benchmark to sit in a loop for a couple of
seconds in several cpus in parallel doing open/write/close loops.

http://sr71.net/~dave/linux/openbench.c

The code in here is a a worst-possible case for this patch.  It
does opens on a _pair_ of files in two different mounts in parallel.
This should cause my code to lose its "operate on the same mount"
optimization completely.  This worst-case scenario causes a 3%
degredation in the benchmark.

I could probably get rid of even this 3%, but it would be more
complex than what I have here, and I think this is getting into
acceptable territory.  In practice, I expect writing more than 3
bytes to a file, as well as disk I/O to mask any effects that this
has.

(To get rid of that 3%, we could have an #defined number of mounts
in the percpu variable.  So, instead of a CPU getting operate only
on percpu data when it accesses only one mount, it could stay on
percpu data when it only accesses N or fewer mounts.)

[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount

Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
  • Loading branch information
Dave Hansen authored and Al Viro committed Apr 19, 2008
1 parent 2c463e9 commit 3d73363
Show file tree
Hide file tree
Showing 2 changed files with 244 additions and 15 deletions.
252 changes: 237 additions & 15 deletions fs/namespace.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <linux/quotaops.h>
#include <linux/acct.h>
#include <linux/capability.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/sysfs.h>
#include <linux/seq_file.h>
Expand Down Expand Up @@ -55,6 +56,8 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
return tmp & (HASH_SIZE - 1);
}

#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)

struct vfsmount *alloc_vfsmnt(const char *name)
{
struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
Expand All @@ -68,6 +71,7 @@ struct vfsmount *alloc_vfsmnt(const char *name)
INIT_LIST_HEAD(&mnt->mnt_share);
INIT_LIST_HEAD(&mnt->mnt_slave_list);
INIT_LIST_HEAD(&mnt->mnt_slave);
atomic_set(&mnt->__mnt_writers, 0);
if (name) {
int size = strlen(name) + 1;
char *newname = kmalloc(size, GFP_KERNEL);
Expand All @@ -80,6 +84,92 @@ struct vfsmount *alloc_vfsmnt(const char *name)
return mnt;
}

/*
* Most r/o checks on a fs are for operations that take
* discrete amounts of time, like a write() or unlink().
* We must keep track of when those operations start
* (for permission checks) and when they end, so that
* we can determine when writes are able to occur to
* a filesystem.
*/
/*
* __mnt_is_readonly: check whether a mount is read-only
* @mnt: the mount to check for its write status
*
* This shouldn't be used directly ouside of the VFS.
* It does not guarantee that the filesystem will stay
* r/w, just that it is right *now*. This can not and
* should not be used in place of IS_RDONLY(inode).
* mnt_want/drop_write() will _keep_ the filesystem
* r/w.
*/
int __mnt_is_readonly(struct vfsmount *mnt)
{
return (mnt->mnt_sb->s_flags & MS_RDONLY);
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);

struct mnt_writer {
/*
* If holding multiple instances of this lock, they
* must be ordered by cpu number.
*/
spinlock_t lock;
struct lock_class_key lock_class; /* compiles out with !lockdep */
unsigned long count;
struct vfsmount *mnt;
} ____cacheline_aligned_in_smp;
static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);

static int __init init_mnt_writers(void)
{
int cpu;
for_each_possible_cpu(cpu) {
struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
spin_lock_init(&writer->lock);
lockdep_set_class(&writer->lock, &writer->lock_class);
writer->count = 0;
}
return 0;
}
fs_initcall(init_mnt_writers);

static void unlock_mnt_writers(void)
{
int cpu;
struct mnt_writer *cpu_writer;

for_each_possible_cpu(cpu) {
cpu_writer = &per_cpu(mnt_writers, cpu);
spin_unlock(&cpu_writer->lock);
}
}

static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
{
if (!cpu_writer->mnt)
return;
/*
* This is in case anyone ever leaves an invalid,
* old ->mnt and a count of 0.
*/
if (!cpu_writer->count)
return;
atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
cpu_writer->count = 0;
}
/*
* must hold cpu_writer->lock
*/
static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
struct vfsmount *mnt)
{
if (cpu_writer->mnt == mnt)
return;
__clear_mnt_count(cpu_writer);
cpu_writer->mnt = mnt;
}

/*
* Most r/o checks on a fs are for operations that take
* discrete amounts of time, like a write() or unlink().
Expand All @@ -100,12 +190,77 @@ struct vfsmount *alloc_vfsmnt(const char *name)
*/
int mnt_want_write(struct vfsmount *mnt)
{
if (__mnt_is_readonly(mnt))
return -EROFS;
return 0;
int ret = 0;
struct mnt_writer *cpu_writer;

cpu_writer = &get_cpu_var(mnt_writers);
spin_lock(&cpu_writer->lock);
if (__mnt_is_readonly(mnt)) {
ret = -EROFS;
goto out;
}
use_cpu_writer_for_mount(cpu_writer, mnt);
cpu_writer->count++;
out:
spin_unlock(&cpu_writer->lock);
put_cpu_var(mnt_writers);
return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);

static void lock_mnt_writers(void)
{
int cpu;
struct mnt_writer *cpu_writer;

for_each_possible_cpu(cpu) {
cpu_writer = &per_cpu(mnt_writers, cpu);
spin_lock(&cpu_writer->lock);
__clear_mnt_count(cpu_writer);
cpu_writer->mnt = NULL;
}
}

/*
* These per-cpu write counts are not guaranteed to have
* matched increments and decrements on any given cpu.
* A file open()ed for write on one cpu and close()d on
* another cpu will imbalance this count. Make sure it
* does not get too far out of whack.
*/
static void handle_write_count_underflow(struct vfsmount *mnt)
{
if (atomic_read(&mnt->__mnt_writers) >=
MNT_WRITER_UNDERFLOW_LIMIT)
return;
/*
* It isn't necessary to hold all of the locks
* at the same time, but doing it this way makes
* us share a lot more code.
*/
lock_mnt_writers();
/*
* vfsmount_lock is for mnt_flags.
*/
spin_lock(&vfsmount_lock);
/*
* If coalescing the per-cpu writer counts did not
* get us back to a positive writer count, we have
* a bug.
*/
if ((atomic_read(&mnt->__mnt_writers) < 0) &&
!(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
printk(KERN_DEBUG "leak detected on mount(%p) writers "
"count: %d\n",
mnt, atomic_read(&mnt->__mnt_writers));
WARN_ON(1);
/* use the flag to keep the dmesg spam down */
mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
}
spin_unlock(&vfsmount_lock);
unlock_mnt_writers();
}

/**
* mnt_drop_write - give up write access to a mount
* @mnt: the mount on which to give up write access
Expand All @@ -116,23 +271,61 @@ EXPORT_SYMBOL_GPL(mnt_want_write);
*/
void mnt_drop_write(struct vfsmount *mnt)
{
int must_check_underflow = 0;
struct mnt_writer *cpu_writer;

cpu_writer = &get_cpu_var(mnt_writers);
spin_lock(&cpu_writer->lock);

use_cpu_writer_for_mount(cpu_writer, mnt);
if (cpu_writer->count > 0) {
cpu_writer->count--;
} else {
must_check_underflow = 1;
atomic_dec(&mnt->__mnt_writers);
}

spin_unlock(&cpu_writer->lock);
/*
* Logically, we could call this each time,
* but the __mnt_writers cacheline tends to
* be cold, and makes this expensive.
*/
if (must_check_underflow)
handle_write_count_underflow(mnt);
/*
* This could be done right after the spinlock
* is taken because the spinlock keeps us on
* the cpu, and disables preemption. However,
* putting it here bounds the amount that
* __mnt_writers can underflow. Without it,
* we could theoretically wrap __mnt_writers.
*/
put_cpu_var(mnt_writers);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);

/*
* __mnt_is_readonly: check whether a mount is read-only
* @mnt: the mount to check for its write status
*
* This shouldn't be used directly ouside of the VFS.
* It does not guarantee that the filesystem will stay
* r/w, just that it is right *now*. This can not and
* should not be used in place of IS_RDONLY(inode).
*/
int __mnt_is_readonly(struct vfsmount *mnt)
int mnt_make_readonly(struct vfsmount *mnt)
{
return (mnt->mnt_sb->s_flags & MS_RDONLY);
int ret = 0;

lock_mnt_writers();
/*
* With all the locks held, this value is stable
*/
if (atomic_read(&mnt->__mnt_writers) > 0) {
ret = -EBUSY;
goto out;
}
/*
* actually set mount's r/o flag here to make
* __mnt_is_readonly() true, which keeps anyone
* from doing a successful mnt_want_write().
*/
out:
unlock_mnt_writers();
return ret;
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);

int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
{
Expand Down Expand Up @@ -325,7 +518,36 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,

static inline void __mntput(struct vfsmount *mnt)
{
int cpu;
struct super_block *sb = mnt->mnt_sb;
/*
* We don't have to hold all of the locks at the
* same time here because we know that we're the
* last reference to mnt and that no new writers
* can come in.
*/
for_each_possible_cpu(cpu) {
struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
if (cpu_writer->mnt != mnt)
continue;
spin_lock(&cpu_writer->lock);
atomic_add(cpu_writer->count, &mnt->__mnt_writers);
cpu_writer->count = 0;
/*
* Might as well do this so that no one
* ever sees the pointer and expects
* it to be valid.
*/
cpu_writer->mnt = NULL;
spin_unlock(&cpu_writer->lock);
}
/*
* This probably indicates that somebody messed
* up a mnt_want/drop_write() pair. If this
* happens, the filesystem was probably unable
* to make r/w->r/o transitions.
*/
WARN_ON(atomic_read(&mnt->__mnt_writers));
dput(mnt->mnt_root);
free_vfsmnt(mnt);
deactivate_super(sb);
Expand Down
7 changes: 7 additions & 0 deletions include/linux/mount.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

#include <linux/types.h>
#include <linux/list.h>
#include <linux/nodemask.h>
#include <linux/spinlock.h>
#include <asm/atomic.h>

Expand All @@ -30,6 +31,7 @@ struct mnt_namespace;
#define MNT_RELATIME 0x20

#define MNT_SHRINKABLE 0x100
#define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */

#define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */
#define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */
Expand Down Expand Up @@ -62,6 +64,11 @@ struct vfsmount {
int mnt_expiry_mark; /* true if marked for expiry */
int mnt_pinned;
int mnt_ghosts;
/*
* This value is not stable unless all of the mnt_writers[] spinlocks
* are held, and all mnt_writer[]s on this mount have 0 as their ->count
*/
atomic_t __mnt_writers;
};

static inline struct vfsmount *mntget(struct vfsmount *mnt)
Expand Down

0 comments on commit 3d73363

Please sign in to comment.