Skip to content

Commit

Permalink
writeback: split inode_wb_list_lock into bdi_writeback.list_lock
Browse files Browse the repository at this point in the history
Split the global inode_wb_list_lock into a per-bdi_writeback list_lock,
as it's currently the most contended lock in the system for metadata
heavy workloads.  It won't help for single-filesystem workloads for
which we'll need the I/O-less balance_dirty_pages, but at least we
can dedicate a cpu to spinning on each bdi now for larger systems.

Based on earlier patches from Nick Piggin and Dave Chinner.

It reduces lock contentions to 1/4 in this test case:
10 HDD JBOD, 100 dd on each disk, XFS, 6GB ram

lock_stat version 0.3
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
                              class name    con-bounces    contentions   waittime-min   waittime-max waittime-total    acq-bounces   acquisitions   holdtime-min   holdtime-max holdtime-total
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
vanilla 2.6.39-rc3:
                      inode_wb_list_lock:         42590          44433           0.12         147.74      144127.35         252274         886792           0.08         121.34      917211.23
                      ------------------
                      inode_wb_list_lock              2          [<ffffffff81165da5>] bdev_inode_switch_bdi+0x29/0x85
                      inode_wb_list_lock             34          [<ffffffff8115bd0b>] inode_wb_list_del+0x22/0x49
                      inode_wb_list_lock          12893          [<ffffffff8115bb53>] __mark_inode_dirty+0x170/0x1d0
                      inode_wb_list_lock          10702          [<ffffffff8115afef>] writeback_single_inode+0x16d/0x20a
                      ------------------
                      inode_wb_list_lock              2          [<ffffffff81165da5>] bdev_inode_switch_bdi+0x29/0x85
                      inode_wb_list_lock             19          [<ffffffff8115bd0b>] inode_wb_list_del+0x22/0x49
                      inode_wb_list_lock           5550          [<ffffffff8115bb53>] __mark_inode_dirty+0x170/0x1d0
                      inode_wb_list_lock           8511          [<ffffffff8115b4ad>] writeback_sb_inodes+0x10f/0x157

2.6.39-rc3 + patch:
                &(&wb->list_lock)->rlock:         11383          11657           0.14         151.69       40429.51          90825         527918           0.11         145.90      556843.37
                ------------------------
                &(&wb->list_lock)->rlock             10          [<ffffffff8115b189>] inode_wb_list_del+0x5f/0x86
                &(&wb->list_lock)->rlock           1493          [<ffffffff8115b1ed>] writeback_inodes_wb+0x3d/0x150
                &(&wb->list_lock)->rlock           3652          [<ffffffff8115a8e9>] writeback_sb_inodes+0x123/0x16f
                &(&wb->list_lock)->rlock           1412          [<ffffffff8115a38e>] writeback_single_inode+0x17f/0x223
                ------------------------
                &(&wb->list_lock)->rlock              3          [<ffffffff8110b5af>] bdi_lock_two+0x46/0x4b
                &(&wb->list_lock)->rlock              6          [<ffffffff8115b189>] inode_wb_list_del+0x5f/0x86
                &(&wb->list_lock)->rlock           2061          [<ffffffff8115af97>] __mark_inode_dirty+0x173/0x1cf
                &(&wb->list_lock)->rlock           2629          [<ffffffff8115a8e9>] writeback_sb_inodes+0x123/0x16f

hughd@google.com: fix recursive lock when bdi_lock_two() is called with the new bdi the same as the old
akpm@linux-foundation.org: cleanup bdev_inode_switch_bdi() comment

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
  • Loading branch information
Christoph Hellwig authored and Wu Fengguang committed Jun 8, 2011
1 parent 424b351 commit f758eea
Show file tree
Hide file tree
Showing 8 changed files with 85 additions and 68 deletions.
16 changes: 10 additions & 6 deletions fs/block_dev.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,24 +44,28 @@ inline struct block_device *I_BDEV(struct inode *inode)
{
return &BDEV_I(inode)->bdev;
}

EXPORT_SYMBOL(I_BDEV);

/*
* move the inode from it's current bdi to the a new bdi. if the inode is dirty
* we need to move it onto the dirty list of @dst so that the inode is always
* on the right list.
* Move the inode from its current bdi to a new bdi. If the inode is dirty we
* need to move it onto the dirty list of @dst so that the inode is always on
* the right list.
*/
static void bdev_inode_switch_bdi(struct inode *inode,
struct backing_dev_info *dst)
{
spin_lock(&inode_wb_list_lock);
struct backing_dev_info *old = inode->i_data.backing_dev_info;

if (unlikely(dst == old)) /* deadlock avoidance */
return;
bdi_lock_two(&old->wb, &dst->wb);
spin_lock(&inode->i_lock);
inode->i_data.backing_dev_info = dst;
if (inode->i_state & I_DIRTY)
list_move(&inode->i_wb_list, &dst->wb.b_dirty);
spin_unlock(&inode->i_lock);
spin_unlock(&inode_wb_list_lock);
spin_unlock(&old->wb.list_lock);
spin_unlock(&dst->wb.list_lock);
}

static sector_t max_block(struct block_device *bdev)
Expand Down
97 changes: 49 additions & 48 deletions fs/fs-writeback.c
Original file line number Diff line number Diff line change
Expand Up @@ -181,12 +181,13 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
*/
void inode_wb_list_del(struct inode *inode)
{
spin_lock(&inode_wb_list_lock);
struct backing_dev_info *bdi = inode_to_bdi(inode);

spin_lock(&bdi->wb.list_lock);
list_del_init(&inode->i_wb_list);
spin_unlock(&inode_wb_list_lock);
spin_unlock(&bdi->wb.list_lock);
}


/*
* Redirty an inode: set its when-it-was dirtied timestamp and move it to the
* furthest end of its superblock's dirty-inode list.
Expand All @@ -196,11 +197,9 @@ void inode_wb_list_del(struct inode *inode)
* the case then the inode must have been redirtied while it was being written
* out and we don't reset its dirtied_when.
*/
static void redirty_tail(struct inode *inode)
static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
{
struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

assert_spin_locked(&inode_wb_list_lock);
assert_spin_locked(&wb->list_lock);
if (!list_empty(&wb->b_dirty)) {
struct inode *tail;

Expand All @@ -214,19 +213,17 @@ static void redirty_tail(struct inode *inode)
/*
* requeue inode for re-scanning after bdi->b_io list is exhausted.
*/
static void requeue_io(struct inode *inode)
static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

assert_spin_locked(&inode_wb_list_lock);
assert_spin_locked(&wb->list_lock);
list_move(&inode->i_wb_list, &wb->b_more_io);
}

static void inode_sync_complete(struct inode *inode)
{
/*
* Prevent speculative execution through
* spin_unlock(&inode_wb_list_lock);
* spin_unlock(&wb->list_lock);
*/

smp_mb();
Expand Down Expand Up @@ -302,7 +299,7 @@ static void move_expired_inodes(struct list_head *delaying_queue,
*/
static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
{
assert_spin_locked(&inode_wb_list_lock);
assert_spin_locked(&wb->list_lock);
list_splice_init(&wb->b_more_io, &wb->b_io);
move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
}
Expand All @@ -317,23 +314,24 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc)
/*
* Wait for writeback on an inode to complete.
*/
static void inode_wait_for_writeback(struct inode *inode)
static void inode_wait_for_writeback(struct inode *inode,
struct bdi_writeback *wb)
{
DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
wait_queue_head_t *wqh;

wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
while (inode->i_state & I_SYNC) {
spin_unlock(&inode->i_lock);
spin_unlock(&inode_wb_list_lock);
spin_unlock(&wb->list_lock);
__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
spin_lock(&inode_wb_list_lock);
spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
}
}

/*
* Write out an inode's dirty pages. Called under inode_wb_list_lock and
* Write out an inode's dirty pages. Called under wb->list_lock and
* inode->i_lock. Either the caller has an active reference on the inode or
* the inode has I_WILL_FREE set.
*
Expand All @@ -344,13 +342,14 @@ static void inode_wait_for_writeback(struct inode *inode)
* livelocks, etc.
*/
static int
writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
struct writeback_control *wbc)
{
struct address_space *mapping = inode->i_mapping;
unsigned dirty;
int ret;

assert_spin_locked(&inode_wb_list_lock);
assert_spin_locked(&wb->list_lock);
assert_spin_locked(&inode->i_lock);

if (!atomic_read(&inode->i_count))
Expand All @@ -368,14 +367,14 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* completed a full scan of b_io.
*/
if (wbc->sync_mode != WB_SYNC_ALL) {
requeue_io(inode);
requeue_io(inode, wb);
return 0;
}

/*
* It's a data-integrity sync. We must wait.
*/
inode_wait_for_writeback(inode);
inode_wait_for_writeback(inode, wb);
}

BUG_ON(inode->i_state & I_SYNC);
Expand All @@ -384,7 +383,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
inode->i_state |= I_SYNC;
inode->i_state &= ~I_DIRTY_PAGES;
spin_unlock(&inode->i_lock);
spin_unlock(&inode_wb_list_lock);
spin_unlock(&wb->list_lock);

ret = do_writepages(mapping, wbc);

Expand Down Expand Up @@ -415,7 +414,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
ret = err;
}

spin_lock(&inode_wb_list_lock);
spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
inode->i_state &= ~I_SYNC;
if (!(inode->i_state & I_FREEING)) {
Expand All @@ -438,7 +437,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
/*
* slice used up: queue for next turn
*/
requeue_io(inode);
requeue_io(inode, wb);
} else {
/*
* Writeback blocked by something other than
Expand All @@ -447,7 +446,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* retrying writeback of the dirty page/inode
* that cannot be performed immediately.
*/
redirty_tail(inode);
redirty_tail(inode, wb);
}
} else if (inode->i_state & I_DIRTY) {
/*
Expand All @@ -456,7 +455,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* submission or metadata updates after data IO
* completion.
*/
redirty_tail(inode);
redirty_tail(inode, wb);
} else {
/*
* The inode is clean. At this point we either have
Expand Down Expand Up @@ -521,7 +520,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
* superblock, move all inodes not belonging
* to it back onto the dirty list.
*/
redirty_tail(inode);
redirty_tail(inode, wb);
continue;
}

Expand All @@ -541,7 +540,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
spin_lock(&inode->i_lock);
if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
spin_unlock(&inode->i_lock);
requeue_io(inode);
requeue_io(inode, wb);
continue;
}

Expand All @@ -557,19 +556,19 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
__iget(inode);

pages_skipped = wbc->pages_skipped;
writeback_single_inode(inode, wbc);
writeback_single_inode(inode, wb, wbc);
if (wbc->pages_skipped != pages_skipped) {
/*
* writeback is not making progress due to locked
* buffers. Skip this inode for now.
*/
redirty_tail(inode);
redirty_tail(inode, wb);
}
spin_unlock(&inode->i_lock);
spin_unlock(&inode_wb_list_lock);
spin_unlock(&wb->list_lock);
iput(inode);
cond_resched();
spin_lock(&inode_wb_list_lock);
spin_lock(&wb->list_lock);
if (wbc->nr_to_write <= 0) {
wbc->more_io = 1;
return 1;
Expand All @@ -588,7 +587,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,

if (!wbc->wb_start)
wbc->wb_start = jiffies; /* livelock avoidance */
spin_lock(&inode_wb_list_lock);
spin_lock(&wb->list_lock);

if (list_empty(&wb->b_io))
queue_io(wb, wbc->older_than_this);
Expand All @@ -598,7 +597,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
struct super_block *sb = inode->i_sb;

if (!pin_sb_for_writeback(sb)) {
requeue_io(inode);
requeue_io(inode, wb);
continue;
}
ret = writeback_sb_inodes(sb, wb, wbc, false);
Expand All @@ -607,7 +606,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
if (ret)
break;
}
spin_unlock(&inode_wb_list_lock);
spin_unlock(&wb->list_lock);
/* Leave any unwritten inodes on b_io */
}

Expand All @@ -616,11 +615,11 @@ static void __writeback_inodes_sb(struct super_block *sb,
{
WARN_ON(!rwsem_is_locked(&sb->s_umount));

spin_lock(&inode_wb_list_lock);
spin_lock(&wb->list_lock);
if (list_empty(&wb->b_io))
queue_io(wb, wbc->older_than_this);
writeback_sb_inodes(sb, wb, wbc, true);
spin_unlock(&inode_wb_list_lock);
spin_unlock(&wb->list_lock);
}

/*
Expand Down Expand Up @@ -762,15 +761,15 @@ static long wb_writeback(struct bdi_writeback *wb,
* become available for writeback. Otherwise
* we'll just busyloop.
*/
spin_lock(&inode_wb_list_lock);
spin_lock(&wb->list_lock);
if (!list_empty(&wb->b_more_io)) {
inode = wb_inode(wb->b_more_io.prev);
trace_wbc_writeback_wait(&wbc, wb->bdi);
spin_lock(&inode->i_lock);
inode_wait_for_writeback(inode);
inode_wait_for_writeback(inode, wb);
spin_unlock(&inode->i_lock);
}
spin_unlock(&inode_wb_list_lock);
spin_unlock(&wb->list_lock);
}

return wrote;
Expand Down Expand Up @@ -1104,10 +1103,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
}

spin_unlock(&inode->i_lock);
spin_lock(&inode_wb_list_lock);
spin_lock(&bdi->wb.list_lock);
inode->dirtied_when = jiffies;
list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
spin_unlock(&inode_wb_list_lock);
spin_unlock(&bdi->wb.list_lock);

if (wakeup_bdi)
bdi_wakeup_thread_delayed(bdi);
Expand Down Expand Up @@ -1309,6 +1308,7 @@ EXPORT_SYMBOL(sync_inodes_sb);
*/
int write_inode_now(struct inode *inode, int sync)
{
struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
int ret;
struct writeback_control wbc = {
.nr_to_write = LONG_MAX,
Expand All @@ -1321,11 +1321,11 @@ int write_inode_now(struct inode *inode, int sync)
wbc.nr_to_write = 0;

might_sleep();
spin_lock(&inode_wb_list_lock);
spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
ret = writeback_single_inode(inode, &wbc);
ret = writeback_single_inode(inode, wb, &wbc);
spin_unlock(&inode->i_lock);
spin_unlock(&inode_wb_list_lock);
spin_unlock(&wb->list_lock);
if (sync)
inode_sync_wait(inode);
return ret;
Expand All @@ -1345,13 +1345,14 @@ EXPORT_SYMBOL(write_inode_now);
*/
int sync_inode(struct inode *inode, struct writeback_control *wbc)
{
struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
int ret;

spin_lock(&inode_wb_list_lock);
spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
ret = writeback_single_inode(inode, wbc);
ret = writeback_single_inode(inode, wb, wbc);
spin_unlock(&inode->i_lock);
spin_unlock(&inode_wb_list_lock);
spin_unlock(&wb->list_lock);
return ret;
}
EXPORT_SYMBOL(sync_inode);
Expand Down
5 changes: 2 additions & 3 deletions fs/inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
* inode_lru, inode->i_lru
* inode_sb_list_lock protects:
* sb->s_inodes, inode->i_sb_list
* inode_wb_list_lock protects:
* bdi->wb.list_lock protects:
* bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
* inode_hash_lock protects:
* inode_hashtable, inode->i_hash
Expand All @@ -48,7 +48,7 @@
* inode->i_lock
* inode_lru_lock
*
* inode_wb_list_lock
* bdi->wb.list_lock
* inode->i_lock
*
* inode_hash_lock
Expand All @@ -68,7 +68,6 @@ static LIST_HEAD(inode_lru);
static DEFINE_SPINLOCK(inode_lru_lock);

__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);

/*
* iprune_sem provides exclusion between the icache shrinking and the
Expand Down
Loading

0 comments on commit f758eea

Please sign in to comment.