xfs: convert inode cache lookups to use RCU locking
With delayed logging greatly increasing the sustained parallelism of inode
operations, the inode cache locking is showing significant read vs write
contention when inode reclaim runs at the same time as lookups. There are
also far more write lock acquisitions than read locks (a 4:1 ratio), so the
read locking is not really buying us much in the way of parallelism.

To avoid the read vs write contention, change the cache to use RCU locking on
the read side. To avoid needing to RCU free every single inode, use the built-in
slab RCU freeing mechanism. This requires us to be able to detect lookups of
freed inodes, so ensure that every freed inode has an inode number of zero and
the XFS_IRECLAIM flag set. We already check the XFS_IRECLAIM flag in the cache
hit lookup path, but also add a check for a zero inode number as well.

We can then convert all the read-locking lookups to use RCU read-side locking
and hence remove the rwlock read-side locking entirely.
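
As background, the read-side pattern described above reduces to the following
minimal sketch. An object found under rcu_read_lock() may have been freed, or
freed and reallocated, within the current grace period, so its identity must be
rechecked under the same per-object lock the free path takes. All structure and
field names here (struct obj, obj_lookup, and so on) are illustrative, not XFS
identifiers:

/*
 * Minimal sketch of RCU lookup revalidation; illustrative names only.
 */
struct obj {
	spinlock_t	lock;	/* plays the role of ip->i_flags_lock */
	unsigned long	id;	/* zeroed at free, like ip->i_ino */
};

static struct obj *obj_lookup(struct radix_tree_root *tree, unsigned long id)
{
	struct obj *o;

	rcu_read_lock();
	o = radix_tree_lookup(tree, id);
	if (!o)
		goto out_miss;

	/*
	 * The memory is safe to dereference, but it may have been freed
	 * and reused for a different object since the tree lookup, so
	 * revalidate identity under the lock the free path also takes.
	 */
	spin_lock(&o->lock);
	if (o->id != id) {
		spin_unlock(&o->lock);
		goto out_miss;
	}
	/* take a reference here, while still holding o->lock */
	spin_unlock(&o->lock);
	rcu_read_unlock();
	return o;

out_miss:
	rcu_read_unlock();
	return NULL;
}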

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Dave Chinner authored and Dave Chinner committed Dec 17, 2010
1 parent d95b7aa commit 1a3e8f3
Showing 3 changed files with 141 additions and 42 deletions.
84 changes: 66 additions & 18 deletions fs/xfs/linux-2.6/xfs_sync.c
@@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab(
{
struct inode *inode = VFS_I(ip);

ASSERT(rcu_read_lock_held());

/*
* check for stale RCU freed inode
*
* If the inode has been reallocated, it doesn't matter if it's not in
* the AG we are walking - we are walking for writeback, so if it
* passes all the "valid inode" checks and is dirty, then we'll write
* it back anyway. If it has been reallocated and still being
* initialised, the XFS_INEW check below will catch it.
*/
spin_lock(&ip->i_flags_lock);
if (!ip->i_ino)
goto out_unlock_noent;

/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
goto out_unlock_noent;
spin_unlock(&ip->i_flags_lock);

/* nothing to sync during shutdown */
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return EFSCORRUPTED;

/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
return ENOENT;

/* If we can't grab the inode, it must on it's way to reclaim. */
if (!igrab(inode))
return ENOENT;
@@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab(

/* inode is valid */
return 0;

out_unlock_noent:
spin_unlock(&ip->i_flags_lock);
return ENOENT;
}
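
The new ASSERT(rcu_read_lock_held()) documents the locking contract this hunk
introduces: the grab must happen inside the RCU read-side critical section that
found the inode, so that igrab() pins it before the RCU protection is dropped.
A hypothetical single-inode caller, with the batching of xfs_inode_ag_walk()
elided, might look like:

	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
	error = ip ? xfs_inode_ag_walk_grab(ip) : ENOENT;
	rcu_read_unlock();

	if (!error) {
		/* igrab() succeeded inside the RCU section, so ip is
		 * pinned and cannot be RCU freed under us */
		IRELE(ip);	/* drop the reference when done */
	}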

STATIC int
@@ -98,12 +118,12 @@ xfs_inode_ag_walk(
int error = 0;
int i;

read_lock(&pag->pag_ici_lock);
rcu_read_lock();
nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
(void **)batch, first_index,
XFS_LOOKUP_BATCH);
if (!nr_found) {
read_unlock(&pag->pag_ici_lock);
rcu_read_unlock();
break;
}

@@ -118,18 +138,26 @@ xfs_inode_ag_walk(
batch[i] = NULL;

/*
* Update the index for the next lookup. Catch overflows
* into the next AG range which can occur if we have inodes
* in the last block of the AG and we are currently
* pointing to the last inode.
* Update the index for the next lookup. Catch
* overflows into the next AG range which can occur if
* we have inodes in the last block of the AG and we
* are currently pointing to the last inode.
*
* Because we may see inodes that are from the wrong AG
* due to RCU freeing and reallocation, only update the
* index if it lies in this AG. It was a race that lead
* us to see this inode, so another lookup from the
* same index will not find it again.
*/
if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
continue;
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
done = 1;
}

/* unlock now we've grabbed the inodes. */
read_unlock(&pag->pag_ici_lock);
rcu_read_unlock();
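
A small worked example of the overflow catch mentioned in the comment above:
xfs_agino_t is unsigned, so if the walk just visited the highest possible inode
index in the AG, the increment wraps to a smaller value and terminates the walk
instead of restarting it from index zero.

	xfs_agino_t	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);

	first_index = agino + 1;	/* e.g. 0xffffffff + 1 wraps to 0 */
	if (first_index < agino)	/* unsigned wrap: last inode seen */
		done = 1;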

for (i = 0; i < nr_found; i++) {
if (!batch[i])
@@ -639,9 +667,14 @@ xfs_reclaim_inode_grab(
struct xfs_inode *ip,
int flags)
{
ASSERT(rcu_read_lock_held());

/* quick check for stale RCU freed inode */
if (!ip->i_ino)
return 1;

/*
* do some unlocked checks first to avoid unnecceary lock traffic.
* do some unlocked checks first to avoid unnecessary lock traffic.
* The first is a flush lock check, the second is a already in reclaim
* check. Only do these checks if we are not going to block on locks.
*/
@@ -654,11 +687,16 @@ xfs_reclaim_inode_grab(
* The radix tree lock here protects a thread in xfs_iget from racing
* with us starting reclaim on the inode. Once we have the
* XFS_IRECLAIM flag set it will not touch us.
*
* Due to RCU lookup, we may find inodes that have been freed and only
* have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
* aren't candidates for reclaim at all, so we must check the
* XFS_IRECLAIMABLE is set first before proceeding to reclaim.
*/
spin_lock(&ip->i_flags_lock);
ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
/* ignore as it is already under reclaim */
if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
__xfs_iflags_test(ip, XFS_IRECLAIM)) {
/* not a reclaim candidate. */
spin_unlock(&ip->i_flags_lock);
return 1;
}
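
The reordered test above matters because, after this commit, a freed inode
carries only XFS_IRECLAIM, and a freed-then-reallocated inode may carry neither
flag. XFS_IRECLAIMABLE must therefore be required before XFS_IRECLAIM is
interpreted. Condensed into a hypothetical predicate (no such helper exists in
the commit):

	/* caller holds ip->i_flags_lock */
	static inline bool xfs_inode_is_reclaim_candidate(struct xfs_inode *ip)
	{
		return __xfs_iflags_test(ip, XFS_IRECLAIMABLE) &&
		       !__xfs_iflags_test(ip, XFS_IRECLAIM);
	}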
@@ -864,14 +902,14 @@ xfs_reclaim_inodes_ag(
struct xfs_inode *batch[XFS_LOOKUP_BATCH];
int i;

write_lock(&pag->pag_ici_lock);
rcu_read_lock();
nr_found = radix_tree_gang_lookup_tag(
&pag->pag_ici_root,
(void **)batch, first_index,
XFS_LOOKUP_BATCH,
XFS_ICI_RECLAIM_TAG);
if (!nr_found) {
write_unlock(&pag->pag_ici_lock);
rcu_read_unlock();
break;
}

@@ -891,14 +929,24 @@ xfs_reclaim_inodes_ag(
* occur if we have inodes in the last block of
* the AG and we are currently pointing to the
* last inode.
*
* Because we may see inodes that are from the
* wrong AG due to RCU freeing and
* reallocation, only update the index if it
* lies in this AG. It was a race that lead us
* to see this inode, so another lookup from
* the same index will not find it again.
*/
if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
pag->pag_agno)
continue;
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
done = 1;
}

/* unlock now we've grabbed the inodes. */
write_unlock(&pag->pag_ici_lock);
rcu_read_unlock();

for (i = 0; i < nr_found; i++) {
if (!batch[i])
47 changes: 35 additions & 12 deletions fs/xfs/xfs_iget.c
@@ -80,6 +80,7 @@ xfs_inode_alloc(
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock));
ASSERT(completion_done(&ip->i_flush));
ASSERT(ip->i_ino == 0);

mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
@@ -98,9 +99,6 @@ xfs_inode_alloc(
ip->i_size = 0;
ip->i_new_size = 0;

/* prevent anyone from using this yet */
VFS_I(ip)->i_state = I_NEW;

return ip;
}

@@ -159,6 +157,16 @@ xfs_inode_free(
ASSERT(!spin_is_locked(&ip->i_flags_lock));
ASSERT(completion_done(&ip->i_flush));

/*
* Because we use RCU freeing we need to ensure the inode always
* appears to be reclaimed with an invalid inode number when in the
* free state. The ip->i_flags_lock provides the barrier against lookup
* races.
*/
spin_lock(&ip->i_flags_lock);
ip->i_flags = XFS_IRECLAIM;
ip->i_ino = 0;
spin_unlock(&ip->i_flags_lock);
call_rcu((struct rcu_head *)&VFS_I(ip)->i_dentry, __xfs_inode_free);
}
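
xfs_inode_free() above overlays the RCU callback head on the VFS inode's
i_dentry storage to avoid growing the inode structure. The same free-side
pattern with a dedicated rcu_head, in a generic sketch (illustrative names,
not XFS code):

struct obj {
	struct rcu_head	rcu;
	spinlock_t	lock;	/* the lock lookups revalidate under */
	unsigned long	id;
	unsigned long	flags;
};
#define OBJ_DEAD	0x1	/* plays the role of XFS_IRECLAIM */

static struct kmem_cache *obj_cache;

static void obj_free_rcu(struct rcu_head *head)
{
	kmem_cache_free(obj_cache, container_of(head, struct obj, rcu));
}

static void obj_free(struct obj *o)
{
	/*
	 * Mark the object dead under the revalidation lock so that
	 * concurrent RCU lookups see id == 0, then defer the actual
	 * free until after a grace period has elapsed.
	 */
	spin_lock(&o->lock);
	o->flags = OBJ_DEAD;
	o->id = 0;
	spin_unlock(&o->lock);
	call_rcu(&o->rcu, obj_free_rcu);
}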

@@ -169,14 +177,29 @@ static int
xfs_iget_cache_hit(
struct xfs_perag *pag,
struct xfs_inode *ip,
xfs_ino_t ino,
int flags,
int lock_flags) __releases(pag->pag_ici_lock)
int lock_flags) __releases(RCU)
{
struct inode *inode = VFS_I(ip);
struct xfs_mount *mp = ip->i_mount;
int error;

/*
* check for re-use of an inode within an RCU grace period due to the
* radix tree nodes not being updated yet. We monitor for this by
* setting the inode number to zero before freeing the inode structure.
* If the inode has been reallocated and set up, then the inode number
* will not match, so check for that, too.
*/
spin_lock(&ip->i_flags_lock);
if (ip->i_ino != ino) {
trace_xfs_iget_skip(ip);
XFS_STATS_INC(xs_ig_frecycle);
error = EAGAIN;
goto out_error;
}


/*
* If we are racing with another cache hit that is currently
@@ -219,15 +242,15 @@ xfs_iget_cache_hit(
ip->i_flags |= XFS_IRECLAIM;

spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock);
rcu_read_unlock();

error = -inode_init_always(mp->m_super, inode);
if (error) {
/*
* Re-initializing the inode failed, and we are in deep
* trouble. Try to re-add it to the reclaim list.
*/
read_lock(&pag->pag_ici_lock);
rcu_read_lock();
spin_lock(&ip->i_flags_lock);

ip->i_flags &= ~XFS_INEW;
@@ -261,7 +284,7 @@ xfs_iget_cache_hit(

/* We've got a live one. */
spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock);
rcu_read_unlock();
trace_xfs_iget_hit(ip);
}

@@ -275,7 +298,7 @@ xfs_iget_cache_hit(

out_error:
spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock);
rcu_read_unlock();
return error;
}

@@ -397,7 +420,7 @@ xfs_iget(
xfs_agino_t agino;

/* reject inode numbers outside existing AGs */
if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
return EINVAL;

/* get the perag structure and ensure that it's inode capable */
@@ -406,15 +429,15 @@ xfs_iget(

again:
error = 0;
read_lock(&pag->pag_ici_lock);
rcu_read_lock();
ip = radix_tree_lookup(&pag->pag_ici_root, agino);

if (ip) {
error = xfs_iget_cache_hit(pag, ip, flags, lock_flags);
error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
if (error)
goto out_error_or_again;
} else {
read_unlock(&pag->pag_ici_lock);
rcu_read_unlock();
XFS_STATS_INC(xs_ig_missed);

error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
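
The EAGAIN produced by the new identity check in xfs_iget_cache_hit() is
consumed by the retry logic already present at the end of xfs_iget() (not part
of this diff); it is roughly the following, so a raced lookup simply backs off
and repeats:

	out_error_or_again:
		if (error == EAGAIN) {
			delay(1);	/* brief backoff before retrying */
			goto again;	/* re-run the radix tree lookup */
		}
		xfs_perag_put(pag);
		return error;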
