---

yaml --- r: 229038 b: refs/heads/master c: 1a3e8f3 h: refs/heads/master v: v3
git-mirror · Dec 17, 2010 · dd8f120 · dd8f120
1 parent 52d604b
commit dd8f120
Show file tree

Hide file tree

Showing 4 changed files with 142 additions and 43 deletions.
diff --git a/[refs] b/[refs]
@@ -1,2 +1,2 @@
 ---
-refs/heads/master: d95b7aaf9ab6738bef1ebcc52ab66563085e44ac
+refs/heads/master: 1a3e8f3da09c7082d25b512a0ffe569391e4c09a
diff --git a/trunk/fs/xfs/linux-2.6/xfs_sync.c b/trunk/fs/xfs/linux-2.6/xfs_sync.c
@@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab(
 {
 	struct inode		*inode = VFS_I(ip);
 
+	ASSERT(rcu_read_lock_held());
+
+	/*
+	 * check for stale RCU freed inode
+	 *
+	 * If the inode has been reallocated, it doesn't matter if it's not in
+	 * the AG we are walking - we are walking for writeback, so if it
+	 * passes all the "valid inode" checks and is dirty, then we'll write
+	 * it back anyway.  If it has been reallocated and still being
+	 * initialised, the XFS_INEW check below will catch it.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (!ip->i_ino)
+		goto out_unlock_noent;
+
+	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+	if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+		goto out_unlock_noent;
+	spin_unlock(&ip->i_flags_lock);
+
 	/* nothing to sync during shutdown */
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return EFSCORRUPTED;
 
-	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
-	if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
-		return ENOENT;
-
 	/* If we can't grab the inode, it must on it's way to reclaim. */
 	if (!igrab(inode))
 		return ENOENT;
@@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab(
 
 	/* inode is valid */
 	return 0;
+
+out_unlock_noent:
+	spin_unlock(&ip->i_flags_lock);
+	return ENOENT;
 }
 
 STATIC int
@@ -98,12 +118,12 @@ xfs_inode_ag_walk(
 		int		error = 0;
 		int		i;
 
-		read_lock(&pag->pag_ici_lock);
+		rcu_read_lock();
 		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
 					(void **)batch, first_index,
 					XFS_LOOKUP_BATCH);
 		if (!nr_found) {
-			read_unlock(&pag->pag_ici_lock);
+			rcu_read_unlock();
 			break;
 		}
 
@@ -118,18 +138,26 @@ xfs_inode_ag_walk(
 				batch[i] = NULL;
 
 			/*
-			 * Update the index for the next lookup. Catch overflows
-			 * into the next AG range which can occur if we have inodes
-			 * in the last block of the AG and we are currently
-			 * pointing to the last inode.
+			 * Update the index for the next lookup. Catch
+			 * overflows into the next AG range which can occur if
+			 * we have inodes in the last block of the AG and we
+			 * are currently pointing to the last inode.
+			 *
+			 * Because we may see inodes that are from the wrong AG
+			 * due to RCU freeing and reallocation, only update the
+			 * index if it lies in this AG. It was a race that lead
+			 * us to see this inode, so another lookup from the
+			 * same index will not find it again.
 			 */
+			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+				continue;
 			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
 			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
 				done = 1;
 		}
 
 		/* unlock now we've grabbed the inodes. */
-		read_unlock(&pag->pag_ici_lock);
+		rcu_read_unlock();
 
 		for (i = 0; i < nr_found; i++) {
 			if (!batch[i])
@@ -639,9 +667,14 @@ xfs_reclaim_inode_grab(
 	struct xfs_inode	*ip,
 	int			flags)
 {
+	ASSERT(rcu_read_lock_held());
+
+	/* quick check for stale RCU freed inode */
+	if (!ip->i_ino)
+		return 1;
 
 	/*
-	 * do some unlocked checks first to avoid unnecceary lock traffic.
+	 * do some unlocked checks first to avoid unnecessary lock traffic.
 	 * The first is a flush lock check, the second is a already in reclaim
 	 * check. Only do these checks if we are not going to block on locks.
 	 */
@@ -654,11 +687,16 @@ xfs_reclaim_inode_grab(
 	 * The radix tree lock here protects a thread in xfs_iget from racing
 	 * with us starting reclaim on the inode.  Once we have the
 	 * XFS_IRECLAIM flag set it will not touch us.
+	 *
+	 * Due to RCU lookup, we may find inodes that have been freed and only
+	 * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
+	 * aren't candidates for reclaim at all, so we must check the
+	 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
 	 */
 	spin_lock(&ip->i_flags_lock);
-	ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
-	if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
-		/* ignore as it is already under reclaim */
+	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
+	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
+		/* not a reclaim candidate. */
 		spin_unlock(&ip->i_flags_lock);
 		return 1;
 	}
@@ -864,14 +902,14 @@ xfs_reclaim_inodes_ag(
 			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
 			int	i;
 
-			write_lock(&pag->pag_ici_lock);
+			rcu_read_lock();
 			nr_found = radix_tree_gang_lookup_tag(
 					&pag->pag_ici_root,
 					(void **)batch, first_index,
 					XFS_LOOKUP_BATCH,
 					XFS_ICI_RECLAIM_TAG);
 			if (!nr_found) {
-				write_unlock(&pag->pag_ici_lock);
+				rcu_read_unlock();
 				break;
 			}
 
@@ -891,14 +929,24 @@ xfs_reclaim_inodes_ag(
 				 * occur if we have inodes in the last block of
 				 * the AG and we are currently pointing to the
 				 * last inode.
+				 *
+				 * Because we may see inodes that are from the
+				 * wrong AG due to RCU freeing and
+				 * reallocation, only update the index if it
+				 * lies in this AG. It was a race that lead us
+				 * to see this inode, so another lookup from
+				 * the same index will not find it again.
 				 */
+				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
+								pag->pag_agno)
+					continue;
 				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
 				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
 					done = 1;
 			}
 
 			/* unlock now we've grabbed the inodes. */
-			write_unlock(&pag->pag_ici_lock);
+			rcu_read_unlock();
 
 			for (i = 0; i < nr_found; i++) {
 				if (!batch[i])

diff --git a/trunk/fs/xfs/xfs_iget.c b/trunk/fs/xfs/xfs_iget.c
@@ -80,6 +80,7 @@ xfs_inode_alloc(
 	ASSERT(atomic_read(&ip->i_pincount) == 0);
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
 	ASSERT(completion_done(&ip->i_flush));
+	ASSERT(ip->i_ino == 0);
 
 	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
 	lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
@@ -98,9 +99,6 @@ xfs_inode_alloc(
 	ip->i_size = 0;
 	ip->i_new_size = 0;
 
-	/* prevent anyone from using this yet */
-	VFS_I(ip)->i_state = I_NEW;
-
 	return ip;
 }
 
@@ -159,6 +157,16 @@ xfs_inode_free(
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
 	ASSERT(completion_done(&ip->i_flush));
 
+	/*
+	 * Because we use RCU freeing we need to ensure the inode always
+	 * appears to be reclaimed with an invalid inode number when in the
+	 * free state. The ip->i_flags_lock provides the barrier against lookup
+	 * races.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	ip->i_flags = XFS_IRECLAIM;
+	ip->i_ino = 0;
+	spin_unlock(&ip->i_flags_lock);
 	call_rcu((struct rcu_head *)&VFS_I(ip)->i_dentry, __xfs_inode_free);
 }
 
@@ -169,14 +177,29 @@ static int
 xfs_iget_cache_hit(
 	struct xfs_perag	*pag,
 	struct xfs_inode	*ip,
+	xfs_ino_t		ino,
 	int			flags,
-	int			lock_flags) __releases(pag->pag_ici_lock)
+	int			lock_flags) __releases(RCU)
 {
 	struct inode		*inode = VFS_I(ip);
 	struct xfs_mount	*mp = ip->i_mount;
 	int			error;
 
+	/*
+	 * check for re-use of an inode within an RCU grace period due to the
+	 * radix tree nodes not being updated yet. We monitor for this by
+	 * setting the inode number to zero before freeing the inode structure.
+	 * If the inode has been reallocated and set up, then the inode number
+	 * will not match, so check for that, too.
+	 */
 	spin_lock(&ip->i_flags_lock);
+	if (ip->i_ino != ino) {
+		trace_xfs_iget_skip(ip);
+		XFS_STATS_INC(xs_ig_frecycle);
+		error = EAGAIN;
+		goto out_error;
+	}
+
 
 	/*
 	 * If we are racing with another cache hit that is currently
@@ -219,15 +242,15 @@ xfs_iget_cache_hit(
 		ip->i_flags |= XFS_IRECLAIM;
 
 		spin_unlock(&ip->i_flags_lock);
-		read_unlock(&pag->pag_ici_lock);
+		rcu_read_unlock();
 
 		error = -inode_init_always(mp->m_super, inode);
 		if (error) {
 			/*
 			 * Re-initializing the inode failed, and we are in deep
 			 * trouble.  Try to re-add it to the reclaim list.
 			 */
-			read_lock(&pag->pag_ici_lock);
+			rcu_read_lock();
 			spin_lock(&ip->i_flags_lock);
 
 			ip->i_flags &= ~XFS_INEW;
@@ -261,7 +284,7 @@ xfs_iget_cache_hit(
 
 		/* We've got a live one. */
 		spin_unlock(&ip->i_flags_lock);
-		read_unlock(&pag->pag_ici_lock);
+		rcu_read_unlock();
 		trace_xfs_iget_hit(ip);
 	}
 
@@ -275,7 +298,7 @@ xfs_iget_cache_hit(
 
 out_error:
 	spin_unlock(&ip->i_flags_lock);
-	read_unlock(&pag->pag_ici_lock);
+	rcu_read_unlock();
 	return error;
 }
 
@@ -397,7 +420,7 @@ xfs_iget(
 	xfs_agino_t	agino;
 
 	/* reject inode numbers outside existing AGs */
-	if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
 		return EINVAL;
 
 	/* get the perag structure and ensure that it's inode capable */
@@ -406,15 +429,15 @@ xfs_iget(
 
 again:
 	error = 0;
-	read_lock(&pag->pag_ici_lock);
+	rcu_read_lock();
 	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
 
 	if (ip) {
-		error = xfs_iget_cache_hit(pag, ip, flags, lock_flags);
+		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
 		if (error)
 			goto out_error_or_again;
 	} else {
-		read_unlock(&pag->pag_ici_lock);
+		rcu_read_unlock();
 		XFS_STATS_INC(xs_ig_missed);
 
 		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,