From ad56255feea91862de033fb83eba8833ff8b132d Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 11 Jan 2011 10:22:40 +1100
Subject: [PATCH] --- yaml --- r: 231261 b: refs/heads/master c:
 eda77982729b7170bdc9e8855f0682edf322d277 h: refs/heads/master i:   231259:
 84d029ebc59c864628a004a78aba668b59fc7a6c v: v3

---
 [refs]                            |  2 +-
 trunk/fs/xfs/linux-2.6/xfs_file.c | 38 +++++++++++++++++++++++--------
 2 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/[refs] b/[refs]
index 6a9fb4ed139d..c77dcde448ad 100644
--- a/[refs]
+++ b/[refs]
@@ -1,2 +1,2 @@
 ---
-refs/heads/master: 4d8d15812fd9bc96d0da11467d23e0373feae933
+refs/heads/master: eda77982729b7170bdc9e8855f0682edf322d277
diff --git a/trunk/fs/xfs/linux-2.6/xfs_file.c b/trunk/fs/xfs/linux-2.6/xfs_file.c
index 5863dd8f448c..ef51eb43e137 100644
--- a/trunk/fs/xfs/linux-2.6/xfs_file.c
+++ b/trunk/fs/xfs/linux-2.6/xfs_file.c
@@ -684,9 +684,24 @@ xfs_file_aio_write_checks(
  * xfs_file_dio_aio_write - handle direct IO writes
  *
  * Lock the inode appropriately to prepare for and issue a direct IO write.
- * By spearating it from the buffered write path we remove all the tricky to
+ * By separating it from the buffered write path we remove all the tricky to
  * follow locking changes and looping.
  *
+ * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
+ * until we're sure the bytes at the new EOF have been zeroed and/or the cached
+ * pages are flushed out.
+ *
+ * In most cases the direct IO writes will be done holding IOLOCK_SHARED
+ * allowing them to be done in parallel with reads and other direct IO writes.
+ * However, if the IO is not aligned to filesystem blocks, the direct IO layer
+ * needs to do sub-block zeroing and that requires serialisation against other
+ * direct IOs to the same block. In this case we need to serialise the
+ * submission of the unaligned IOs so that we don't get racing block zeroing in
+ * the dio layer.  To avoid the problem with aio, we also need to wait for
+ * outstanding IOs to complete so that unwritten extent conversion is completed
+ * before we try to map the overlapping block. This is currently implemented by
+ * hitting it with a big hammer (i.e. xfs_ioend_wait()).
+ *
  * Returns with locks held indicated by @iolock and errors indicated by
  * negative return values.
  */
@@ -706,6 +721,7 @@ xfs_file_dio_aio_write(
 	struct xfs_mount	*mp = ip->i_mount;
 	ssize_t			ret = 0;
 	size_t			count = ocount;
+	int			unaligned_io = 0;
 	struct xfs_buftarg	*target = XFS_IS_REALTIME_INODE(ip) ?
 					mp->m_rtdev_targp : mp->m_ddev_targp;
 
@@ -713,13 +729,10 @@ xfs_file_dio_aio_write(
 	if ((pos & target->bt_smask) || (count & target->bt_smask))
 		return -XFS_ERROR(EINVAL);
 
-	/*
-	 * For direct I/O, if there are cached pages or we're extending
-	 * the file, we need IOLOCK_EXCL until we're sure the bytes at
-	 * the new EOF have been zeroed and/or the cached pages are
-	 * flushed out.
-	 */
-	if (mapping->nrpages || pos > ip->i_size)
+	if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
+		unaligned_io = 1;
+
+	if (unaligned_io || mapping->nrpages || pos > ip->i_size)
 		*iolock = XFS_IOLOCK_EXCL;
 	else
 		*iolock = XFS_IOLOCK_SHARED;
@@ -737,8 +750,13 @@ xfs_file_dio_aio_write(
 			return ret;
 	}
 
-	if (*iolock == XFS_IOLOCK_EXCL) {
-		/* demote the lock now the cached pages are gone */
+	/*
+	 * If we are doing unaligned IO, wait for all other IO to drain;
+	 * otherwise demote the lock if we had to flush cached pages.
+	 */
+	if (unaligned_io)
+		xfs_ioend_wait(ip);
+	else if (*iolock == XFS_IOLOCK_EXCL) {
 		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
 		*iolock = XFS_IOLOCK_SHARED;
 	}