From 94e8c95ccba8bc25b6385b8c2ba1b9cd90e86de6 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Fri, 13 Nov 2020 20:51:28 +0800
Subject: [PATCH] btrfs: add structure to keep track of extent range in
 end_bio_extent_readpage

In end_bio_extent_readpage() we had a strange dance around
extent_start/extent_len.

Hidden behind the strange dance is, it's just calling
endio_readpage_release_extent() on each bvec range.

Here is an example to explain the original work flow:

  Bio is for inode 257, containing 2 pages, for range [1M, 1M+8K)

  end_bio_extent_extent_readpage() entered
  |- extent_start = 0;
  |- extent_end = 0;
  |- bio_for_each_segment_all() {
  |  |- /* Got the 1st bvec */
  |  |- start = SZ_1M;
  |  |- end = SZ_1M + SZ_4K - 1;
  |  |- update = 1;
  |  |- if (extent_len == 0) {
  |  |  |- extent_start = start; /* SZ_1M */
  |  |  |- extent_len = end + 1 - start; /* SZ_1M */
  |  |  }
  |  |
  |  |- /* Got the 2nd bvec */
  |  |- start = SZ_1M + 4K;
  |  |- end = SZ_1M + 4K - 1;
  |  |- update = 1;
  |  |- if (extent_start + extent_len == start) {
  |  |  |- extent_len += end + 1 - start; /* SZ_8K */
  |  |  }
  |  } /* All bio vec iterated */
  |
  |- if (extent_len) {
     |- endio_readpage_release_extent(tree, extent_start, extent_len,
				      update);
	/* extent_start == SZ_1M, extent_len == SZ_8K, uptodate = 1 */

As the above flow shows, the existing code in end_bio_extent_readpage()
is accumulates extent_start/extent_len, and when the contiguous range
stops, calls endio_readpage_release_extent() for the range.

However current behavior has something not really considered:

- The inode can change
  For bio, its pages don't need to have contiguous page_offset.
  This means, even pages from different inodes can be packed into one
  bio.

- bvec cross page boundary
  There is a feature called multi-page bvec, where bvec->bv_len can go
  beyond bvec->bv_page boundary.

- Poor readability

This patch will address the problem:

- Introduce a proper structure, processed_extent, to record processed
  extent range

- Integrate inode/start/end/uptodate check into
  endio_readpage_release_extent()

- Add more comment on each step.
  This should greatly improve the readability, now in
  end_bio_extent_readpage() there are only two
  endio_readpage_release_extent() calls.

- Add inode check for contiguity
  Now we also ensure the inode is the same one before checking if the
  range is contiguous.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 107 +++++++++++++++++++++++++++++--------------
 1 file changed, 73 insertions(+), 34 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 07db1af9d7d50..519f74502a3ed 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2769,16 +2769,77 @@ static void end_bio_extent_writepage(struct bio *bio)
 	bio_put(bio);
 }
 
-static void
-endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
-			      int uptodate)
+/*
+ * Record previously processed extent range
+ *
+ * For endio_readpage_release_extent() to handle a full extent range, reducing
+ * the extent io operations.
+ */
+struct processed_extent {
+	struct btrfs_inode *inode;
+	/* Start of the range in @inode */
+	u64 start;
+	/* End of the range in in @inode */
+	u64 end;
+	bool uptodate;
+};
+
+/*
+ * Try to release processed extent range
+ *
+ * May not release the extent range right now if the current range is
+ * contiguous to processed extent.
+ *
+ * Will release processed extent when any of @inode, @uptodate, the range is
+ * no longer contiguous to the processed range.
+ *
+ * Passing @inode == NULL will force processed extent to be released.
+ */
+static void endio_readpage_release_extent(struct processed_extent *processed,
+			      struct btrfs_inode *inode, u64 start, u64 end,
+			      bool uptodate)
 {
 	struct extent_state *cached = NULL;
-	u64 end = start + len - 1;
+	struct extent_io_tree *tree;
+
+	/* The first extent, initialize @processed */
+	if (!processed->inode)
+		goto update;
 
-	if (uptodate && tree->track_uptodate)
-		set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
-	unlock_extent_cached_atomic(tree, start, end, &cached);
+	/*
+	 * Contiguous to processed extent, just uptodate the end.
+	 *
+	 * Several things to notice:
+	 *
+	 * - bio can be merged as long as on-disk bytenr is contiguous
+	 *   This means we can have page belonging to other inodes, thus need to
+	 *   check if the inode still matches.
+	 * - bvec can contain range beyond current page for multi-page bvec
+	 *   Thus we need to do processed->end + 1 >= start check
+	 */
+	if (processed->inode == inode && processed->uptodate == uptodate &&
+	    processed->end + 1 >= start && end >= processed->end) {
+		processed->end = end;
+		return;
+	}
+
+	tree = &processed->inode->io_tree;
+	/*
+	 * Now we don't have range contiguous to the processed range, release
+	 * the processed range now.
+	 */
+	if (processed->uptodate && tree->track_uptodate)
+		set_extent_uptodate(tree, processed->start, processed->end,
+				    &cached, GFP_ATOMIC);
+	unlock_extent_cached_atomic(tree, processed->start, processed->end,
+				    &cached);
+
+update:
+	/* Update processed to current range */
+	processed->inode = inode;
+	processed->start = start;
+	processed->end = end;
+	processed->uptodate = uptodate;
 }
 
 /*
@@ -2798,12 +2859,11 @@ static void end_bio_extent_readpage(struct bio *bio)
 	int uptodate = !bio->bi_status;
 	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
 	struct extent_io_tree *tree, *failure_tree;
+	struct processed_extent processed = { 0 };
 	u64 offset = 0;
 	u64 start;
 	u64 end;
 	u64 len;
-	u64 extent_start = 0;
-	u64 extent_len = 0;
 	int mirror;
 	int ret;
 	struct bvec_iter_all iter_all;
@@ -2912,32 +2972,11 @@ static void end_bio_extent_readpage(struct bio *bio)
 		unlock_page(page);
 		offset += len;
 
-		if (unlikely(!uptodate)) {
-			if (extent_len) {
-				endio_readpage_release_extent(tree,
-							      extent_start,
-							      extent_len, 1);
-				extent_start = 0;
-				extent_len = 0;
-			}
-			endio_readpage_release_extent(tree, start,
-						      end - start + 1, 0);
-		} else if (!extent_len) {
-			extent_start = start;
-			extent_len = end + 1 - start;
-		} else if (extent_start + extent_len == start) {
-			extent_len += end + 1 - start;
-		} else {
-			endio_readpage_release_extent(tree, extent_start,
-						      extent_len, uptodate);
-			extent_start = start;
-			extent_len = end + 1 - start;
-		}
+		endio_readpage_release_extent(&processed, BTRFS_I(inode),
+					      start, end, uptodate);
 	}
-
-	if (extent_len)
-		endio_readpage_release_extent(tree, extent_start, extent_len,
-					      uptodate);
+	/* Release the last extent */
+	endio_readpage_release_extent(&processed, NULL, 0, 0, false);
 	btrfs_io_bio_free_csum(io_bio);
 	bio_put(bio);
 }