From 170f37d6aa6ad4582eefd7459015de79e244536e Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)"
Date: Tue, 3 May 2022 00:09:31 -0400
Subject: [PATCH 1/2] block: Do not call folio_next() on an unreferenced folio

It is unsafe to call folio_next() on a folio unless you hold a reference
on it that prevents it from being split or freed.  After returning from
the iterator, iomap calls folio_end_writeback() which may drop the last
reference to the page, or allow the page to be split.  If that happens,
the iterator will not advance far enough through the bio_vec, leading
to assertion failures like the BUG() in folio_end_writeback() that
checks we're not trying to end writeback on a page not currently under
writeback.  Other assertion failures were also seen, but they're all
explained by this one bug.

Fix the bug by remembering where the next folio starts before returning
from the iterator.  There are other ways of fixing this bug, but this
seems the simplest.

Reported-by: Darrick J. Wong
Tested-by: Darrick J. Wong
Reported-by: Brian Foster
Tested-by: Brian Foster
Signed-off-by: Matthew Wilcox (Oracle)
---
 include/linux/bio.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 278cc81cc1e7f..00450fd86bb43 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -269,6 +269,7 @@ struct folio_iter {
 	size_t offset;
 	size_t length;
 	/* private: for use by the iterator */
+	struct folio *_next;
 	size_t _seg_count;
 	int _i;
 };
@@ -283,6 +284,7 @@ static inline void bio_first_folio(struct folio_iter *fi, struct bio *bio,
 			PAGE_SIZE * (bvec->bv_page - &fi->folio->page);
 	fi->_seg_count = bvec->bv_len;
 	fi->length = min(folio_size(fi->folio) - fi->offset, fi->_seg_count);
+	fi->_next = folio_next(fi->folio);
 	fi->_i = i;
 }
 
@@ -290,9 +292,10 @@ static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio)
 {
 	fi->_seg_count -= fi->length;
 	if (fi->_seg_count) {
-		fi->folio = folio_next(fi->folio);
+		fi->folio = fi->_next;
 		fi->offset = 0;
 		fi->length = min(folio_size(fi->folio), fi->_seg_count);
+		fi->_next = folio_next(fi->folio);
 	} else if (fi->_i + 1 < bio->bi_vcnt) {
 		bio_first_folio(fi, bio, fi->_i + 1);
 	} else {

From b9ff43dd27434dbd850b908e2e0e1f6e794efd9b Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)"
Date: Wed, 27 Apr 2022 17:01:28 -0400
Subject: [PATCH 2/2] mm/readahead: Fix readahead with large folios

Reading 100KB chunks from a big file (eg dd bs=100K) leads to poor
readahead behaviour.  Studying the traces in detail, I noticed two
problems.

The first is that we were setting the readahead flag on the folio
which contains the last byte read from the block.  This is wrong
because we will trigger readahead at the end of the read without
waiting to see if a subsequent read is going to use the pages we just
read.  Instead, we need to set the readahead flag on the first folio
_after_ the one which contains the last byte that we're reading.

The second is that we were looking for the index of the folio with the
readahead flag set to exactly match the start + size - async_size.
If we've rounded this, either down (as previously) or up (as now),
we'll think we hit a folio marked as readahead by a different read,
and try to read the wrong pages.  So round the expected index to the
order of the folio we hit.
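As a worked illustration of the new rounding (the numbers are
hypothetical, and this is a standalone userspace sketch of the
kernel's power-of-two round_up(), not kernel code):

	#include <stdio.h>

	/* Same arithmetic as the kernel's round_up() for power-of-two y. */
	#define round_up(x, y)	((((x) - 1) | ((y) - 1)) + 1)

	int main(void)
	{
		unsigned int order = 2;	/* order-2 folios cover 4 pages */

		/* mark = start + size - async_size = 8 is already
		 * folio-aligned, so the folio allocated at index 8
		 * gets the readahead flag. */
		printf("%lu\n", round_up(8UL, 1UL << order));	/* 8 */

		/* A mark of 6 falls inside the folio covering indices
		 * 4-7; rounding up moves the flag to index 8, the first
		 * folio _after_ the one containing the last byte read. */
		printf("%lu\n", round_up(6UL, 1UL << order));	/* 8 */
		return 0;
	}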
Reported-by: Guo Xuenan
Signed-off-by: Matthew Wilcox (Oracle)
---
 mm/readahead.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/mm/readahead.c b/mm/readahead.c
index 8e3775829513e..4a60cdb64262a 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -474,7 +474,8 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
 
 	if (!folio)
 		return -ENOMEM;
-	if (mark - index < (1UL << order))
+	mark = round_up(mark, 1UL << order);
+	if (index == mark)
 		folio_set_readahead(folio);
 	err = filemap_add_folio(ractl->mapping, folio, index, gfp);
 	if (err)
@@ -555,8 +556,9 @@ static void ondemand_readahead(struct readahead_control *ractl,
 	struct file_ra_state *ra = ractl->ra;
 	unsigned long max_pages = ra->ra_pages;
 	unsigned long add_pages;
-	unsigned long index = readahead_index(ractl);
-	pgoff_t prev_index;
+	pgoff_t index = readahead_index(ractl);
+	pgoff_t expected, prev_index;
+	unsigned int order = folio ? folio_order(folio) : 0;
 
 	/*
 	 * If the request exceeds the readahead window, allow the read to
@@ -575,8 +577,9 @@ static void ondemand_readahead(struct readahead_control *ractl,
 	 * It's the expected callback index, assume sequential access.
 	 * Ramp up sizes, and push forward the readahead window.
 	 */
-	if ((index == (ra->start + ra->size - ra->async_size) ||
-	     index == (ra->start + ra->size))) {
+	expected = round_up(ra->start + ra->size - ra->async_size,
+			1UL << order);
+	if (index == expected || index == (ra->start + ra->size)) {
 		ra->start += ra->size;
 		ra->size = get_next_ra_size(ra, max_pages);
 		ra->async_size = ra->size;
@@ -662,7 +665,7 @@ static void ondemand_readahead(struct readahead_control *ractl,
 	}
 
 	ractl->_index = ra->start;
-	page_cache_ra_order(ractl, ra, folio ? folio_order(folio) : 0);
+	page_cache_ra_order(ractl, ra, order);
 }
 
 void page_cache_sync_ra(struct readahead_control *ractl,
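Looking back at patch 1, here is a minimal sketch of the caller
pattern that makes the cached _next pointer necessary.  It is
illustrative only: finish_writeback() is a hypothetical stand-in for
iomap's ioend completion, not the actual code.

	static void finish_writeback(struct bio *bio)
	{
		struct folio_iter fi;

		bio_for_each_folio_all(fi, bio) {
			/* May drop the last reference to fi.folio. */
			folio_end_writeback(fi.folio);
			/*
			 * bio_next_folio() advances the iterator next.
			 * Before this patch it called folio_next(fi.folio)
			 * on the possibly freed folio; now it uses
			 * fi._next, cached while a reference was still
			 * held.
			 */
		}
	}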