From d0364f9490d7e098963ce4d146b51f9cd1199412 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 2 Aug 2021 14:43:43 -0700
Subject: [PATCH 1/7] iomap: simplify iomap_readpage_actor

Now that the outstanding reads are counted in bytes, there is no need to
use the low-level __bio_try_merge_page API; we can switch back to always
using bio_add_page and simplify iomap_readpage_actor again.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/iomap/buffered-io.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 87ccb3438becd..712b6513a0c44 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -241,7 +241,6 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 	struct iomap_readpage_ctx *ctx = data;
 	struct page *page = ctx->cur_page;
 	struct iomap_page *iop;
-	bool same_page = false, is_contig = false;
 	loff_t orig_pos = pos;
 	unsigned poff, plen;
 	sector_t sector;
@@ -268,16 +267,10 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 	if (iop)
 		atomic_add(plen, &iop->read_bytes_pending);
 
-	/* Try to merge into a previous segment if we can */
 	sector = iomap_sector(iomap, pos);
-	if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
-		if (__bio_try_merge_page(ctx->bio, page, plen, poff,
-				&same_page))
-			goto done;
-		is_contig = true;
-	}
-
-	if (!is_contig || bio_full(ctx->bio, plen)) {
+	if (!ctx->bio ||
+	    bio_end_sector(ctx->bio) != sector ||
+	    bio_add_page(ctx->bio, page, plen, poff) != plen) {
 		gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
 		gfp_t orig_gfp = gfp;
 		unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
@@ -301,9 +294,8 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		ctx->bio->bi_iter.bi_sector = sector;
 		bio_set_dev(ctx->bio, iomap->bdev);
 		ctx->bio->bi_end_io = iomap_read_end_io;
+		__bio_add_page(ctx->bio, page, plen, poff);
 	}
-
-	bio_add_page(ctx->bio, page, plen, poff);
 done:
 	/*
 	 * Move the caller beyond our range so that it keeps making progress.

From c1b79f11f4ec27d3b3197a9584950a3be178c717 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 2 Aug 2021 14:43:43 -0700
Subject: [PATCH 2/7] iomap: simplify iomap_add_to_ioend

Now that the outstanding writes are counted in bytes, there is no need to
use the low-level __bio_try_merge_page API; we can switch back to always
using bio_add_page and simplify iomap_add_to_ioend again.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J.
Wong <djwong@kernel.org> --- fs/iomap/buffered-io.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 712b6513a0c44..a463b41c0a166 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1251,7 +1251,6 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, sector_t sector = iomap_sector(&wpc->iomap, offset); unsigned len = i_blocksize(inode); unsigned poff = offset & (PAGE_SIZE - 1); - bool merged, same_page = false; if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, offset, sector)) { if (wpc->ioend) @@ -1259,19 +1258,13 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, wpc->ioend = iomap_alloc_ioend(inode, wpc, offset, sector, wbc); } - merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, - &same_page); - if (iop) - atomic_add(len, &iop->write_bytes_pending); - - if (!merged) { - if (bio_full(wpc->ioend->io_bio, len)) { - wpc->ioend->io_bio = - iomap_chain_bio(wpc->ioend->io_bio); - } - bio_add_page(wpc->ioend->io_bio, page, len, poff); + if (bio_add_page(wpc->ioend->io_bio, page, len, poff) != len) { + wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio); + __bio_add_page(wpc->ioend->io_bio, page, len, poff); } + if (iop) + atomic_add(len, &iop->write_bytes_pending); wpc->ioend->io_size += len; wbc_account_cgroup_owner(wbc, page, len); } From 69f4a26c1e0c7c5e5e77c5bd7b271743c124c545 Mon Sep 17 00:00:00 2001 From: Gao Xiang <hsiangkao@linux.alibaba.com> Date: Tue, 3 Aug 2021 09:38:22 -0700 Subject: [PATCH 3/7] iomap: support reading inline data from non-zero pos The existing inline data support only works for cases where the entire file is stored as inline data. For larger files, EROFS stores the initial blocks separately and the remainder of the file ("file tail") adjacent to the inode. Generalise inline data to allow reading the inline file tail. Tails may not cross a page boundary in memory. We currently have no filesystems that support tails and writing, so that case is currently disabled (see iomap_write_begin_inline). Reviewed-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com> Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com> Signed-off-by: Darrick J. 
Wong <djwong@kernel.org> --- fs/iomap/buffered-io.c | 42 ++++++++++++++++++++++++++++++------------ fs/iomap/direct-io.c | 10 ++++++---- include/linux/iomap.h | 18 ++++++++++++++++++ 3 files changed, 54 insertions(+), 16 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index a463b41c0a166..1d31ff6bfea08 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -205,25 +205,32 @@ struct iomap_readpage_ctx { struct readahead_control *rac; }; -static void -iomap_read_inline_data(struct inode *inode, struct page *page, +static int iomap_read_inline_data(struct inode *inode, struct page *page, struct iomap *iomap) { - size_t size = i_size_read(inode); + size_t size = i_size_read(inode) - iomap->offset; void *addr; if (PageUptodate(page)) - return; + return 0; - BUG_ON(page_has_private(page)); - BUG_ON(page->index); - BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data)); + /* inline data must start page aligned in the file */ + if (WARN_ON_ONCE(offset_in_page(iomap->offset))) + return -EIO; + if (WARN_ON_ONCE(size > PAGE_SIZE - + offset_in_page(iomap->inline_data))) + return -EIO; + if (WARN_ON_ONCE(size > iomap->length)) + return -EIO; + if (WARN_ON_ONCE(page_has_private(page))) + return -EIO; addr = kmap_atomic(page); memcpy(addr, iomap->inline_data, size); memset(addr + size, 0, PAGE_SIZE - size); kunmap_atomic(addr); SetPageUptodate(page); + return 0; } static inline bool iomap_block_needs_zeroing(struct inode *inode, @@ -246,8 +253,10 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, sector_t sector; if (iomap->type == IOMAP_INLINE) { - WARN_ON_ONCE(pos); - iomap_read_inline_data(inode, page, iomap); + int ret = iomap_read_inline_data(inode, page, iomap); + + if (ret) + return ret; return PAGE_SIZE; } @@ -581,6 +590,15 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags, return 0; } +static int iomap_write_begin_inline(struct inode *inode, + struct page *page, struct iomap *srcmap) +{ + /* needs more work for the tailpacking case; disable for now */ + if (WARN_ON_ONCE(srcmap->offset != 0)) + return -EIO; + return iomap_read_inline_data(inode, page, srcmap); +} + static int iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, struct page **pagep, struct iomap *iomap, struct iomap *srcmap) @@ -610,7 +628,7 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, } if (srcmap->type == IOMAP_INLINE) - iomap_read_inline_data(inode, page, srcmap); + status = iomap_write_begin_inline(inode, page, srcmap); else if (iomap->flags & IOMAP_F_BUFFER_HEAD) status = __block_write_begin_int(page, pos, len, NULL, srcmap); else @@ -663,11 +681,11 @@ static size_t iomap_write_end_inline(struct inode *inode, struct page *page, void *addr; WARN_ON_ONCE(!PageUptodate(page)); - BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data)); + BUG_ON(!iomap_inline_data_valid(iomap)); flush_dcache_page(page); addr = kmap_atomic(page); - memcpy(iomap->inline_data + pos, addr + pos, copied); + memcpy(iomap_inline_data(iomap, pos), addr + pos, copied); kunmap_atomic(addr); mark_inode_dirty(inode); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 9398b8c31323b..41ccbfc9dc820 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -378,23 +378,25 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, struct iomap_dio *dio, struct iomap *iomap) { struct iov_iter *iter = dio->submit.iter; + void *inline_data = 
iomap_inline_data(iomap, pos); size_t copied; - BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data)); + if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap))) + return -EIO; if (dio->flags & IOMAP_DIO_WRITE) { loff_t size = inode->i_size; if (pos > size) - memset(iomap->inline_data + size, 0, pos - size); - copied = copy_from_iter(iomap->inline_data + pos, length, iter); + memset(iomap_inline_data(iomap, size), 0, pos - size); + copied = copy_from_iter(inline_data, length, iter); if (copied) { if (pos + copied > size) i_size_write(inode, pos + copied); mark_inode_dirty(inode); } } else { - copied = copy_to_iter(iomap->inline_data + pos, length, iter); + copied = copy_to_iter(inline_data, length, iter); } dio->size += copied; return copied; diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 479c1da3e2211..b8ec145b2975c 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -97,6 +97,24 @@ iomap_sector(struct iomap *iomap, loff_t pos) return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT; } +/* + * Returns the inline data pointer for logical offset @pos. + */ +static inline void *iomap_inline_data(struct iomap *iomap, loff_t pos) +{ + return iomap->inline_data + pos - iomap->offset; +} + +/* + * Check if the mapping's length is within the valid range for inline data. + * This is used to guard against accessing data beyond the page inline_data + * points at. + */ +static inline bool iomap_inline_data_valid(struct iomap *iomap) +{ + return iomap->length <= PAGE_SIZE - offset_in_page(iomap->inline_data); +} + /* * When a filesystem sets page_ops in an iomap mapping it returns, page_prepare * and page_done will be called for each page written to. This only applies to From b405435b419cb660455ba54fd47086216e58fed6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" <willy@infradead.org> Date: Mon, 2 Aug 2021 14:45:57 -0700 Subject: [PATCH 4/7] iomap: Support inline data with block size < page size Remove the restriction that inline data must start on a page boundary in a file. This allows, for example, the first 2KiB to be stored out of line and the trailing 30 bytes to be stored inline. Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. 
Wong <djwong@kernel.org> --- fs/iomap/buffered-io.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 1d31ff6bfea08..28cfa7fab023f 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -209,28 +209,26 @@ static int iomap_read_inline_data(struct inode *inode, struct page *page, struct iomap *iomap) { size_t size = i_size_read(inode) - iomap->offset; + size_t poff = offset_in_page(iomap->offset); void *addr; if (PageUptodate(page)) - return 0; + return PAGE_SIZE - poff; - /* inline data must start page aligned in the file */ - if (WARN_ON_ONCE(offset_in_page(iomap->offset))) - return -EIO; if (WARN_ON_ONCE(size > PAGE_SIZE - offset_in_page(iomap->inline_data))) return -EIO; if (WARN_ON_ONCE(size > iomap->length)) return -EIO; - if (WARN_ON_ONCE(page_has_private(page))) - return -EIO; + if (poff > 0) + iomap_page_create(inode, page); - addr = kmap_atomic(page); + addr = kmap_atomic(page) + poff; memcpy(addr, iomap->inline_data, size); - memset(addr + size, 0, PAGE_SIZE - size); + memset(addr + size, 0, PAGE_SIZE - poff - size); kunmap_atomic(addr); - SetPageUptodate(page); - return 0; + iomap_set_range_uptodate(page, poff, PAGE_SIZE - poff); + return PAGE_SIZE - poff; } static inline bool iomap_block_needs_zeroing(struct inode *inode, @@ -252,13 +250,8 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, unsigned poff, plen; sector_t sector; - if (iomap->type == IOMAP_INLINE) { - int ret = iomap_read_inline_data(inode, page, iomap); - - if (ret) - return ret; - return PAGE_SIZE; - } + if (iomap->type == IOMAP_INLINE) + return iomap_read_inline_data(inode, page, iomap); /* zero post-eof blocks as the page may be mapped */ iop = iomap_page_create(inode, page); @@ -593,10 +586,15 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags, static int iomap_write_begin_inline(struct inode *inode, struct page *page, struct iomap *srcmap) { + int ret; + /* needs more work for the tailpacking case; disable for now */ if (WARN_ON_ONCE(srcmap->offset != 0)) return -EIO; - return iomap_read_inline_data(inode, page, srcmap); + ret = iomap_read_inline_data(inode, page, srcmap); + if (ret < 0) + return ret; + return 0; } static int From f1f264b4c134ee65cdadece7a20f3c0643602a4a Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher <agruenba@redhat.com> Date: Mon, 2 Aug 2021 14:46:31 -0700 Subject: [PATCH 5/7] iomap: Fix some typos and bad grammar Fix some typos and bad grammar in buffered-io.c to make the comments easier to read. Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> --- fs/iomap/buffered-io.c | 72 +++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 28cfa7fab023f..c1c8cd41ea81e 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -36,7 +36,7 @@ static inline struct iomap_page *to_iomap_page(struct page *page) { /* * per-block data is stored in the head page. Callers should - * not be dealing with tail pages (and if they are, they can + * not be dealing with tail pages, and if they are, they can * call thp_head() first. 
*/ VM_BUG_ON_PGFLAGS(PageTail(page), page); @@ -98,7 +98,7 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, unsigned last = (poff + plen - 1) >> block_bits; /* - * If the block size is smaller than the page size we need to check the + * If the block size is smaller than the page size, we need to check the * per-block uptodate status and adjust the offset and length if needed * to avoid reading in already uptodate ranges. */ @@ -126,7 +126,7 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, } /* - * If the extent spans the block that contains the i_size we need to + * If the extent spans the block that contains the i_size, we need to * handle both halves separately so that we properly zero data in the * page cache for blocks that are entirely outside of i_size. */ @@ -301,7 +301,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, done: /* * Move the caller beyond our range so that it keeps making progress. - * For that we have to include any leading non-uptodate ranges, but + * For that, we have to include any leading non-uptodate ranges, but * we can skip trailing ones as they will be handled in the next * iteration. */ @@ -338,9 +338,9 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops) } /* - * Just like mpage_readahead and block_read_full_page we always + * Just like mpage_readahead and block_read_full_page, we always * return 0 and just mark the page as PageError on errors. This - * should be cleaned up all through the stack eventually. + * should be cleaned up throughout the stack eventually. */ return 0; } @@ -461,7 +461,7 @@ iomap_releasepage(struct page *page, gfp_t gfp_mask) /* * mm accommodates an old ext3 case where clean pages might not have had * the dirty bit cleared. Thus, it can send actual dirty pages to - * ->releasepage() via shrink_active_list(), skip those here. + * ->releasepage() via shrink_active_list(); skip those here. */ if (PageDirty(page) || PageWriteback(page)) return 0; @@ -476,7 +476,7 @@ iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len) trace_iomap_invalidatepage(page->mapping->host, offset, len); /* - * If we are invalidating the entire page, clear the dirty state from it + * If we're invalidating the entire page, clear the dirty state from it * and release it to avoid unnecessary buildup of the LRU. */ if (offset == 0 && len == PAGE_SIZE) { @@ -658,13 +658,13 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, /* * The blocks that were entirely written will now be uptodate, so we * don't have to worry about a readpage reading them and overwriting a - * partial write. However if we have encountered a short write and only + * partial write. However, if we've encountered a short write and only * partially written into a block, it will not be marked uptodate, so a * readpage might come in and destroy our partial write. * - * Do the simplest thing, and just treat any short write to a non - * uptodate page as a zero-length write, and force the caller to redo - * the whole thing. + * Do the simplest thing and just treat any short write to a + * non-uptodate page as a zero-length write, and force the caller to + * redo the whole thing. */ if (unlikely(copied < len && !PageUptodate(page))) return 0; @@ -752,7 +752,7 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, bytes = length; /* - * Bring in the user page that we will copy from _first_. 
+ * Bring in the user page that we'll copy from _first_. * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. @@ -1161,7 +1161,7 @@ static void iomap_writepage_end_bio(struct bio *bio) * Submit the final bio for an ioend. * * If @error is non-zero, it means that we have a situation where some part of - * the submission process has failed after we have marked paged for writeback + * the submission process has failed after we've marked pages for writeback * and unlocked them. In this situation, we need to fail the bio instead of * submitting it. This typically only happens on a filesystem shutdown. */ @@ -1176,7 +1176,7 @@ iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend, error = wpc->ops->prepare_ioend(ioend, error); if (error) { /* - * If we are failing the IO now, just mark the ioend with an + * If we're failing the IO now, just mark the ioend with an * error and finish it. This will run IO completion immediately * as there is only one reference to the ioend at this point in * time. @@ -1218,7 +1218,7 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, /* * Allocate a new bio, and chain the old bio to the new one. * - * Note that we have to do perform the chaining in this unintuitive order + * Note that we have to perform the chaining in this unintuitive order * so that the bi_private linkage is set up in the right direction for the * traversal in iomap_finish_ioend(). */ @@ -1257,7 +1257,7 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, /* * Test to see if we have an existing ioend structure that we could append to - * first, otherwise finish off the current ioend and start another. + * first; otherwise finish off the current ioend and start another. */ static void iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, @@ -1288,9 +1288,9 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, /* * We implement an immediate ioend submission policy here to avoid needing to * chain multiple ioends and hence nest mempool allocations which can violate - * forward progress guarantees we need to provide. The current ioend we are - * adding blocks to is cached on the writepage context, and if the new block - * does not append to the cached ioend it will create a new ioend and cache that + * the forward progress guarantees we need to provide. The current ioend we're + * adding blocks to is cached in the writepage context, and if the new block + * doesn't append to the cached ioend, it will create a new ioend and cache that * instead. * * If a new ioend is created and cached, the old ioend is returned and queued @@ -1352,7 +1352,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, if (unlikely(error)) { /* * Let the filesystem know what portion of the current page - * failed to map. If the page wasn't been added to ioend, it + * failed to map. If the page hasn't been added to ioend, it * won't be affected by I/O completion and we must unlock it * now. */ @@ -1369,7 +1369,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, unlock_page(page); /* - * Preserve the original error if there was one, otherwise catch + * Preserve the original error if there was one; catch * submission errors here and propagate into subsequent ioend * submissions. */ @@ -1396,8 +1396,8 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, /* * Write out a dirty page. 
* - * For delalloc space on the page we need to allocate space and flush it. - * For unwritten space on the page we need to start the conversion to + * For delalloc space on the page, we need to allocate space and flush it. + * For unwritten space on the page, we need to start the conversion to * regular allocated space. */ static int @@ -1412,7 +1412,7 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) trace_iomap_writepage(inode, page_offset(page), PAGE_SIZE); /* - * Refuse to write the page out if we are called from reclaim context. + * Refuse to write the page out if we're called from reclaim context. * * This avoids stack overflows when called from deeply used stacks in * random callers for direct reclaim or memcg reclaim. We explicitly @@ -1457,20 +1457,20 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) unsigned offset_into_page = offset & (PAGE_SIZE - 1); /* - * Skip the page if it is fully outside i_size, e.g. due to a - * truncate operation that is in progress. We must redirty the + * Skip the page if it's fully outside i_size, e.g. due to a + * truncate operation that's in progress. We must redirty the * page so that reclaim stops reclaiming it. Otherwise * iomap_vm_releasepage() is called on it and gets confused. * - * Note that the end_index is unsigned long, it would overflow - * if the given offset is greater than 16TB on 32-bit system - * and if we do check the page is fully outside i_size or not - * via "if (page->index >= end_index + 1)" as "end_index + 1" - * will be evaluated to 0. Hence this page will be redirtied - * and be written out repeatedly which would result in an - * infinite loop, the user program that perform this operation - * will hang. Instead, we can verify this situation by checking - * if the page to write is totally beyond the i_size or if it's + * Note that the end_index is unsigned long. If the given + * offset is greater than 16TB on a 32-bit system then if we + * checked if the page is fully outside i_size with + * "if (page->index >= end_index + 1)", "end_index + 1" would + * overflow and evaluate to 0. Hence this page would be + * redirtied and written out repeatedly, which would result in + * an infinite loop; the user program performing this operation + * would hang. Instead, we can detect this situation by + * checking if the page is totally beyond i_size or if its * offset is just equal to the EOF. */ if (page->index > end_index || From ab069d5fdcd14530d4223746c8d01f421d4c4057 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" <willy@infradead.org> Date: Wed, 4 Aug 2021 20:07:33 -0700 Subject: [PATCH 6/7] iomap: Use kmap_local_page instead of kmap_atomic kmap_atomic() has the side-effect of disabling pagefaults and preemption. kmap_local_page() does not do this and is preferred. Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. 
Wong <djwong@kernel.org> --- fs/iomap/buffered-io.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index c1c8cd41ea81e..8ee0211bea860 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -223,10 +223,10 @@ static int iomap_read_inline_data(struct inode *inode, struct page *page, if (poff > 0) iomap_page_create(inode, page); - addr = kmap_atomic(page) + poff; + addr = kmap_local_page(page) + poff; memcpy(addr, iomap->inline_data, size); memset(addr + size, 0, PAGE_SIZE - poff - size); - kunmap_atomic(addr); + kunmap_local(addr); iomap_set_range_uptodate(page, poff, PAGE_SIZE - poff); return PAGE_SIZE - poff; } @@ -682,9 +682,9 @@ static size_t iomap_write_end_inline(struct inode *inode, struct page *page, BUG_ON(!iomap_inline_data_valid(iomap)); flush_dcache_page(page); - addr = kmap_atomic(page); - memcpy(iomap_inline_data(iomap, pos), addr + pos, copied); - kunmap_atomic(addr); + addr = kmap_local_page(page) + pos; + memcpy(iomap_inline_data(iomap, pos), addr, copied); + kunmap_local(addr); mark_inode_dirty(inode); return copied; From ae44f9c286da3fbb3f827076403ea64fa9adfef2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" <willy@infradead.org> Date: Wed, 4 Aug 2021 20:07:34 -0700 Subject: [PATCH 7/7] iomap: Add another assertion to inline data handling Check that the file tail does not cross a page boundary. Requested by Andreas. Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> --- fs/iomap/buffered-io.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 8ee0211bea860..586d9d078ce10 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -215,6 +215,8 @@ static int iomap_read_inline_data(struct inode *inode, struct page *page, if (PageUptodate(page)) return PAGE_SIZE - poff; + if (WARN_ON_ONCE(size > PAGE_SIZE - poff)) + return -EIO; if (WARN_ON_ONCE(size > PAGE_SIZE - offset_in_page(iomap->inline_data))) return -EIO;
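
Aside: the inline-data helpers introduced in patch 3 (and relaxed by patch 4)
boil down to plain pointer arithmetic. The userspace sketch below is not part
of the series; it mirrors iomap_inline_data() and iomap_inline_data_valid()
using simplified stand-ins for struct iomap, PAGE_SIZE, and offset_in_page()
(local approximations, not the kernel definitions), and runs the example from
patch 4's commit message: a file whose first 2KiB is stored out of line with a
30-byte tail packed inline.

	/*
	 * Illustration only: simplified stand-in types; just the offset math
	 * is meant to match the helpers added by patch 3.
	 */
	#include <stdio.h>
	#include <stdint.h>

	#define PAGE_SIZE		4096UL
	#define offset_in_page(p)	((unsigned long)(uintptr_t)(p) & (PAGE_SIZE - 1))

	struct iomap {
		long long	offset;		/* file offset this mapping starts at */
		unsigned long	length;		/* length of the mapping in bytes */
		void		*inline_data;	/* address of the packed file tail */
	};

	/* Inline data pointer for logical file offset @pos. */
	static void *iomap_inline_data(struct iomap *iomap, long long pos)
	{
		return (char *)iomap->inline_data + pos - iomap->offset;
	}

	/* The mapping must not reach past the page that inline_data points into. */
	static int iomap_inline_data_valid(struct iomap *iomap)
	{
		return iomap->length <= PAGE_SIZE - offset_in_page(iomap->inline_data);
	}

	int main(void)
	{
		static _Alignas(4096) char page[PAGE_SIZE];
		/* first 2KiB out of line, a 30-byte tail stored inline */
		struct iomap map = {
			.offset		= 2048,
			.length		= 30,
			.inline_data	= page + 100,	/* arbitrary in-page position */
		};

		/* prints: valid=1, pos 2060 -> inline byte 112 */
		printf("valid=%d, pos 2060 -> inline byte %td\n",
		       iomap_inline_data_valid(&map),
		       (char *)iomap_inline_data(&map, 2060) - page);
		return 0;
	}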