From 658a52344fb139f9531e7543a6e0015b630feb38 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 23 Oct 2023 09:30:54 +0800 Subject: [PATCH 01/18] ext4: unify the type of flexbg_size to unsigned int The maximum value of flexbg_size is 2^31, but the maximum value of int is (2^31 - 1), so overflow may occur when the type of flexbg_size is declared as int. For example, when uninit_mask is initialized in ext4_alloc_group_tables(), if flexbg_size == 2^31, the initialized uninit_mask is incorrect, and this may causes set_flexbg_block_bitmap() to trigger a BUG_ON(). Therefore, the flexbg_size type is declared as unsigned int to avoid overflow and memory waste. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20231023013057.2117948-2-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/resize.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 4fe061edefdde..c6d4539d4c1fa 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -228,7 +228,7 @@ struct ext4_new_flex_group_data { * * Returns NULL on failure otherwise address of the allocated structure. */ -static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size) +static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned int flexbg_size) { struct ext4_new_flex_group_data *flex_gd; @@ -283,7 +283,7 @@ static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd) */ static int ext4_alloc_group_tables(struct super_block *sb, struct ext4_new_flex_group_data *flex_gd, - int flexbg_size) + unsigned int flexbg_size) { struct ext4_new_group_data *group_data = flex_gd->groups; ext4_fsblk_t start_blk; @@ -384,12 +384,12 @@ static int ext4_alloc_group_tables(struct super_block *sb, group = group_data[0].group; printk(KERN_DEBUG "EXT4-fs: adding a flex group with " - "%d groups, flexbg size is %d:\n", flex_gd->count, + "%u groups, flexbg size is %u:\n", flex_gd->count, flexbg_size); for (i = 0; i < flex_gd->count; i++) { ext4_debug( - "adding %s group %u: %u blocks (%d free, %d mdata blocks)\n", + "adding %s group %u: %u blocks (%u free, %u mdata blocks)\n", ext4_bg_has_super(sb, group + i) ? "normal" : "no-super", group + i, group_data[i].blocks_count, @@ -1606,7 +1606,7 @@ static int ext4_flex_group_add(struct super_block *sb, static int ext4_setup_next_flex_gd(struct super_block *sb, struct ext4_new_flex_group_data *flex_gd, ext4_fsblk_t n_blocks_count, - unsigned long flexbg_size) + unsigned int flexbg_size) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; @@ -1990,8 +1990,9 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) ext4_fsblk_t o_blocks_count; ext4_fsblk_t n_blocks_count_retry = 0; unsigned long last_update_time = 0; - int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex; + int err = 0; int meta_bg; + unsigned int flexbg_size = ext4_flex_bg_size(sbi); /* See if the device is actually as big as what was requested */ bh = ext4_sb_bread(sb, n_blocks_count - 1, 0); From b099eb87de105cf07cad731ded6fb40b2675108b Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 23 Oct 2023 09:30:55 +0800 Subject: [PATCH 02/18] ext4: remove unnecessary check from alloc_flex_gd() In commit 967ac8af4475 ("ext4: fix potential integer overflow in alloc_flex_gd()"), an overflow check is added to alloc_flex_gd() to prevent the allocated memory from being smaller than expected due to the overflow. However, after kmalloc() is replaced with kmalloc_array() in commit 6da2ec56059c ("treewide: kmalloc() -> kmalloc_array()"), the kmalloc_array() function has an overflow check, so the above problem will not occur. Therefore, the extra check is removed. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20231023013057.2117948-3-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/resize.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index c6d4539d4c1fa..0a57b199883ca 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -236,10 +236,7 @@ static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned int flexbg_size) if (flex_gd == NULL) goto out3; - if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_group_data)) - goto out2; flex_gd->count = flexbg_size; - flex_gd->groups = kmalloc_array(flexbg_size, sizeof(struct ext4_new_group_data), GFP_NOFS); From 5d1935ac02ca5aee364a449a35e2977ea84509b0 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 23 Oct 2023 09:30:56 +0800 Subject: [PATCH 03/18] ext4: avoid online resizing failures due to oversized flex bg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we online resize an ext4 filesystem with a oversized flexbg_size, mkfs.ext4 -F -G 67108864 $dev -b 4096 100M mount $dev $dir resize2fs $dev 16G the following WARN_ON is triggered: ================================================================== WARNING: CPU: 0 PID: 427 at mm/page_alloc.c:4402 __alloc_pages+0x411/0x550 Modules linked in: sg(E) CPU: 0 PID: 427 Comm: resize2fs Tainted: G E 6.6.0-rc5+ #314 RIP: 0010:__alloc_pages+0x411/0x550 Call Trace: __kmalloc_large_node+0xa2/0x200 __kmalloc+0x16e/0x290 ext4_resize_fs+0x481/0xd80 __ext4_ioctl+0x1616/0x1d90 ext4_ioctl+0x12/0x20 __x64_sys_ioctl+0xf0/0x150 do_syscall_64+0x3b/0x90 ================================================================== This is because flexbg_size is too large and the size of the new_group_data array to be allocated exceeds MAX_ORDER. Currently, the minimum value of MAX_ORDER is 8, the minimum value of PAGE_SIZE is 4096, the corresponding maximum number of groups that can be allocated is: (PAGE_SIZE << MAX_ORDER) / sizeof(struct ext4_new_group_data) ≈ 21845 And the value that is down-aligned to the power of 2 is 16384. Therefore, this value is defined as MAX_RESIZE_BG, and the number of groups added each time does not exceed this value during resizing, and is added multiple times to complete the online resizing. The difference is that the metadata in a flex_bg may be more dispersed. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20231023013057.2117948-4-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/resize.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 0a57b199883ca..e168a9f596001 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -218,10 +218,17 @@ struct ext4_new_flex_group_data { in the flex group */ __u16 *bg_flags; /* block group flags of groups in @groups */ + ext4_group_t resize_bg; /* number of allocated + new_group_data */ ext4_group_t count; /* number of groups in @groups */ }; +/* + * Avoiding memory allocation failures due to too many groups added each time. + */ +#define MAX_RESIZE_BG 16384 + /* * alloc_flex_gd() allocates a ext4_new_flex_group_data with size of * @flexbg_size. @@ -236,14 +243,18 @@ static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned int flexbg_size) if (flex_gd == NULL) goto out3; - flex_gd->count = flexbg_size; - flex_gd->groups = kmalloc_array(flexbg_size, + if (unlikely(flexbg_size > MAX_RESIZE_BG)) + flex_gd->resize_bg = MAX_RESIZE_BG; + else + flex_gd->resize_bg = flexbg_size; + + flex_gd->groups = kmalloc_array(flex_gd->resize_bg, sizeof(struct ext4_new_group_data), GFP_NOFS); if (flex_gd->groups == NULL) goto out2; - flex_gd->bg_flags = kmalloc_array(flexbg_size, sizeof(__u16), + flex_gd->bg_flags = kmalloc_array(flex_gd->resize_bg, sizeof(__u16), GFP_NOFS); if (flex_gd->bg_flags == NULL) goto out1; @@ -1602,8 +1613,7 @@ static int ext4_flex_group_add(struct super_block *sb, static int ext4_setup_next_flex_gd(struct super_block *sb, struct ext4_new_flex_group_data *flex_gd, - ext4_fsblk_t n_blocks_count, - unsigned int flexbg_size) + ext4_fsblk_t n_blocks_count) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; @@ -1627,7 +1637,7 @@ static int ext4_setup_next_flex_gd(struct super_block *sb, BUG_ON(last); ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last); - last_group = group | (flexbg_size - 1); + last_group = group | (flex_gd->resize_bg - 1); if (last_group > n_group) last_group = n_group; @@ -2130,8 +2140,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) /* Add flex groups. Note that a regular group is a * flex group with 1 group. */ - while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count, - flexbg_size)) { + while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count)) { if (time_is_before_jiffies(last_update_time + HZ * 10)) { if (last_update_time) ext4_msg(sb, KERN_INFO, From 665d3e0af4d35acf9a5f58dfd471bc27dbf55880 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 23 Oct 2023 09:30:57 +0800 Subject: [PATCH 04/18] ext4: reduce unnecessary memory allocation in alloc_flex_gd() When a large flex_bg file system is resized, the number of groups to be added may be small, and a large amount of memory that will not be used will be allocated. Therefore, resize_bg can be set to the size after the number of new_group_data to be used is aligned upwards to the power of 2. This does not affect the disk layout after online resize and saves some memory. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20231023013057.2117948-5-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/resize.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index e168a9f596001..4d4a5a32e310d 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -235,8 +235,10 @@ struct ext4_new_flex_group_data { * * Returns NULL on failure otherwise address of the allocated structure. */ -static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned int flexbg_size) +static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned int flexbg_size, + ext4_group_t o_group, ext4_group_t n_group) { + ext4_group_t last_group; struct ext4_new_flex_group_data *flex_gd; flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS); @@ -248,6 +250,14 @@ static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned int flexbg_size) else flex_gd->resize_bg = flexbg_size; + /* Avoid allocating large 'groups' array if not needed */ + last_group = o_group | (flex_gd->resize_bg - 1); + if (n_group <= last_group) + flex_gd->resize_bg = 1 << fls(n_group - o_group + 1); + else if (n_group - last_group < flex_gd->resize_bg) + flex_gd->resize_bg = 1 << max(fls(last_group - o_group + 1), + fls(n_group - last_group)); + flex_gd->groups = kmalloc_array(flex_gd->resize_bg, sizeof(struct ext4_new_group_data), GFP_NOFS); @@ -2131,7 +2141,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) if (err) goto out; - flex_gd = alloc_flex_gd(flexbg_size); + flex_gd = alloc_flex_gd(flexbg_size, o_group, n_group); if (flex_gd == NULL) { err = -ENOMEM; goto out; From f2fec3e99a32d7c14dbf63c824f8286ebc94b18d Mon Sep 17 00:00:00 2001 From: Gou Hao Date: Tue, 24 Oct 2023 11:52:15 +0800 Subject: [PATCH 05/18] ext4: delete redundant calculations in ext4_mb_get_buddy_page_lock() 'blocks_per_page' is always 1 after 'if (blocks_per_page >= 2)', 'pnum' and 'block' are equal in this case. Signed-off-by: Gou Hao Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20231024035215.29474-1-gouhao@uniontech.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d72b5e3c92ec4..847dc0fb15735 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1456,9 +1456,8 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, return 0; } - block++; - pnum = block / blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, gfp); + /* blocks_per_page == 1, hence we need another page for the buddy */ + page = find_or_create_page(inode->i_mapping, block + 1, gfp); if (!page) return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); From e89fdcc425b6feea4dfb33877e9256757905d763 Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Wed, 1 Nov 2023 21:17:17 +0530 Subject: [PATCH 06/18] ext4: enable dioread_nolock as default for bs < ps case dioread_nolock was originally disabled as a default option for bs < ps scenarios due to a data corruption issue. Since then, we've had some fixes in this area which address such issues. Enable dioread_nolock by default and remove the experimental warning message for bs < ps path. dioread for bs < ps has been tested on a 64k pagesize machine using: kvm-xfstest -C 3 -g auto with the following configs: 64k adv bigalloc_4k bigalloc_64k data_journal encrypt dioread_nolock dioread_nolock_4k ext3 ext3conv nojournal And no new regressions were seen compared to baseline kernel. Suggested-by: Ritesh Harjani (IBM) Signed-off-by: Ojaswin Mujoo Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20231101154717.531865-1-ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c5fcf377ab1fa..783a755078cfd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2793,15 +2793,6 @@ static int ext4_check_opt_consistency(struct fs_context *fc, return -EINVAL; } - if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DIOREAD_NOLOCK)) { - int blocksize = - BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); - if (blocksize < PAGE_SIZE) - ext4_msg(NULL, KERN_WARNING, "Warning: mounting with an " - "experimental mount option 'dioread_nolock' " - "for blocksize < PAGE_SIZE"); - } - err = ext4_check_test_dummy_encryption(fc, sb); if (err) return err; @@ -4410,7 +4401,7 @@ static void ext4_set_def_opts(struct super_block *sb, ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sb, DELALLOC); - if (sb->s_blocksize == PAGE_SIZE) + if (sb->s_blocksize <= PAGE_SIZE) set_opt(sb, DIOREAD_NOLOCK); } From 92573369144f40397e8514440afdf59f24905b40 Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Wed, 1 Nov 2023 22:08:10 +0530 Subject: [PATCH 07/18] ext4: treat end of range as exclusive in ext4_zero_range() The call to filemap_write_and_wait_range() assumes the range passed to be inclusive, so fix the call to make sure we follow that. Signed-off-by: Ojaswin Mujoo Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/e503107a7c73a2b68dec645c5ad798c437717c45.1698856309.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d5efe076d3d3f..01299b55a567a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4523,7 +4523,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, * Round up offset. This is not fallocate, we need to zero out * blocks, so convert interior block aligned part of the range to * unwritten and possibly manually zero out unaligned parts of the - * range. + * range. Here, start and partial_begin are inclusive, end and + * partial_end are exclusive. */ start = round_up(offset, 1 << blkbits); end = round_down((offset + len), 1 << blkbits); @@ -4609,7 +4610,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, * disk in case of crash before zeroing trans is committed. */ if (ext4_should_journal_data(inode)) { - ret = filemap_write_and_wait_range(mapping, start, end); + ret = filemap_write_and_wait_range(mapping, start, + end - 1); if (ret) { filemap_invalidate_unlock(mapping); goto out_mutex; From c6bfd724098457a1162a7b9fef07af176720055b Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Wed, 1 Nov 2023 22:08:11 +0530 Subject: [PATCH 08/18] ext4: clarify handling of unwritten bh in __ext4_block_zero_page_range() As an optimization, I was trying to work on exiting early from this function if dealing with unwritten extent since they anyways read 0. However, it was realised that there are certain code paths that can end up calling ext4_block_zero_page_range() for an unwritten bh that might still have data in pagecache. In this case, we can't exit early and we do require to process the bh and zero out the pagecache to ensure that a writeback can't kick in at a later time and flush the stale pagecache to disk. Since, adding the logic to exit early for unwritten bh was turning out to be much more nuanced and the current code already handles it well, just add a comment to explicitly document this behavior. Suggested-by: Jan Kara Signed-off-by: Ojaswin Mujoo Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/d859b7ae5fe42e6626479b91ed9f4da3aae4c597.1698856309.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 61277f7f87225..96d3211508a0b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3630,6 +3630,12 @@ void ext4_set_aops(struct inode *inode) inode->i_mapping->a_ops = &ext4_aops; } +/* + * Here we can't skip an unwritten buffer even though it usually reads zero + * because it might have data in pagecache (eg, if called from ext4_zero_range, + * ext4_punch_hole, etc) which needs to be properly zeroed out. Otherwise a + * racing writeback can come later and flush the stale pagecache to disk. + */ static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { From 2bf5eb2a7c22fc3dd011fda2722fd369b1c4608b Mon Sep 17 00:00:00 2001 From: Gou Hao Date: Mon, 13 Nov 2023 16:26:17 +0800 Subject: [PATCH 09/18] ext4: improving calculation of 'fe_{len|start}' in mb_find_extent() After first execution of mb_find_order_for_block(): 'fe_start' is the value of 'block' passed in mb_find_extent(). 'fe_len' is the difference between the length of order-chunk and remainder of the block divided by order-chunk. And 'next' does not require initialization after above modifications. Signed-off-by: Gou Hao Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20231113082617.11258-1-gouhao@uniontech.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 847dc0fb15735..0c69f2f8cf885 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1957,8 +1957,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, static int mb_find_extent(struct ext4_buddy *e4b, int block, int needed, struct ext4_free_extent *ex) { - int next = block; - int max, order; + int max, order, next; void *buddy; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); @@ -1976,16 +1975,12 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block, /* find actual order */ order = mb_find_order_for_block(e4b, block); - block = block >> order; - ex->fe_len = 1 << order; - ex->fe_start = block << order; + ex->fe_len = (1 << order) - (block & ((1 << order) - 1)); + ex->fe_start = block; ex->fe_group = e4b->bd_group; - /* calc difference from given start */ - next = next - ex->fe_start; - ex->fe_len -= next; - ex->fe_start += next; + block = block >> order; while (needed > ex->fe_len && mb_find_buddy(e4b, order, &max)) { From 990b6b5b13b7993b7f44740c0add3119d407ccbf Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 13 Dec 2023 09:32:20 +0800 Subject: [PATCH 10/18] jbd2: add errseq to detect client fs's bdev writeback error Add errseq in journal, so that JBD2 can detect whether metadata is successfully written to fs bdev. This patch adds detection in recovery process to replace original solution(using local variable wb_err). Signed-off-by: Zhihao Cheng Suggested-by: Jan Kara Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20231213013224.2100050-2-chengzhihao1@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 1 + fs/jbd2/recovery.c | 7 +------ include/linux/jbd2.h | 26 ++++++++++++++++++++++++++ 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 206cb53ef2b06..559938a82379a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1534,6 +1534,7 @@ static journal_t *journal_init_common(struct block_device *bdev, journal->j_fs_dev = fs_dev; journal->j_blk_offset = start; journal->j_total_len = len; + jbd2_init_fs_dev_write_error(journal); err = journal_load_superblock(journal); if (err) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 01f744cb97a40..1f7664984d6e4 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -289,8 +289,6 @@ int jbd2_journal_recover(journal_t *journal) journal_superblock_t * sb; struct recovery_info info; - errseq_t wb_err; - struct address_space *mapping; memset(&info, 0, sizeof(info)); sb = journal->j_superblock; @@ -308,9 +306,6 @@ int jbd2_journal_recover(journal_t *journal) return 0; } - wb_err = 0; - mapping = journal->j_fs_dev->bd_inode->i_mapping; - errseq_check_and_advance(&mapping->wb_err, &wb_err); err = do_one_pass(journal, &info, PASS_SCAN); if (!err) err = do_one_pass(journal, &info, PASS_REVOKE); @@ -334,7 +329,7 @@ int jbd2_journal_recover(journal_t *journal) err2 = sync_blockdev(journal->j_fs_dev); if (!err) err = err2; - err2 = errseq_check_and_advance(&mapping->wb_err, &wb_err); + err2 = jbd2_check_fs_dev_write_error(journal); if (!err) err = err2; /* Make sure all replayed data is on permanent storage */ diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index beb30719ee161..cea1aa70ae36f 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -998,6 +998,13 @@ struct journal_s */ struct block_device *j_fs_dev; + /** + * @j_fs_dev_wb_err: + * + * Records the errseq of the client fs's backing block device. + */ + errseq_t j_fs_dev_wb_err; + /** * @j_total_len: Total maximum capacity of the journal region on disk. */ @@ -1698,6 +1705,25 @@ static inline void jbd2_journal_abort_handle(handle_t *handle) handle->h_aborted = 1; } +static inline void jbd2_init_fs_dev_write_error(journal_t *journal) +{ + struct address_space *mapping = journal->j_fs_dev->bd_inode->i_mapping; + + /* + * Save the original wb_err value of client fs's bdev mapping which + * could be used to detect the client fs's metadata async write error. + */ + errseq_check_and_advance(&mapping->wb_err, &journal->j_fs_dev_wb_err); +} + +static inline int jbd2_check_fs_dev_write_error(journal_t *journal) +{ + struct address_space *mapping = journal->j_fs_dev->bd_inode->i_mapping; + + return errseq_check(&mapping->wb_err, + READ_ONCE(journal->j_fs_dev_wb_err)); +} + #endif /* __KERNEL__ */ /* Comparison functions for transaction IDs: perform comparisons using From 62ec1707cb071c95706d1bab85fbee8d5a3d2f24 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 13 Dec 2023 09:32:21 +0800 Subject: [PATCH 11/18] jbd2: replace journal state flag by checking errseq Now JBD2 detects metadata writeback error of fs dev according to errseq. Replace journal state flag by checking errseq. Signed-off-by: Zhihao Cheng Suggested-by: Jan Kara Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20231213013224.2100050-3-chengzhihao1@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 559938a82379a..b6c114c11b978 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1862,7 +1862,7 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, if (is_journal_aborted(journal)) return -EIO; - if (test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags)) { + if (jbd2_check_fs_dev_write_error(journal)) { jbd2_journal_abort(journal, -EIO); return -EIO; } @@ -2160,12 +2160,12 @@ int jbd2_journal_destroy(journal_t *journal) /* * OK, all checkpoint transactions have been checked, now check the - * write out io error flag and abort the journal if some buffer failed - * to write back to the original location, otherwise the filesystem - * may become inconsistent. + * writeback errseq of fs dev and abort the journal if some buffer + * failed to write back to the original location, otherwise the + * filesystem may become inconsistent. */ if (!is_journal_aborted(journal) && - test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags)) + jbd2_check_fs_dev_write_error(journal)) jbd2_journal_abort(journal, -EIO); if (journal->j_sb_buffer) { From 8a4fd33d879fb303b207f06ea6340d73f698c4ed Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 13 Dec 2023 09:32:22 +0800 Subject: [PATCH 12/18] jbd2: remove unused 'JBD2_CHECKPOINT_IO_ERROR' and 'j_atomic_flags' Since 'JBD2_CHECKPOINT_IO_ERROR' and j_atomic_flags' are not useful anymore after fs dev's errseq is imported into jbd2, just remove them. Signed-off-by: Zhihao Cheng Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20231213013224.2100050-4-chengzhihao1@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 11 ----------- include/linux/jbd2.h | 11 ----------- 2 files changed, 22 deletions(-) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 118699fff2f90..1c97e64c47849 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -556,7 +556,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) struct transaction_chp_stats_s *stats; transaction_t *transaction; journal_t *journal; - struct buffer_head *bh = jh2bh(jh); JBUFFER_TRACE(jh, "entry"); @@ -569,16 +568,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) JBUFFER_TRACE(jh, "removing from transaction"); - /* - * If we have failed to write the buffer out to disk, the filesystem - * may become inconsistent. We cannot abort the journal here since - * we hold j_list_lock and we have to be careful about races with - * jbd2_journal_destroy(). So mark the writeback IO error in the - * journal here and we abort the journal later from a better context. - */ - if (buffer_write_io_error(bh)) - set_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags); - __buffer_unlink(jh); jh->b_cp_transaction = NULL; percpu_counter_dec(&journal->j_checkpoint_jh_count); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index cea1aa70ae36f..971f3e826e152 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -755,11 +755,6 @@ struct journal_s */ unsigned long j_flags; - /** - * @j_atomic_flags: Atomic journaling state flags. - */ - unsigned long j_atomic_flags; - /** * @j_errno: * @@ -1406,12 +1401,6 @@ JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT) #define JBD2_JOURNAL_FLUSH_VALID (JBD2_JOURNAL_FLUSH_DISCARD | \ JBD2_JOURNAL_FLUSH_ZEROOUT) -/* - * Journal atomic flag definitions - */ -#define JBD2_CHECKPOINT_IO_ERROR 0x001 /* Detect io error while writing - * buffer back to disk */ - /* * Function declarations for the journaling transaction and buffer * management From b4e73e61268903d82dacff2bc6f4bb766c6ed555 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 13 Dec 2023 09:32:23 +0800 Subject: [PATCH 13/18] jbd2: abort journal when detecting metadata writeback error of fs dev This is a replacement solution of commit bc71726c725767 ("ext4: abort the filesystem if failed to async write metadata buffer"), JBD2 can detect metadata writeback error of fs dev by 'j_fs_dev_wb_err'. Signed-off-by: Zhihao Cheng Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20231213013224.2100050-5-chengzhihao1@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/transaction.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 5f08b5fd105a3..cb0b8d6fc0c6d 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1231,11 +1231,25 @@ static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh, int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) { struct journal_head *jh; + journal_t *journal; int rc; if (is_handle_aborted(handle)) return -EROFS; + journal = handle->h_transaction->t_journal; + if (jbd2_check_fs_dev_write_error(journal)) { + /* + * If the fs dev has writeback errors, it may have failed + * to async write out metadata buffers in the background. + * In this case, we could read old data from disk and write + * it out again, which may lead to on-disk filesystem + * inconsistency. Aborting journal can avoid it happen. + */ + jbd2_journal_abort(journal, -EIO); + return -EIO; + } + if (jbd2_write_access_granted(handle, bh, false)) return 0; From ada3fb86a3f3aea40903d5ad9aeec708dc049b8b Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 13 Dec 2023 09:32:24 +0800 Subject: [PATCH 14/18] ext4: move ext4_check_bdev_write_error() into nojournal mode Since JBD2 takes care of all metadata writeback errors of fs dev, ext4_check_bdev_write_error() is useful only in nojournal mode. Move it into '!ext4_handle_valid(handle)' branch. Signed-off-by: Zhihao Cheng Suggested-by: Jan Kara Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20231213013224.2100050-6-chengzhihao1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4_jbd2.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index d1a2e66244017..5d8055161acdb 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -235,8 +235,6 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, might_sleep(); - ext4_check_bdev_write_error(sb); - if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); if (err) { @@ -244,7 +242,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, handle, err); return err; } - } + } else + ext4_check_bdev_write_error(sb); if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) return 0; BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); From 7c784d624819acbeefb0018bac89e632467cca5a Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Wed, 13 Dec 2023 16:16:35 +1100 Subject: [PATCH 15/18] ext4: allow for the last group to be marked as trimmed The ext4 filesystem tracks the trim status of blocks at the group level. When an entire group has been trimmed then it is marked as such and subsequent trim invocations with the same minimum trim size will not be attempted on that group unless it is marked as able to be trimmed again such as when a block is freed. Currently the last group can't be marked as trimmed due to incorrect logic in ext4_last_grp_cluster(). ext4_last_grp_cluster() is supposed to return the zero based index of the last cluster in a group. This is then used by ext4_try_to_trim_range() to determine if the trim operation spans the entire group and as such if the trim status of the group should be recorded. ext4_last_grp_cluster() takes a 0 based group index, thus the valid values for grp are 0..(ext4_get_groups_count - 1). Any group index less than (ext4_get_groups_count - 1) is not the last group and must have EXT4_CLUSTERS_PER_GROUP(sb) clusters. For the last group we need to calculate the number of clusters based on the number of blocks in the group. Finally subtract 1 from the number of clusters as zero based indexing is expected. Rearrange the function slightly to make it clear what we are calculating and returning. Reproducer: // Create file system where the last group has fewer blocks than // blocks per group $ mkfs.ext4 -b 4096 -g 8192 /dev/nvme0n1 8191 $ mount /dev/nvme0n1 /mnt Before Patch: $ fstrim -v /mnt /mnt: 25.9 MiB (27156480 bytes) trimmed // Group not marked as trimmed so second invocation still discards blocks $ fstrim -v /mnt /mnt: 25.9 MiB (27156480 bytes) trimmed After Patch: fstrim -v /mnt /mnt: 25.9 MiB (27156480 bytes) trimmed // Group marked as trimmed so second invocation DOESN'T discard any blocks fstrim -v /mnt /mnt: 0 B (0 bytes) trimmed Fixes: 45e4ab320c9b ("ext4: move setting of trimmed bit into ext4_try_to_trim_range()") Cc: # 4.19+ Signed-off-by: Suraj Jitindar Singh Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20231213051635.37731-1-surajjs@amazon.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 0c69f2f8cf885..4b72af2908d9c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6729,11 +6729,16 @@ __acquires(bitlock) static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb, ext4_group_t grp) { - if (grp < ext4_get_groups_count(sb)) - return EXT4_CLUSTERS_PER_GROUP(sb) - 1; - return (ext4_blocks_count(EXT4_SB(sb)->s_es) - - ext4_group_first_block_no(sb, grp) - 1) >> - EXT4_CLUSTER_BITS(sb); + unsigned long nr_clusters_in_group; + + if (grp < (ext4_get_groups_count(sb) - 1)) + nr_clusters_in_group = EXT4_CLUSTERS_PER_GROUP(sb); + else + nr_clusters_in_group = (ext4_blocks_count(EXT4_SB(sb)->s_es) - + ext4_group_first_block_no(sb, grp)) + >> EXT4_CLUSTER_BITS(sb); + + return nr_clusters_in_group - 1; } static bool ext4_trim_interrupted(void) From 4d5cdd757d0c74924b629559fccb68d8803ce995 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 14 Dec 2023 05:30:35 +0000 Subject: [PATCH 16/18] ext4: convert ext4_da_do_write_end() to take a folio There's nothing page-specific happening in ext4_da_do_write_end(); it's merely used for its refcount & lock, both of which are folio properties. Saves four calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20231214053035.1018876-1-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 96d3211508a0b..1eed6285191b6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2947,7 +2947,7 @@ static int ext4_da_should_update_i_disksize(struct folio *folio, static int ext4_da_do_write_end(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page) + struct folio *folio) { struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -2958,12 +2958,13 @@ static int ext4_da_do_write_end(struct address_space *mapping, * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES * flag, which all that's needed to trigger page writeback. */ - copied = block_write_end(NULL, mapping, pos, len, copied, page, NULL); + copied = block_write_end(NULL, mapping, pos, len, copied, + &folio->page, NULL); new_i_size = pos + copied; /* - * It's important to update i_size while still holding page lock, - * because page writeout could otherwise come in and zero beyond + * It's important to update i_size while still holding folio lock, + * because folio writeout could otherwise come in and zero beyond * i_size. * * Since we are holding inode lock, we are sure i_disksize <= @@ -2981,14 +2982,14 @@ static int ext4_da_do_write_end(struct address_space *mapping, i_size_write(inode, new_i_size); end = (new_i_size - 1) & (PAGE_SIZE - 1); - if (copied && ext4_da_should_update_i_disksize(page_folio(page), end)) { + if (copied && ext4_da_should_update_i_disksize(folio, end)) { ext4_update_i_disksize(inode, new_i_size); disksize_changed = true; } } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (old_size < pos) pagecache_isize_extended(inode, old_size, pos); @@ -3027,10 +3028,10 @@ static int ext4_da_write_end(struct file *file, return ext4_write_inline_data_end(inode, pos, len, copied, folio); - if (unlikely(copied < len) && !PageUptodate(page)) + if (unlikely(copied < len) && !folio_test_uptodate(folio)) copied = 0; - return ext4_da_do_write_end(mapping, pos, len, copied, &folio->page); + return ext4_da_do_write_end(mapping, pos, len, copied, folio); } /* From 1f6bc02f18489b9c9ea39b068d0695fb0e4567e9 Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Fri, 15 Dec 2023 16:49:50 +0530 Subject: [PATCH 17/18] ext4: fallback to complex scan if aligned scan doesn't work Currently in case the goal length is a multiple of stripe size we use ext4_mb_scan_aligned() to find the stripe size aligned physical blocks. In case we are not able to find any, we again go back to calling ext4_mb_choose_next_group() to search for a different suitable block group. However, since the linear search always begins from the start, most of the times we end up with the same BG and the cycle continues. With large fliesystems, the CPU can be stuck in this loop for hours which can slow down the whole system. Hence, until we figure out a better way to continue the search (rather than starting from beginning) in ext4_mb_choose_next_group(), lets just fallback to ext4_mb_complex_scan_group() in case aligned scan fails, as it is much more likely to find the needed blocks. Signed-off-by: Ojaswin Mujoo Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/ee033f6dfa0a7f2934437008a909c3788233950f.1702455010.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4b72af2908d9c..ebbceb2bac4d3 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2889,14 +2889,19 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) ac->ac_groups_scanned++; if (cr == CR_POWER2_ALIGNED) ext4_mb_simple_scan_group(ac, &e4b); - else if ((cr == CR_GOAL_LEN_FAST || - cr == CR_BEST_AVAIL_LEN) && - sbi->s_stripe && - !(ac->ac_g_ex.fe_len % - EXT4_B2C(sbi, sbi->s_stripe))) - ext4_mb_scan_aligned(ac, &e4b); - else - ext4_mb_complex_scan_group(ac, &e4b); + else { + bool is_stripe_aligned = sbi->s_stripe && + !(ac->ac_g_ex.fe_len % + EXT4_B2C(sbi, sbi->s_stripe)); + + if ((cr == CR_GOAL_LEN_FAST || + cr == CR_BEST_AVAIL_LEN) && + is_stripe_aligned) + ext4_mb_scan_aligned(ac, &e4b); + + if (ac->ac_status == AC_STATUS_CONTINUE) + ext4_mb_complex_scan_group(ac, &e4b); + } ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); From 68da4c44b994aea797eb9821acb3a4a36015293e Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Sat, 16 Dec 2023 09:09:19 +0800 Subject: [PATCH 18/18] ext4: fix inconsistent between segment fstrim and full fstrim Suppose we issue two FITRIM ioctls for ranges [0,15] and [16,31] with mininum length of trimmed range set to 8 blocks. If we have say a range of blocks 10-22 free, this range will not be trimmed because it straddles the boundary of the two FITRIM ranges and neither part is big enough. This is a bit surprising to some users that call FITRIM on smaller ranges of blocks to limit impact on the system. Also XFS trims all free space extents that overlap with the specified range so we are inconsistent among filesystems. Let's change ext4_try_to_trim_range() to consider for trimming the whole free space extent that straddles the end of specified range, not just the part of it within the range. Signed-off-by: Ye Bin Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20231216010919.1995851-1-yebin10@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index ebbceb2bac4d3..f44f668e407f2 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6757,13 +6757,15 @@ static int ext4_try_to_trim_range(struct super_block *sb, __acquires(ext4_group_lock_ptr(sb, e4b->bd_group)) __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) { - ext4_grpblk_t next, count, free_count; + ext4_grpblk_t next, count, free_count, last, origin_start; bool set_trimmed = false; void *bitmap; + last = ext4_last_grp_cluster(sb, e4b->bd_group); bitmap = e4b->bd_bitmap; - if (start == 0 && max >= ext4_last_grp_cluster(sb, e4b->bd_group)) + if (start == 0 && max >= last) set_trimmed = true; + origin_start = start; start = max(e4b->bd_info->bb_first_free, start); count = 0; free_count = 0; @@ -6772,7 +6774,10 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) start = mb_find_next_zero_bit(bitmap, max + 1, start); if (start > max) break; - next = mb_find_next_bit(bitmap, max + 1, start); + + next = mb_find_next_bit(bitmap, last + 1, start); + if (origin_start == 0 && next >= last) + set_trimmed = true; if ((next - start) >= minblocks) { int ret = ext4_trim_extent(sb, start, next - start, e4b);