From a36e0ab44cb344728f7c0fdc34edcbae64739c16 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Wed, 12 Feb 2025 14:18:12 +0800 Subject: [PATCH 1/7] exfat: support batch discard of clusters when freeing clusters If the discard mount option is enabled, the file's clusters are discarded when the clusters are freed. Discarding clusters one by one will significantly reduce performance. Poor performance may cause soft lockup when lots of clusters are freed. This commit improves performance by discarding contiguous clusters in batches. Measure the performance by: # truncate -s 80G /mnt/file # time rm /mnt/file Without this commit: real 4m46.183s user 0m0.000s sys 0m12.863s With this commit: real 0m1.661s user 0m0.000s sys 0m0.017s Signed-off-by: Yuezhang Mo Signed-off-by: Namjae Jeon --- fs/exfat/balloc.c | 14 -------------- fs/exfat/fatent.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c index 9ff825f1502d..cc01556c9d9b 100644 --- a/fs/exfat/balloc.c +++ b/fs/exfat/balloc.c @@ -147,7 +147,6 @@ int exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync) unsigned int ent_idx; struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); - struct exfat_mount_options *opts = &sbi->options; if (!is_valid_cluster(sbi, clu)) return -EIO; @@ -163,19 +162,6 @@ int exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync) exfat_update_bh(sbi->vol_amap[i], sync); - if (opts->discard) { - int ret_discard; - - ret_discard = sb_issue_discard(sb, - exfat_cluster_to_sector(sbi, clu), - (1 << sbi->sect_per_clus_bits), GFP_NOFS, 0); - - if (ret_discard == -EOPNOTSUPP) { - exfat_err(sb, "discard not supported by device, disabling"); - opts->discard = 0; - } - } - return 0; } diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index 6f3651c6ca91..b9473a69f104 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -144,6 +144,20 @@ int exfat_chain_cont_cluster(struct super_block *sb, unsigned int chain, return 0; } +static inline void exfat_discard_cluster(struct super_block *sb, + unsigned int clu, unsigned int num_clusters) +{ + int ret; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + ret = sb_issue_discard(sb, exfat_cluster_to_sector(sbi, clu), + sbi->sect_per_clus * num_clusters, GFP_NOFS, 0); + if (ret == -EOPNOTSUPP) { + exfat_err(sb, "discard not supported by device, disabling"); + sbi->options.discard = 0; + } +} + /* This function must be called with bitmap_lock held */ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain) { @@ -196,7 +210,12 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain clu++; num_clusters++; } while (num_clusters < p_chain->size); + + if (sbi->options.discard) + exfat_discard_cluster(sb, p_chain->dir, p_chain->size); } else { + unsigned int nr_clu = 1; + do { bool sync = false; unsigned int n_clu = clu; @@ -215,6 +234,16 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain if (exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode)))) break; + + if (sbi->options.discard) { + if (n_clu == clu + 1) + nr_clu++; + else { + exfat_discard_cluster(sb, clu - nr_clu + 1, nr_clu); + nr_clu = 1; + } + } + clu = n_clu; num_clusters++; From f6369ae1f088cb6d18d7a07eec95d7c10c2a2a5e Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Wed, 19 Feb 2025 13:00:09 -0600 Subject: [PATCH 2/7] exfat: remove count used cluster from exfat_statfs() The callback function statfs() is called only after the file system is mounted. During the process of mounting the exFAT file system, the number of used clusters has been counted, so the condition "sbi->used_clusters == EXFAT_CLUSTERS_UNTRACKED" is always false and should be deleted. Signed-off-by: Yuezhang Mo Signed-off-by: Namjae Jeon --- fs/exfat/exfat_fs.h | 2 -- fs/exfat/super.c | 10 ---------- 2 files changed, 12 deletions(-) diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index d30ce18a88b7..f8ead4d47ef0 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -14,8 +14,6 @@ #define EXFAT_ROOT_INO 1 -#define EXFAT_CLUSTERS_UNTRACKED (~0u) - /* * exfat error flags */ diff --git a/fs/exfat/super.c b/fs/exfat/super.c index bd57844414aa..8465033a6cf0 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -67,15 +67,6 @@ static int exfat_statfs(struct dentry *dentry, struct kstatfs *buf) struct exfat_sb_info *sbi = EXFAT_SB(sb); unsigned long long id = huge_encode_dev(sb->s_bdev->bd_dev); - if (sbi->used_clusters == EXFAT_CLUSTERS_UNTRACKED) { - mutex_lock(&sbi->s_lock); - if (exfat_count_used_clusters(sb, &sbi->used_clusters)) { - mutex_unlock(&sbi->s_lock); - return -EIO; - } - mutex_unlock(&sbi->s_lock); - } - buf->f_type = sb->s_magic; buf->f_bsize = sbi->cluster_size; buf->f_blocks = sbi->num_clusters - 2; /* clu 0 & 1 */ @@ -531,7 +522,6 @@ static int exfat_read_boot_sector(struct super_block *sb) sbi->vol_flags = le16_to_cpu(p_boot->vol_flags); sbi->vol_flags_persistent = sbi->vol_flags & (VOLUME_DIRTY | MEDIA_FAILURE); sbi->clu_srch_ptr = EXFAT_FIRST_CLUSTER; - sbi->used_clusters = EXFAT_CLUSTERS_UNTRACKED; /* check consistencies */ if ((u64)sbi->num_FAT_sectors << p_boot->sect_size_bits < From 1bb7ff4204b6d4927e982cd256286c09ed4fd8ca Mon Sep 17 00:00:00 2001 From: Sungjong Seo Date: Fri, 21 Mar 2025 15:34:42 +0900 Subject: [PATCH 3/7] exfat: fix random stack corruption after get_block When get_block is called with a buffer_head allocated on the stack, such as do_mpage_readpage, stack corruption due to buffer_head UAF may occur in the following race condition situation. mpage_read_folio <> do_mpage_readpage exfat_get_block bh_read __bh_read get_bh(bh) submit_bh wait_on_buffer ... end_buffer_read_sync __end_buffer_read_notouch unlock_buffer <> ... ... ... ... <> . . another_function <> put_bh(bh) atomic_dec(bh->b_count) * stack corruption here * This patch returns -EAGAIN if a folio does not have buffers when bh_read needs to be called. By doing this, the caller can fallback to functions like block_read_full_folio(), create a buffer_head in the folio, and then call get_block again. Let's do not call bh_read() with on-stack buffer_head. Fixes: 11a347fb6cef ("exfat: change to get file size from DataLength") Cc: stable@vger.kernel.org Tested-by: Yeongjin Gil Signed-off-by: Sungjong Seo Reviewed-by: Yuezhang Mo Signed-off-by: Namjae Jeon --- fs/exfat/inode.c | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 96952d4acb50..f3fdba9f4d21 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -344,7 +344,8 @@ static int exfat_get_block(struct inode *inode, sector_t iblock, * The block has been partially written, * zero the unwritten part and map the block. */ - loff_t size, off, pos; + loff_t size, pos; + void *addr; max_blocks = 1; @@ -355,17 +356,41 @@ static int exfat_get_block(struct inode *inode, sector_t iblock, if (!bh_result->b_folio) goto done; + /* + * No buffer_head is allocated. + * (1) bmap: It's enough to fill bh_result without I/O. + * (2) read: The unwritten part should be filled with 0 + * If a folio does not have any buffers, + * let's returns -EAGAIN to fallback to + * per-bh IO like block_read_full_folio(). + */ + if (!folio_buffers(bh_result->b_folio)) { + err = -EAGAIN; + goto done; + } + pos = EXFAT_BLK_TO_B(iblock, sb); size = ei->valid_size - pos; - off = pos & (PAGE_SIZE - 1); + addr = folio_address(bh_result->b_folio) + + offset_in_folio(bh_result->b_folio, pos); + + /* Check if bh->b_data points to proper addr in folio */ + if (bh_result->b_data != addr) { + exfat_fs_error_ratelimit(sb, + "b_data(%p) != folio_addr(%p)", + bh_result->b_data, addr); + err = -EINVAL; + goto done; + } - folio_set_bh(bh_result, bh_result->b_folio, off); + /* Read a block */ err = bh_read(bh_result, 0); if (err < 0) - goto unlock_ret; + goto done; - folio_zero_segment(bh_result->b_folio, off + size, - off + sb->s_blocksize); + /* Zero unwritten part of a block */ + memset(bh_result->b_data + size, 0, + bh_result->b_size - size); } else { /* * The range has not been written, clear the mapped flag @@ -376,6 +401,8 @@ static int exfat_get_block(struct inode *inode, sector_t iblock, } done: bh_result->b_size = EXFAT_BLK_TO_B(max_blocks, sb); + if (err < 0) + clear_buffer_mapped(bh_result); unlock_ret: mutex_unlock(&sbi->s_lock); return err; From b0522303f67255926b946aa66885a0104d1b2980 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Mon, 17 Mar 2025 10:53:10 +0800 Subject: [PATCH 4/7] exfat: fix the infinite loop in exfat_find_last_cluster() In exfat_find_last_cluster(), the cluster chain is traversed until the EOF cluster. If the cluster chain includes a loop due to file system corruption, the EOF cluster cannot be traversed, resulting in an infinite loop. If the number of clusters indicated by the file size is inconsistent with the cluster chain length, exfat_find_last_cluster() will return an error, so if this inconsistency is found, the traversal can be aborted without traversing to the EOF cluster. Reported-by: syzbot+f7d147e6db52b1e09dba@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=f7d147e6db52b1e09dba Tested-by: syzbot+f7d147e6db52b1e09dba@syzkaller.appspotmail.com Fixes: 31023864e67a ("exfat: add fat entry operations") Signed-off-by: Yuezhang Mo Signed-off-by: Namjae Jeon --- fs/exfat/fatent.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index b9473a69f104..23065f948ae7 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -294,7 +294,7 @@ int exfat_find_last_cluster(struct super_block *sb, struct exfat_chain *p_chain, clu = next; if (exfat_ent_get(sb, clu, &next)) return -EIO; - } while (next != EXFAT_EOF_CLUSTER); + } while (next != EXFAT_EOF_CLUSTER && count <= p_chain->size); if (p_chain->size != count) { exfat_fs_error(sb, From 47e35366bc6fa3cf189a8305bce63992495f3efa Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Thu, 6 Mar 2025 15:02:07 +0800 Subject: [PATCH 5/7] exfat: fix missing shutdown check xfstests generic/730 test failed because after deleting the device that still had dirty data, the file could still be read without returning an error. The reason is the missing shutdown check in ->read_iter. I also noticed that shutdown checks were missing from ->write_iter, ->splice_read, and ->mmap. This commit adds shutdown checks to all of them. Fixes: f761fcdd289d ("exfat: Implement sops->shutdown and ioctl") Signed-off-by: Yuezhang Mo Signed-off-by: Namjae Jeon --- fs/exfat/file.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 807349d8ea05..841a5b18e3df 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -582,6 +582,9 @@ static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) loff_t pos = iocb->ki_pos; loff_t valid_size; + if (unlikely(exfat_forced_shutdown(inode->i_sb))) + return -EIO; + inode_lock(inode); valid_size = ei->valid_size; @@ -635,6 +638,16 @@ static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) return ret; } +static ssize_t exfat_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + if (unlikely(exfat_forced_shutdown(inode->i_sb))) + return -EIO; + + return generic_file_read_iter(iocb, iter); +} + static vm_fault_t exfat_page_mkwrite(struct vm_fault *vmf) { int err; @@ -672,14 +685,26 @@ static const struct vm_operations_struct exfat_file_vm_ops = { static int exfat_file_mmap(struct file *file, struct vm_area_struct *vma) { + if (unlikely(exfat_forced_shutdown(file_inode(file)->i_sb))) + return -EIO; + file_accessed(file); vma->vm_ops = &exfat_file_vm_ops; return 0; } +static ssize_t exfat_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, unsigned int flags) +{ + if (unlikely(exfat_forced_shutdown(file_inode(in)->i_sb))) + return -EIO; + + return filemap_splice_read(in, ppos, pipe, len, flags); +} + const struct file_operations exfat_file_operations = { .llseek = generic_file_llseek, - .read_iter = generic_file_read_iter, + .read_iter = exfat_file_read_iter, .write_iter = exfat_file_write_iter, .unlocked_ioctl = exfat_ioctl, #ifdef CONFIG_COMPAT @@ -687,7 +712,7 @@ const struct file_operations exfat_file_operations = { #endif .mmap = exfat_file_mmap, .fsync = exfat_file_fsync, - .splice_read = filemap_splice_read, + .splice_read = exfat_splice_read, .splice_write = iter_file_splice_write, }; From 59c30e31425833385e6644ad33151420e37eabe1 Mon Sep 17 00:00:00 2001 From: Sungjong Seo Date: Wed, 26 Mar 2025 23:48:48 +0900 Subject: [PATCH 6/7] exfat: fix potential wrong error return from get_block If there is no error, get_block() should return 0. However, when bh_read() returns 1, get_block() also returns 1 in the same manner. Let's set err to 0, if there is no error from bh_read() Fixes: 11a347fb6cef ("exfat: change to get file size from DataLength") Cc: stable@vger.kernel.org Signed-off-by: Sungjong Seo Reviewed-by: Yuezhang Mo Signed-off-by: Namjae Jeon --- fs/exfat/inode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index f3fdba9f4d21..a23677de4544 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -391,6 +391,8 @@ static int exfat_get_block(struct inode *inode, sector_t iblock, /* Zero unwritten part of a block */ memset(bh_result->b_data + size, 0, bh_result->b_size - size); + + err = 0; } else { /* * The range has not been written, clear the mapped flag From c73e680d1f84059e1b1ea82a537f6ccc1c563eb4 Mon Sep 17 00:00:00 2001 From: Sungjong Seo Date: Thu, 27 Mar 2025 00:01:16 +0900 Subject: [PATCH 7/7] exfat: call bh_read in get_block only when necessary With commit 11a347fb6cef ("exfat: change to get file size from DataLength"), exfat_get_block() can now handle valid_size. However, most partial unwritten blocks that could be mapped with other blocks are being inefficiently processed separately as individual blocks. Except for partial unwritten blocks that require independent processing, let's handle them simply as before. Signed-off-by: Sungjong Seo Reviewed-by: Yuezhang Mo Signed-off-by: Namjae Jeon --- fs/exfat/inode.c | 159 +++++++++++++++++++++++------------------------ 1 file changed, 77 insertions(+), 82 deletions(-) diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index a23677de4544..b22c02d6000f 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -274,9 +274,11 @@ static int exfat_get_block(struct inode *inode, sector_t iblock, sector_t last_block; sector_t phys = 0; sector_t valid_blks; + loff_t i_size; mutex_lock(&sbi->s_lock); - last_block = EXFAT_B_TO_BLK_ROUND_UP(i_size_read(inode), sb); + i_size = i_size_read(inode); + last_block = EXFAT_B_TO_BLK_ROUND_UP(i_size, sb); if (iblock >= last_block && !create) goto done; @@ -305,102 +307,95 @@ static int exfat_get_block(struct inode *inode, sector_t iblock, if (buffer_delay(bh_result)) clear_buffer_delay(bh_result); - if (create) { + /* + * In most cases, we just need to set bh_result to mapped, unmapped + * or new status as follows: + * 1. i_size == valid_size + * 2. write case (create == 1) + * 3. direct_read (!bh_result->b_folio) + * -> the unwritten part will be zeroed in exfat_direct_IO() + * + * Otherwise, in the case of buffered read, it is necessary to take + * care the last nested block if valid_size is not equal to i_size. + */ + if (i_size == ei->valid_size || create || !bh_result->b_folio) valid_blks = EXFAT_B_TO_BLK_ROUND_UP(ei->valid_size, sb); + else + valid_blks = EXFAT_B_TO_BLK(ei->valid_size, sb); - if (iblock + max_blocks < valid_blks) { - /* The range has been written, map it */ - goto done; - } else if (iblock < valid_blks) { - /* - * The range has been partially written, - * map the written part. - */ - max_blocks = valid_blks - iblock; - goto done; - } + /* The range has been fully written, map it */ + if (iblock + max_blocks < valid_blks) + goto done; - /* The area has not been written, map and mark as new. */ - set_buffer_new(bh_result); + /* The range has been partially written, map the written part */ + if (iblock < valid_blks) { + max_blocks = valid_blks - iblock; + goto done; + } + /* The area has not been written, map and mark as new for create case */ + if (create) { + set_buffer_new(bh_result); ei->valid_size = EXFAT_BLK_TO_B(iblock + max_blocks, sb); mark_inode_dirty(inode); - } else { - valid_blks = EXFAT_B_TO_BLK(ei->valid_size, sb); + goto done; + } - if (iblock + max_blocks < valid_blks) { - /* The range has been written, map it */ - goto done; - } else if (iblock < valid_blks) { - /* - * The area has been partially written, - * map the written part. - */ - max_blocks = valid_blks - iblock; + /* + * The area has just one block partially written. + * In that case, we should read and fill the unwritten part of + * a block with zero. + */ + if (bh_result->b_folio && iblock == valid_blks && + (ei->valid_size & (sb->s_blocksize - 1))) { + loff_t size, pos; + void *addr; + + max_blocks = 1; + + /* + * No buffer_head is allocated. + * (1) bmap: It's enough to set blocknr without I/O. + * (2) read: The unwritten part should be filled with zero. + * If a folio does not have any buffers, + * let's returns -EAGAIN to fallback to + * block_read_full_folio() for per-bh IO. + */ + if (!folio_buffers(bh_result->b_folio)) { + err = -EAGAIN; goto done; - } else if (iblock == valid_blks && - (ei->valid_size & (sb->s_blocksize - 1))) { - /* - * The block has been partially written, - * zero the unwritten part and map the block. - */ - loff_t size, pos; - void *addr; - - max_blocks = 1; - - /* - * For direct read, the unwritten part will be zeroed in - * exfat_direct_IO() - */ - if (!bh_result->b_folio) - goto done; - - /* - * No buffer_head is allocated. - * (1) bmap: It's enough to fill bh_result without I/O. - * (2) read: The unwritten part should be filled with 0 - * If a folio does not have any buffers, - * let's returns -EAGAIN to fallback to - * per-bh IO like block_read_full_folio(). - */ - if (!folio_buffers(bh_result->b_folio)) { - err = -EAGAIN; - goto done; - } + } - pos = EXFAT_BLK_TO_B(iblock, sb); - size = ei->valid_size - pos; - addr = folio_address(bh_result->b_folio) + - offset_in_folio(bh_result->b_folio, pos); + pos = EXFAT_BLK_TO_B(iblock, sb); + size = ei->valid_size - pos; + addr = folio_address(bh_result->b_folio) + + offset_in_folio(bh_result->b_folio, pos); - /* Check if bh->b_data points to proper addr in folio */ - if (bh_result->b_data != addr) { - exfat_fs_error_ratelimit(sb, + /* Check if bh->b_data points to proper addr in folio */ + if (bh_result->b_data != addr) { + exfat_fs_error_ratelimit(sb, "b_data(%p) != folio_addr(%p)", bh_result->b_data, addr); - err = -EINVAL; - goto done; - } - - /* Read a block */ - err = bh_read(bh_result, 0); - if (err < 0) - goto done; + err = -EINVAL; + goto done; + } - /* Zero unwritten part of a block */ - memset(bh_result->b_data + size, 0, - bh_result->b_size - size); + /* Read a block */ + err = bh_read(bh_result, 0); + if (err < 0) + goto done; - err = 0; - } else { - /* - * The range has not been written, clear the mapped flag - * to only zero the cache and do not read from disk. - */ - clear_buffer_mapped(bh_result); - } + /* Zero unwritten part of a block */ + memset(bh_result->b_data + size, 0, bh_result->b_size - size); + err = 0; + goto done; } + + /* + * The area has not been written, clear mapped for read/bmap cases. + * If so, it will be filled with zero without reading from disk. + */ + clear_buffer_mapped(bh_result); done: bh_result->b_size = EXFAT_BLK_TO_B(max_blocks, sb); if (err < 0)