Merge tag 'for-5.17-rc6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:
 "A few more fixes for various problems that have user visible effects
  or seem to be urgent:

   - fix corruption when combining DIO and non-blocking io_uring over
     multiple extents (seen on MariaDB)

   - fix relocation crash due to premature return from commit

   - fix quota deadlock between rescan and qgroup removal

   - fix item data bounds checks in tree-checker (found on a fuzzed
     image)

   - fix fsync of prealloc extents after EOF

   - add missing run of delayed items after unlink during log replay

   - don't start relocation until snapshot drop is finished

   - fix reversed condition for subpage writers locking

   - fix warning on page error"

* tag 'for-5.17-rc6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: fallback to blocking mode when doing async dio over multiple extents
  btrfs: add missing run of delayed items after unlink during log replay
  btrfs: qgroup: fix deadlock between rescan worker and remove qgroup
  btrfs: fix relocation crash due to premature return from btrfs_commit_transaction()
  btrfs: do not start relocation until in progress drops are done
  btrfs: tree-checker: use u64 for item data end to avoid overflow
  btrfs: do not WARN_ON() if we have PageError set
  btrfs: fix lost prealloc extents beyond eof after full fsync
  btrfs: subpage: fix a wrong check on subpage->writers
Linus Torvalds committed Mar 6, 2022
2 parents f81664f + ca93e44, commit 3ee65c0
Showing 13 changed files with 230 additions and 28 deletions.
10 changes: 10 additions & 0 deletions fs/btrfs/ctree.h
@@ -602,6 +602,9 @@ enum {
 	/* Indicate that we want the transaction kthread to commit right now. */
 	BTRFS_FS_COMMIT_TRANS,
 
+	/* Indicate we have half completed snapshot deletions pending. */
+	BTRFS_FS_UNFINISHED_DROPS,
+
 #if BITS_PER_LONG == 32
 	/* Indicate if we have error/warn message printed on 32bit systems */
 	BTRFS_FS_32BIT_ERROR,
@@ -1106,8 +1109,15 @@ enum {
 	BTRFS_ROOT_QGROUP_FLUSHING,
 	/* We started the orphan cleanup for this root. */
 	BTRFS_ROOT_ORPHAN_CLEANUP,
+	/* This root has a drop operation that was started previously. */
+	BTRFS_ROOT_UNFINISHED_DROP,
 };
 
+static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
+{
+	clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
+}
+
 /*
  * Record swapped tree blocks of a subvolume tree for delayed subtree trace
  * code. For detail check comment in fs/btrfs/qgroup.c.
10 changes: 10 additions & 0 deletions fs/btrfs/disk-io.c
@@ -3813,6 +3813,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 
 	set_bit(BTRFS_FS_OPEN, &fs_info->flags);
 
+	/* Kick the cleaner thread so it'll start deleting snapshots. */
+	if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
+		wake_up_process(fs_info->cleaner_kthread);
+
 clear_oneshot:
 	btrfs_clear_oneshot_options(fs_info);
 	return 0;
@@ -4538,6 +4542,12 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
 	 */
 	kthread_park(fs_info->cleaner_kthread);
 
+	/*
+	 * If we had UNFINISHED_DROPS we could still be processing them, so
+	 * clear that bit and wake up relocation so it can stop.
+	 */
+	btrfs_wake_unfinished_drop(fs_info);
+
 	/* wait for the qgroup rescan worker to stop */
 	btrfs_qgroup_wait_for_completion(fs_info, false);
 
10 changes: 10 additions & 0 deletions fs/btrfs/extent-tree.c
@@ -5622,6 +5622,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
 	int ret;
 	int level;
 	bool root_dropped = false;
+	bool unfinished_drop = false;
 
 	btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
 
@@ -5664,6 +5665,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
 	 * already dropped.
 	 */
 	set_bit(BTRFS_ROOT_DELETING, &root->state);
+	unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
+
 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
 		level = btrfs_header_level(root->node);
 		path->nodes[level] = btrfs_lock_root_node(root);
@@ -5838,6 +5841,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
 	kfree(wc);
 	btrfs_free_path(path);
 out:
+	/*
+	 * We were an unfinished drop root, check to see if there are any
+	 * pending, and if not clear and wake up any waiters.
+	 */
+	if (!err && unfinished_drop)
+		btrfs_maybe_wake_unfinished_drop(fs_info);
+
 	/*
 	 * So if we need to stop dropping the snapshot for whatever reason we
 	 * need to make sure to add it back to the dead root list so that we
16 changes: 13 additions & 3 deletions fs/btrfs/extent_io.c
@@ -6841,14 +6841,24 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb,
 {
 	struct btrfs_fs_info *fs_info = eb->fs_info;
 
+	/*
+	 * If we are using the commit root we could potentially clear a page
+	 * Uptodate while we're using the extent buffer that we've previously
+	 * looked up. We don't want to complain in this case, as the page was
+	 * valid before, we just didn't write it out. Instead we want to catch
+	 * the case where we didn't actually read the block properly, which
+	 * would have !PageUptodate && !PageError, as we clear PageError before
+	 * reading.
+	 */
 	if (fs_info->sectorsize < PAGE_SIZE) {
-		bool uptodate;
+		bool uptodate, error;
 
 		uptodate = btrfs_subpage_test_uptodate(fs_info, page,
 						       eb->start, eb->len);
-		WARN_ON(!uptodate);
+		error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len);
+		WARN_ON(!uptodate && !error);
 	} else {
-		WARN_ON(!PageUptodate(page));
+		WARN_ON(!PageUptodate(page) && !PageError(page));
 	}
 }
 
28 changes: 28 additions & 0 deletions fs/btrfs/inode.c
@@ -7600,6 +7600,34 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
 	}
 
 	len = min(len, em->len - (start - em->start));
+
+	/*
+	 * If we have a NOWAIT request and the range contains multiple extents
+	 * (or a mix of extents and holes), then we return -EAGAIN to make the
+	 * caller fallback to a context where it can do a blocking (without
+	 * NOWAIT) request. This way we avoid doing partial IO and returning
+	 * success to the caller, which is not optimal for writes and for reads
+	 * it can result in unexpected behaviour for an application.
+	 *
+	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
+	 * iomap_dio_rw(), we can end up returning less data than what the caller
+	 * asked for, resulting in an unexpected, and incorrect, short read.
+	 * That is, the caller asked to read N bytes and we return less than that,
+	 * which is wrong unless we are crossing EOF. This happens if we get a
+	 * page fault error when trying to fault in pages for the buffer that is
+	 * associated to the struct iov_iter passed to iomap_dio_rw(), and we
+	 * have previously submitted bios for other extents in the range, in
+	 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
+	 * those bios have completed by the time we get the page fault error,
+	 * which we return back to our caller - we should only return EIOCBQUEUED
+	 * after we have submitted bios for all the extents in the range.
+	 */
+	if ((flags & IOMAP_NOWAIT) && len < length) {
+		free_extent_map(em);
+		ret = -EAGAIN;
+		goto unlock_err;
+	}
+
 	if (write) {
 		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
 						    start, len);
9 changes: 8 additions & 1 deletion fs/btrfs/qgroup.c
@@ -1196,14 +1196,21 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
 	if (!fs_info->quota_root)
 		goto out;
 
+	/*
+	 * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to
+	 * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs
+	 * to lock that mutex while holding a transaction handle and the rescan
+	 * worker needs to commit a transaction.
+	 */
+	mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
 	/*
 	 * Request qgroup rescan worker to complete and wait for it. This wait
 	 * must be done before transaction start for quota disable since it may
 	 * deadlock with transaction by the qgroup rescan worker.
 	 */
 	clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
 	btrfs_qgroup_wait_for_completion(fs_info, false);
-	mutex_unlock(&fs_info->qgroup_ioctl_lock);
 
 	/*
 	 * 1 For the root item
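
The deadlock being broken here is the classic shape of waiting on a worker while holding a lock that the worker itself must take to finish. A minimal pthread sketch of the corrected ordering, with hypothetical names rather than btrfs code:

/*
 * Toy model of the fix: never wait for a worker while holding a mutex
 * the worker needs. Names are illustrative, not btrfs internals.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ioctl_lock = PTHREAD_MUTEX_INITIALIZER;

static void *rescan_worker(void *arg)
{
	/* The worker's final step needs ioctl_lock. */
	pthread_mutex_lock(&ioctl_lock);
	puts("worker: finished under ioctl_lock");
	pthread_mutex_unlock(&ioctl_lock);
	return NULL;
}

int main(void)
{
	pthread_t worker;

	pthread_mutex_lock(&ioctl_lock);
	pthread_create(&worker, NULL, rescan_worker, NULL);
	/* ... quota-disable setup done under the lock ... */

	/*
	 * Joining here, with ioctl_lock still held, would deadlock: the
	 * worker is blocked on the lock we hold. So drop the lock first,
	 * then wait, mirroring the moved mutex_unlock() above.
	 */
	pthread_mutex_unlock(&ioctl_lock);
	pthread_join(worker, NULL);
	puts("main: worker done, safe to continue");
	return 0;
}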
13 changes: 13 additions & 0 deletions fs/btrfs/relocation.c
@@ -3960,6 +3960,19 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
 	int rw = 0;
 	int err = 0;
 
+	/*
+	 * This only gets set if we had a half-deleted snapshot on mount. We
+	 * cannot allow relocation to start while we're still trying to clean up
+	 * these pending deletions.
+	 */
+	ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE);
+	if (ret)
+		return ret;
+
+	/* We may have been woken up by close_ctree, so bail if we're closing. */
+	if (btrfs_fs_closing(fs_info))
+		return -EINTR;
+
 	bg = btrfs_lookup_block_group(fs_info, group_start);
 	if (!bg)
 		return -ENOENT;
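
wait_on_bit() and clear_and_wake_up_bit() (the latter wrapped by btrfs_wake_unfinished_drop()) form a simple flag-and-wakeup protocol: waiters sleep until the bit is cleared, and whoever clears it wakes them all, which is why close_ctree() can also wake relocation and why the btrfs_fs_closing() recheck follows the wait. A rough userspace analogue, assuming a pthread condition variable in place of the kernel's bit waitqueues:

/*
 * Userspace sketch (not kernel code) of the wait/clear-and-wake pairing:
 * waiters block while the flag is set; the clearing side wakes them all.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool unfinished_drops = true;	/* set at mount if a half-done drop exists */

static void wait_unfinished_drops(void)		/* ~ wait_on_bit() */
{
	pthread_mutex_lock(&lock);
	while (unfinished_drops)
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
}

static void wake_unfinished_drop(void)		/* ~ clear_and_wake_up_bit() */
{
	pthread_mutex_lock(&lock);
	unfinished_drops = false;
	pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
}

static void *relocate(void *arg)
{
	wait_unfinished_drops();	/* don't start until the drops finish */
	puts("relocation: pending drops done, starting");
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, relocate, NULL);
	puts("cleaner: finishing the half-deleted snapshot");
	wake_unfinished_drop();		/* last drop done (or unmount) */
	pthread_join(t, NULL);
	return 0;
}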
15 changes: 15 additions & 0 deletions fs/btrfs/root-tree.c
@@ -278,6 +278,21 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
 
 		WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state));
 		if (btrfs_root_refs(&root->root_item) == 0) {
+			struct btrfs_key drop_key;
+
+			btrfs_disk_key_to_cpu(&drop_key, &root->root_item.drop_progress);
+			/*
+			 * If we have a non-zero drop_progress then we know we
+			 * made it partly through deleting this snapshot, and
+			 * thus we need to make sure we block any balance from
+			 * happening until this snapshot is completely dropped.
+			 */
+			if (drop_key.objectid != 0 || drop_key.type != 0 ||
+			    drop_key.offset != 0) {
+				set_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
+				set_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
+			}
+
 			set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
 			btrfs_add_dead_root(root);
 		}
2 changes: 1 addition & 1 deletion fs/btrfs/subpage.c
@@ -736,7 +736,7 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
 	 * Since we own the page lock, no one else could touch subpage::writers
 	 * and we are safe to do several atomic operations without spinlock.
 	 */
-	if (atomic_read(&subpage->writers))
+	if (atomic_read(&subpage->writers) == 0)
 		/* No writers, locked by plain lock_page() */
 		return unlock_page(page);
 
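
The bug was an inverted predicate: atomic_read() is nonzero exactly when writers exist, so the old test took the plain-lock_page() path in precisely the wrong case. A standalone demonstration with toy values, not btrfs code:

/* Why the reversed check misbehaved; values are illustrative. */
#include <stdatomic.h>
#include <stdio.h>

int main(void)
{
	atomic_int writers = 2;		/* the subpage has active writers */

	if (atomic_load(&writers))	/* old check */
		puts("old check: would unlock_page() despite active writers");

	if (atomic_load(&writers) == 0)	/* fixed check */
		puts("new check: plain lock_page(), just unlock");
	else
		puts("new check: drop a writer, unlock when the count hits zero");
	return 0;
}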
65 changes: 63 additions & 2 deletions fs/btrfs/transaction.c
@@ -854,7 +854,37 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
 static noinline void wait_for_commit(struct btrfs_transaction *commit,
 				     const enum btrfs_trans_state min_state)
 {
-	wait_event(commit->commit_wait, commit->state >= min_state);
+	struct btrfs_fs_info *fs_info = commit->fs_info;
+	u64 transid = commit->transid;
+	bool put = false;
+
+	while (1) {
+		wait_event(commit->commit_wait, commit->state >= min_state);
+		if (put)
+			btrfs_put_transaction(commit);
+
+		if (min_state < TRANS_STATE_COMPLETED)
+			break;
+
+		/*
+		 * A transaction isn't really completed until all of the
+		 * previous transactions are completed, but with fsync we can
+		 * end up with SUPER_COMMITTED transactions before a COMPLETED
+		 * transaction. Wait for those.
+		 */
+
+		spin_lock(&fs_info->trans_lock);
+		commit = list_first_entry_or_null(&fs_info->trans_list,
+						  struct btrfs_transaction,
+						  list);
+		if (!commit || commit->transid > transid) {
+			spin_unlock(&fs_info->trans_lock);
+			break;
+		}
+		refcount_inc(&commit->use_count);
+		put = true;
+		spin_unlock(&fs_info->trans_lock);
+	}
 }
 
 int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
@@ -1319,6 +1349,32 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
 	return 0;
 }
 
+/*
+ * If we had a pending drop we need to see if there are any others left in our
+ * dead roots list, and if not clear our bit and wake any waiters.
+ */
+void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
+{
+	/*
+	 * We put the drop in progress roots at the front of the list, so if the
+	 * first entry doesn't have UNFINISHED_DROP set we can wake everybody
+	 * up.
+	 */
+	spin_lock(&fs_info->trans_lock);
+	if (!list_empty(&fs_info->dead_roots)) {
+		struct btrfs_root *root = list_first_entry(&fs_info->dead_roots,
+							   struct btrfs_root,
+							   root_list);
+		if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) {
+			spin_unlock(&fs_info->trans_lock);
+			return;
+		}
+	}
+	spin_unlock(&fs_info->trans_lock);
+
+	btrfs_wake_unfinished_drop(fs_info);
+}
+
 /*
  * dead roots are old snapshots that need to be deleted. This allocates
  * a dirty root struct and adds it into the list of dead roots that need to
@@ -1331,7 +1387,12 @@ void btrfs_add_dead_root(struct btrfs_root *root)
 	spin_lock(&fs_info->trans_lock);
 	if (list_empty(&root->root_list)) {
 		btrfs_grab_root(root);
-		list_add_tail(&root->root_list, &fs_info->dead_roots);
+
+		/* We want to process the partially complete drops first. */
+		if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state))
+			list_add(&root->root_list, &fs_info->dead_roots);
+		else
+			list_add_tail(&root->root_list, &fs_info->dead_roots);
 	}
 	spin_unlock(&fs_info->trans_lock);
 }
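
Because wait_for_commit() now sleeps on transactions it found by walking a shared list, it must pin each one with refcount_inc() under trans_lock before dropping the lock to sleep, and put the reference after waking. A compact userspace sketch of that pin-before-sleep pattern, with a hypothetical struct standing in for btrfs_transaction:

/*
 * Sketch of "take a reference under the lock, then sleep without it".
 * Types and names are illustrative; completion is polled for brevity
 * where the kernel would use a waitqueue.
 */
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct txn {
	atomic_int use_count;
	atomic_int completed;	/* stand-in for commit->state */
};

static pthread_mutex_t trans_lock = PTHREAD_MUTEX_INITIALIZER;
static struct txn *live_txn;	/* list head, protected by trans_lock */

static void txn_put(struct txn *t)
{
	if (atomic_fetch_sub(&t->use_count, 1) == 1)
		free(t);	/* last reference frees the transaction */
}

static void *committer(void *arg)
{
	struct txn *t = arg;

	atomic_store(&t->completed, 1);		/* finish the commit... */
	pthread_mutex_lock(&trans_lock);
	live_txn = NULL;			/* ...unlink it... */
	pthread_mutex_unlock(&trans_lock);
	txn_put(t);				/* ...drop the list's reference */
	return NULL;
}

int main(void)
{
	struct txn *t = calloc(1, sizeof(*t));
	pthread_t thr;

	atomic_init(&t->use_count, 1);		/* the list's reference */
	live_txn = t;
	pthread_create(&thr, NULL, committer, t);

	pthread_mutex_lock(&trans_lock);
	if (live_txn) {
		t = live_txn;
		atomic_fetch_add(&t->use_count, 1);	/* pin before unlocking */
		pthread_mutex_unlock(&trans_lock);
		while (!atomic_load(&t->completed))	/* ~ wait_event() */
			sched_yield();
		txn_put(t);	/* safe: our pin kept it alive while we slept */
	} else {
		pthread_mutex_unlock(&trans_lock);
	}
	pthread_join(thr, NULL);
	puts("waited on a transaction we pinned first");
	return 0;
}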
1 change: 1 addition & 0 deletions fs/btrfs/transaction.h
@@ -216,6 +216,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid);
 
 void btrfs_add_dead_root(struct btrfs_root *root);
 int btrfs_defrag_root(struct btrfs_root *root);
+void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info);
 int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
 void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);
18 changes: 9 additions & 9 deletions fs/btrfs/tree-checker.c
@@ -1682,6 +1682,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
 	 */
 	for (slot = 0; slot < nritems; slot++) {
 		u32 item_end_expected;
+		u64 item_data_end;
 		int ret;
 
 		btrfs_item_key_to_cpu(leaf, &key, slot);
@@ -1696,6 +1697,8 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
 			return -EUCLEAN;
 		}
 
+		item_data_end = (u64)btrfs_item_offset(leaf, slot) +
+				btrfs_item_size(leaf, slot);
 		/*
 		 * Make sure the offset and ends are right, remember that the
 		 * item data starts at the end of the leaf and grows towards the
@@ -1706,11 +1709,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
 		else
 			item_end_expected = btrfs_item_offset(leaf,
 							      slot - 1);
-		if (unlikely(btrfs_item_data_end(leaf, slot) != item_end_expected)) {
+		if (unlikely(item_data_end != item_end_expected)) {
 			generic_err(leaf, slot,
-				"unexpected item end, have %u expect %u",
-				btrfs_item_data_end(leaf, slot),
-				item_end_expected);
+				"unexpected item end, have %llu expect %u",
+				item_data_end, item_end_expected);
 			return -EUCLEAN;
 		}
 
@@ -1719,12 +1721,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
 		 * just in case all the items are consistent to each other, but
 		 * all point outside of the leaf.
 		 */
-		if (unlikely(btrfs_item_data_end(leaf, slot) >
-			     BTRFS_LEAF_DATA_SIZE(fs_info))) {
+		if (unlikely(item_data_end > BTRFS_LEAF_DATA_SIZE(fs_info))) {
 			generic_err(leaf, slot,
-				"slot end outside of leaf, have %u expect range [0, %u]",
-				btrfs_item_data_end(leaf, slot),
-				BTRFS_LEAF_DATA_SIZE(fs_info));
+				"slot end outside of leaf, have %llu expect range [0, %u]",
+				item_data_end, BTRFS_LEAF_DATA_SIZE(fs_info));
 			return -EUCLEAN;
 		}
 
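
The widening to u64 matters because btrfs_item_offset() and btrfs_item_size() both return u32, so on a fuzzed leaf their sum can wrap modulo 2^32 and slide under the BTRFS_LEAF_DATA_SIZE() check. A standalone demonstration with illustrative constants:

/* u32 wraparound vs u64 math in a bounds check; constants are made up. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t item_offset = 0xffffff00u;	/* bogus on-disk value */
	uint32_t item_size = 0x200u;
	uint32_t leaf_data_size = 16384;	/* typical leaf data size */

	uint32_t end32 = item_offset + item_size;		/* wraps to 0x100 */
	uint64_t end64 = (uint64_t)item_offset + item_size;	/* 0x100000100 */

	printf("u32 end = 0x%" PRIx32 ", end <= %" PRIu32 "? %s\n",
	       end32, leaf_data_size,
	       end32 <= leaf_data_size ? "yes - check bypassed" : "no");
	printf("u64 end = 0x%" PRIx64 ", end <= %" PRIu32 "? %s\n",
	       end64, leaf_data_size,
	       end64 <= leaf_data_size ? "yes" : "no - corrupt item rejected");
	return 0;
}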