Skip to content

Commit

Permalink
Merge tag 'for-5.14-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Browse files Browse the repository at this point in the history

Pull btrfs zoned mode fixes from David Sterba:

 - fix deadlock when allocating system chunk

 - fix wrong mutex unlock on an error path

 - fix extent map splitting for append operation

 - update and fix message reporting unusable chunk space

 - don't block when background zone reclaim runs with balance in
   parallel

* tag 'for-5.14-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: zoned: fix wrong mutex unlock on failure to allocate log root tree
  btrfs: don't block if we can't acquire the reclaim lock
  btrfs: properly split extent_map for REQ_OP_ZONE_APPEND
  btrfs: rework chunk allocation to avoid exhaustion of the system chunk array
  btrfs: fix deadlock with concurrent chunk allocations involving system chunks
  btrfs: zoned: print unusable percentage when reclaiming block groups
  btrfs: zoned: fix types for u64 division in btrfs_reclaim_bgs_work
  • Loading branch information
Linus Torvalds committed Jul 13, 2021
2 parents 7fef2ed + ea32af4 commit f02bf85
Show file tree
Hide file tree
Showing 9 changed files with 687 additions and 286 deletions.
367 changes: 271 additions & 96 deletions fs/btrfs/block-group.c

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions fs/btrfs/block-group.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ struct btrfs_block_group {
unsigned int removed:1;
unsigned int to_copy:1;
unsigned int relocating_repair:1;
unsigned int chunk_item_inserted:1;

int disk_cache_state;

Expand Down Expand Up @@ -268,8 +269,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work);
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
int btrfs_read_block_groups(struct btrfs_fs_info *info);
int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
u64 type, u64 chunk_offset, u64 size);
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
u64 bytes_used, u64 type,
u64 chunk_offset, u64 size);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
bool do_chunk_alloc);
Expand Down
67 changes: 13 additions & 54 deletions fs/btrfs/ctree.c
Original file line number Diff line number Diff line change
Expand Up @@ -364,49 +364,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
return 0;
}

/*
 * Wrapper around btrfs_alloc_tree_block() that prevents the allocation from
 * finishing the creation of pending block groups when @root is one of the
 * trees that block group creation itself modifies.
 *
 * The new block is always allocated with root->root_key.objectid as owner;
 * all other arguments are forwarded unchanged.  The return value is whatever
 * btrfs_alloc_tree_block() returned.
 */
static struct extent_buffer *alloc_tree_block_no_bg_flush(
		struct btrfs_trans_handle *trans,
		struct btrfs_root *root,
		u64 parent_start,
		const struct btrfs_disk_key *disk_key,
		int level,
		u64 hint,
		u64 empty_size,
		enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *info = root->fs_info;
	struct extent_buffer *eb;
	/*
	 * The extent, chunk, device and free space trees are the ones touched
	 * by btrfs_create_pending_block_groups().
	 *
	 * COWing a node/leaf of one of them may allocate a new chunk, and
	 * finishing a chunk allocation may flush pending block groups.  That
	 * flush would try to COW an extent buffer we already hold locked, so
	 * we would deadlock with ourselves.
	 *
	 * The same applies when splitting a leaf or node of one of those trees
	 * (we hold a write lock on it and on its parent) and when inserting a
	 * new root node for one of them.
	 */
	const int modifies_bg_tree = (root == info->extent_root ||
				      root == info->chunk_root ||
				      root == info->dev_root ||
				      root == info->free_space_root);

	if (modifies_bg_tree)
		trans->can_flush_pending_bgs = false;

	eb = btrfs_alloc_tree_block(trans, root, parent_start,
				    root->root_key.objectid, disk_key, level,
				    hint, empty_size, nest);

	/* Flushing is re-enabled unconditionally, success or failure. */
	trans->can_flush_pending_bgs = true;

	return eb;
}

/*
* does the dirty work in cow of a single block. The parent block (if
* supplied) is updated to point to the new cow copy. The new buffer is marked
Expand Down Expand Up @@ -455,8 +412,9 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
parent_start = parent->start;

cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
level, search_start, empty_size, nest);
cow = btrfs_alloc_tree_block(trans, root, parent_start,
root->root_key.objectid, &disk_key, level,
search_start, empty_size, nest);
if (IS_ERR(cow))
return PTR_ERR(cow);

Expand Down Expand Up @@ -2458,9 +2416,9 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
else
btrfs_node_key(lower, &lower_key, 0);

c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
root->node->start, 0,
BTRFS_NESTING_NEW_ROOT);
c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
&lower_key, level, root->node->start, 0,
BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(c))
return PTR_ERR(c);

Expand Down Expand Up @@ -2589,8 +2547,9 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
mid = (c_nritems + 1) / 2;
btrfs_node_key(c, &disk_key, mid);

split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
c->start, 0, BTRFS_NESTING_SPLIT);
split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
&disk_key, level, c->start, 0,
BTRFS_NESTING_SPLIT);
if (IS_ERR(split))
return PTR_ERR(split);

Expand Down Expand Up @@ -3381,10 +3340,10 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
* BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
* use BTRFS_NESTING_NEW_ROOT.
*/
right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
l->start, 0, num_doubles ?
BTRFS_NESTING_NEW_ROOT :
BTRFS_NESTING_SPLIT);
right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
&disk_key, 0, l->start, 0,
num_doubles ? BTRFS_NESTING_NEW_ROOT :
BTRFS_NESTING_SPLIT);
if (IS_ERR(right))
return PTR_ERR(right);

Expand Down
147 changes: 118 additions & 29 deletions fs/btrfs/inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -2271,13 +2271,127 @@ static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
}

/*
 * Split the extent_map covering the file range [start, start + len) into
 * smaller extent_maps.
 *
 * @inode: inode whose extent map tree and io tree are operated on
 * @start: start of the range to look up
 * @len:   length of the range; the mapping found is asserted to be exactly
 *         this long
 * @pre:   number of bytes to split off the front (0 for no front split)
 * @post:  number of bytes to split off the end (0 for no end split)
 *
 * Resulting layout, as offsets from the start of the original mapping:
 *   pre != 0: [0, pre), [pre, len - post) and, if post != 0, [len - post, len)
 *   pre == 0: [0, len - post), [len - post, len)
 *
 * Returns 0 on success (including the no-op case pre == 0 && post == 0),
 * -ENOMEM if a replacement extent_map cannot be allocated, or -EIO if no
 * mapping is found for the range.
 *
 * This function is intended to be used only for extract_ordered_extent().
 */
static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
u64 pre, u64 post)
{
struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
struct extent_map *split_pre = NULL;
struct extent_map *split_mid = NULL;
struct extent_map *split_post = NULL;
int ret = 0;
int modified;
unsigned long flags;

/* Nothing to split off at either end, so the mapping stays as it is */
if (pre == 0 && post == 0)
return 0;

/*
 * split_pre is always needed because it replaces the original mapping in
 * the tree. split_mid is only used when there is a front split, and
 * split_post only when there is an end split.
 */
split_pre = alloc_extent_map();
if (pre)
split_mid = alloc_extent_map();
if (post)
split_post = alloc_extent_map();
if (!split_pre || (pre && !split_mid) || (post && !split_post)) {
ret = -ENOMEM;
goto out;
}

ASSERT(pre + post < len);

/*
 * Both the io_tree range lock and the extent map tree write lock are held
 * across the whole lookup/replace/insert sequence and released at
 * out_unlock.
 */
lock_extent(&inode->io_tree, start, start + len - 1);
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
if (!em) {
ret = -EIO;
goto out_unlock;
}

ASSERT(em->len == len);
ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);

/*
 * Snapshot the flags before clearing PINNED on the old mapping, so the
 * replacement maps get the original flag set with only LOGGING cleared.
 */
flags = em->flags;
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
clear_bit(EXTENT_FLAG_LOGGING, &flags);
/*
 * Non-zero when the old mapping is linked on a list; forwarded as the
 * "modified" argument when the replacement maps are put into the tree.
 */
modified = !list_empty(&em->list);

/* First, replace the em with a new extent_map starting from em->start */
split_pre->start = em->start;
split_pre->len = (pre ? pre : em->len - post);
split_pre->orig_start = split_pre->start;
split_pre->block_start = em->block_start;
split_pre->block_len = split_pre->len;
split_pre->orig_block_len = split_pre->block_len;
split_pre->ram_bytes = split_pre->len;
split_pre->flags = flags;
split_pre->compress_type = em->compress_type;
split_pre->generation = em->generation;

replace_extent_mapping(em_tree, em, split_pre, modified);

/*
 * Now we only have an extent_map at:
 * [em->start, em->start + pre) if pre != 0
 * [em->start, em->start + em->len - post) if pre == 0
 *
 * em itself is no longer in the tree, but its fields are still read below
 * to position the remaining pieces.
 */

if (pre) {
/* Insert the middle extent_map */
split_mid->start = em->start + pre;
split_mid->len = em->len - pre - post;
split_mid->orig_start = split_mid->start;
split_mid->block_start = em->block_start + pre;
split_mid->block_len = split_mid->len;
split_mid->orig_block_len = split_mid->block_len;
split_mid->ram_bytes = split_mid->len;
split_mid->flags = flags;
split_mid->compress_type = em->compress_type;
split_mid->generation = em->generation;
add_extent_mapping(em_tree, split_mid, modified);
}

if (post) {
/* Insert the trailing extent_map covering the last 'post' bytes */
split_post->start = em->start + em->len - post;
split_post->len = post;
split_post->orig_start = split_post->start;
split_post->block_start = em->block_start + em->len - post;
split_post->block_len = split_post->len;
split_post->orig_block_len = split_post->block_len;
split_post->ram_bytes = split_post->len;
split_post->flags = flags;
split_post->compress_type = em->compress_type;
split_post->generation = em->generation;
add_extent_mapping(em_tree, split_post, modified);
}

/* Once for us (the reference returned by the lookup above) */
free_extent_map(em);
/* Once for the tree (em was replaced by split_pre) */
free_extent_map(em);

out_unlock:
write_unlock(&em_tree->lock);
unlock_extent(&inode->io_tree, start, start + len - 1);
out:
/*
 * Release our handles on the replacement maps. split_mid and split_post
 * (and split_pre on allocation failure) can be NULL here.
 * NOTE(review): relies on free_extent_map() accepting NULL - confirm.
 */
free_extent_map(split_pre);
free_extent_map(split_mid);
free_extent_map(split_post);

return ret;
}

static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
struct bio *bio, loff_t file_offset)
{
struct btrfs_ordered_extent *ordered;
struct extent_map *em = NULL, *em_new = NULL;
struct extent_map_tree *em_tree = &inode->extent_tree;
u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
u64 file_len;
u64 len = bio->bi_iter.bi_size;
u64 end = start + len;
u64 ordered_end;
Expand Down Expand Up @@ -2317,41 +2431,16 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
goto out;
}

file_len = ordered->num_bytes;
pre = start - ordered->disk_bytenr;
post = ordered_end - end;

ret = btrfs_split_ordered_extent(ordered, pre, post);
if (ret)
goto out;

read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, ordered->file_offset, len);
if (!em) {
read_unlock(&em_tree->lock);
ret = -EIO;
goto out;
}
read_unlock(&em_tree->lock);

ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
/*
* We cannot reuse em_new here but have to create a new one, as
* unpin_extent_cache() expects the start of the extent map to be the
* logical offset of the file, which does not hold true anymore after
* splitting.
*/
em_new = create_io_em(inode, em->start + pre, len,
em->start + pre, em->block_start + pre, len,
len, len, BTRFS_COMPRESS_NONE,
BTRFS_ORDERED_REGULAR);
if (IS_ERR(em_new)) {
ret = PTR_ERR(em_new);
goto out;
}
free_extent_map(em_new);
ret = split_zoned_em(inode, file_offset, file_len, pre, post);

out:
free_extent_map(em);
btrfs_put_ordered_extent(ordered);

return errno_to_blk_status(ret);
Expand Down
15 changes: 5 additions & 10 deletions fs/btrfs/transaction.c
Original file line number Diff line number Diff line change
Expand Up @@ -254,23 +254,21 @@ static inline int extwriter_counter_read(struct btrfs_transaction *trans)
}

/*
* To be called after all the new block groups attached to the transaction
* handle have been created (btrfs_create_pending_block_groups()).
* To be called after doing the chunk btree updates right after allocating a new
* chunk (after btrfs_chunk_alloc_add_chunk_item() is called), when removing a
* chunk after all chunk btree updates and after finishing the second phase of
* chunk allocation (btrfs_create_pending_block_groups()) in case some block
* group had its chunk item insertion delayed to the second phase.
*/
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
{
	struct btrfs_transaction *running;

	/* This handle holds no chunk reservation, so there is nothing to undo. */
	if (trans->chunk_bytes_reserved == 0)
		return;

	WARN_ON_ONCE(!list_empty(&trans->new_bgs));

	running = trans->transaction;

	/* Return the handle's bytes to the chunk block reserve. */
	btrfs_block_rsv_release(trans->fs_info,
				&trans->fs_info->chunk_block_rsv,
				trans->chunk_bytes_reserved, NULL);

	/* Update the per-transaction total and wake anyone waiting on it. */
	atomic64_sub(trans->chunk_bytes_reserved,
		     &running->chunk_bytes_reserved);
	cond_wake_up(&running->chunk_reserve_wait);

	trans->chunk_bytes_reserved = 0;
}

Expand Down Expand Up @@ -386,8 +384,6 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info,
spin_lock_init(&cur_trans->dropped_roots_lock);
INIT_LIST_HEAD(&cur_trans->releasing_ebs);
spin_lock_init(&cur_trans->releasing_ebs_lock);
atomic64_set(&cur_trans->chunk_bytes_reserved, 0);
init_waitqueue_head(&cur_trans->chunk_reserve_wait);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
Expand Down Expand Up @@ -701,7 +697,6 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
h->fs_info = root->fs_info;

h->type = type;
h->can_flush_pending_bgs = true;
INIT_LIST_HEAD(&h->new_bgs);

smp_mb();
Expand Down
9 changes: 1 addition & 8 deletions fs/btrfs/transaction.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,13 +96,6 @@ struct btrfs_transaction {

spinlock_t releasing_ebs_lock;
struct list_head releasing_ebs;

/*
* The number of bytes currently reserved, by all transaction handles
* attached to this transaction, for metadata extents of the chunk tree.
*/
atomic64_t chunk_bytes_reserved;
wait_queue_head_t chunk_reserve_wait;
};

#define __TRANS_FREEZABLE (1U << 0)
Expand Down Expand Up @@ -139,7 +132,7 @@ struct btrfs_trans_handle {
short aborted;
bool adding_csums;
bool allocating_chunk;
bool can_flush_pending_bgs;
bool removing_chunk;
bool reloc_reserved;
bool in_fsync;
struct btrfs_root *root;
Expand Down
2 changes: 1 addition & 1 deletion fs/btrfs/tree-log.c
Original file line number Diff line number Diff line change
Expand Up @@ -3173,7 +3173,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
if (!log_root_tree->node) {
ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
if (ret) {
mutex_unlock(&fs_info->tree_log_mutex);
mutex_unlock(&fs_info->tree_root->log_mutex);
goto out;
}
}
Expand Down
Loading

0 comments on commit f02bf85

Please sign in to comment.