Skip to content

Commit

Permalink
Btrfs: fix -ENOSPC when finishing block group creation
Browse files Browse the repository at this point in the history
While creating a block group, we often end up getting ENOSPC while updating
the chunk tree, which leads to a transaction abortion that produces a trace
like the following:

[30670.116368] WARNING: CPU: 4 PID: 20735 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x106 [btrfs]()
[30670.117777] BTRFS: Transaction aborted (error -28)
(...)
[30670.163567] Call Trace:
[30670.163906]  [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[30670.164522]  [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[30670.165171]  [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
[30670.166323]  [<ffffffffa035daa7>] ? __btrfs_abort_transaction+0x52/0x106 [btrfs]
[30670.167213]  [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
[30670.167862]  [<ffffffffa035daa7>] __btrfs_abort_transaction+0x52/0x106 [btrfs]
[30670.169116]  [<ffffffffa03743d7>] btrfs_create_pending_block_groups+0x101/0x130 [btrfs]
[30670.170593]  [<ffffffffa038426a>] __btrfs_end_transaction+0x84/0x366 [btrfs]
[30670.171960]  [<ffffffffa038455c>] btrfs_end_transaction+0x10/0x12 [btrfs]
[30670.174649]  [<ffffffffa036eb6b>] btrfs_check_data_free_space+0x11f/0x27c [btrfs]
[30670.176092]  [<ffffffffa039450d>] btrfs_fallocate+0x7c8/0xb96 [btrfs]
[30670.177218]  [<ffffffff812459f2>] ? __this_cpu_preempt_check+0x13/0x15
[30670.178622]  [<ffffffff81152447>] vfs_fallocate+0x14c/0x1de
[30670.179642]  [<ffffffff8116b915>] ? __fget_light+0x2d/0x4f
[30670.180692]  [<ffffffff81152863>] SyS_fallocate+0x47/0x62
[30670.186737]  [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[30670.187792] ---[ end trace 0373e6b491c4a8cc ]---

This is because we don't do proper space reservation for the chunk block
reserve when we have multiple tasks allocating chunks in parallel.

So block group creation has 2 phases, and the first phase essentially
checks if there is enough space in the system space_info, allocating a
new system chunk if there isn't, while the second phase updates the
device, extent and chunk trees. However, because the updates to the
chunk tree happen in the second phase, if we have N tasks, each with
its own transaction handle, allocating new chunks in parallel and if
there is only enough space in the system space_info to allocate M chunks,
where M < N, none of the tasks ends up allocating a new system chunk in
the first phase and N - M tasks will get -ENOSPC when attempting to
update the chunk tree in phase 2 if they need to COW any nodes/leafs
from the chunk tree.

Fix this by doing proper reservation in the chunk block reserve.

The issue could be reproduced by running fstests generic/038 in a loop,
which eventually triggered the problem.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
  • Loading branch information
Filipe Manana authored and Chris Mason committed Jun 3, 2015
1 parent 0d2b237 commit 4fbcdf6
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 2 deletions.
1 change: 1 addition & 0 deletions fs/btrfs/ctree.h
Original file line number Diff line number Diff line change
Expand Up @@ -3458,6 +3458,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct inode *inode);
void btrfs_orphan_release_metadata(struct inode *inode);
Expand Down
44 changes: 42 additions & 2 deletions fs/btrfs/extent-tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -4116,11 +4116,19 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
struct btrfs_space_info *info;
u64 left;
u64 thresh;
int ret = 0;

/*
* Needed because we can end up allocating a system chunk and for an
* atomic and race free space reservation in the chunk block reserve.
*/
ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex));

info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
spin_lock(&info->lock);
left = info->total_bytes - info->bytes_used - info->bytes_pinned -
info->bytes_reserved - info->bytes_readonly;
info->bytes_reserved - info->bytes_readonly -
info->bytes_may_use;
spin_unlock(&info->lock);

thresh = get_system_chunk_thresh(root, type);
Expand All @@ -4134,7 +4142,21 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
u64 flags;

flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
btrfs_alloc_chunk(trans, root, flags);
/*
* Ignore failure to create system chunk. We might end up not
* needing it, as we might not need to COW all nodes/leafs from
* the paths we visit in the chunk tree (they were already COWed
* or created in the current transaction for example).
*/
ret = btrfs_alloc_chunk(trans, root, flags);
}

if (!ret) {
ret = btrfs_block_rsv_add(root->fs_info->chunk_root,
&root->fs_info->chunk_block_rsv,
thresh, BTRFS_RESERVE_NO_FLUSH);
if (!ret)
trans->chunk_bytes_reserved += thresh;
}
}

Expand Down Expand Up @@ -5192,6 +5214,24 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
trans->bytes_reserved = 0;
}

/*
* To be called after all the new block groups attached to the transaction
* handle have been created (btrfs_create_pending_block_groups()).
*/
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->root->fs_info;

if (!trans->chunk_bytes_reserved)
return;

WARN_ON_ONCE(!list_empty(&trans->new_bgs));

block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
trans->chunk_bytes_reserved);
trans->chunk_bytes_reserved = 0;
}

/* Can only return 0 or -ENOSPC */
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct inode *inode)
Expand Down
6 changes: 6 additions & 0 deletions fs/btrfs/transaction.c
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
h->transaction = cur_trans;
h->blocks_used = 0;
h->bytes_reserved = 0;
h->chunk_bytes_reserved = 0;
h->root = root;
h->delayed_ref_updates = 0;
h->use_count = 1;
Expand Down Expand Up @@ -792,6 +793,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans, root);

btrfs_trans_release_chunk_metadata(trans);

if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
should_end_transaction(trans, root) &&
ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
Expand Down Expand Up @@ -2054,6 +2057,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);

btrfs_trans_release_chunk_metadata(trans);

spin_lock(&root->fs_info->trans_lock);
cur_trans->state = TRANS_STATE_UNBLOCKED;
root->fs_info->running_transaction = NULL;
Expand Down Expand Up @@ -2123,6 +2128,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_scrub_continue(root);
cleanup_transaction:
btrfs_trans_release_metadata(trans, root);
btrfs_trans_release_chunk_metadata(trans);
trans->block_rsv = NULL;
if (trans->qgroup_reserved) {
btrfs_qgroup_free(root, trans->qgroup_reserved);
Expand Down
1 change: 1 addition & 0 deletions fs/btrfs/transaction.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ struct btrfs_transaction {
struct btrfs_trans_handle {
u64 transid;
u64 bytes_reserved;
u64 chunk_bytes_reserved;
u64 qgroup_reserved;
unsigned long use_count;
unsigned long blocks_reserved;
Expand Down

0 comments on commit 4fbcdf6

Please sign in to comment.