btrfs: reserve space for delayed refs on a per ref basis
Currently when reserving space for delayed refs we do it on a per ref head
basis. This is generally enough because most back refs for an extent end
up being inlined in the extent item - with the default leaf size of 16K we
can have at most 33 inline back refs (this is calculated by the macro
BTRFS_MAX_EXTENT_ITEM_SIZE()). The number of bytes reserved for each ref
head is given by btrfs_calc_delayed_ref_bytes(), which basically
corresponds to a single path for insertion into the extent tree plus
another path for insertion into the free space tree if it's enabled.
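
For reference, here is a minimal sketch of how such a per-item reservation
can be computed, loosely mirroring btrfs_calc_delayed_ref_bytes(). It
assumes the btrfs internal helpers btrfs_calc_insert_metadata_size() and
btrfs_fs_compat_ro() and is an approximation, not the authoritative
implementation:

/*
 * Sketch: reserve one full insertion path in the extent tree per item,
 * and double it when the free space tree is enabled so that its updates
 * are covered as well.
 */
static u64 sketch_delayed_ref_bytes(const struct btrfs_fs_info *fs_info,
				    int num_delayed_refs)
{
	/* Roughly 2 * nodesize * BTRFS_MAX_LEVEL bytes per inserted item. */
	u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info,
							num_delayed_refs);

	/* Add a second path per item for the free space tree, if enabled. */
	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
		num_bytes <<= 1;

	return num_bytes;
}

With the default 16K nodesize this works out to roughly 2 * 16K * 8 = 256K
per item, or twice that when the free space tree is enabled.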

However, if we have reached the limit of inline refs or we have a mix of
inline and non-inline refs, then we will need to insert a non-inline ref
and update the existing extent item to update the total number of
references for the extent. This implies we need reserved space for two
insertion paths in the extent tree, but we only reserved for one path.
The extent item and the non-inline ref item may be located in different
leaves, or even if they are located in the same leaf, after updating the
extent item and before inserting the non-inline ref item, the extent
buffers in the btree path may have been written (due to memory pressure,
for example), in which case we need to COW the entire path again. In that
case, since we have not reserved enough space in the delayed refs block
reserve, we will end up using the global block reserve.
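
To make the shortfall concrete, here is an illustrative helper (not kernel
code) that contrasts what used to be reserved for a whole ref head with
what a head carrying several refs may actually need; it reuses the
hypothetical sketch_delayed_ref_bytes() from the sketch above:

/*
 * Illustration only: with per-head reservation, a head with many refs
 * had space reserved for a single insertion path, while refs that end
 * up as non-inline items may each need to COW an extra extent tree path.
 */
static void sketch_show_shortfall(const struct btrfs_fs_info *fs_info,
				  int num_refs)
{
	u64 old_rsv = sketch_delayed_ref_bytes(fs_info, 1);        /* per head */
	u64 new_rsv = sketch_delayed_ref_bytes(fs_info, num_refs); /* per ref */

	/* The difference used to spill into the global block reserve. */
	pr_info("head may need up to %llu bytes, had %llu reserved\n",
		new_rsv, old_rsv);
}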

If we are in a situation where the fs no longer has enough unallocated
space to allocate a new metadata block group, and the available space in
the existing metadata block groups is close to the maximum size of the
global block reserve (512M), we may end up consuming so much of the free
metadata space that we can't commit any future transaction: the commit
will fail with -ENOSPC when trying to allocate an extent for some COW
operation (running delayed refs generated by running delayed refs, or
COWing the root tree's root node at commit_cowonly_roots(), for example).
Such a dramatic scenario can happen if we have many delayed refs that
require the insertion of non-inline ref items, due to too many reflinks
or snapshots. We also have situations where we use the global block
reserve because we could not know in advance that we would need space to
update some trees (block group creation, for example), so all of this
adds up and increases the chances of exhausting the global block reserve,
making any future transaction commit fail with -ENOSPC and turning the fs
into RO mode, or failing the mount operation in case the mount needs to
start and commit a transaction, such as when there are orphans to clean
up. Such a case was reported and hit by someone running a SLE (SUSE Linux
Enterprise) distribution: the fs had no more unallocated space that could
be used to allocate a new metadata block group, and the available
metadata space was about 1.5M, not enough to commit a transaction to
clean up an orphan inode (or do relocation of data block groups that were
far from being full).

So reserve space for delayed refs by individual refs and not by ref heads,
as we may need to COW multiple extent tree paths due to non-inline ref
items.
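
In terms of accounting, the change boils down to counting one reserved item
per individual ref when it is queued, and releasing one item whenever a ref
is run, merged away or dropped, while csum deletion items stay accounted
per ref head. A simplified paraphrase of what the patch does (not verbatim
kernel code) looks like this:

/* Queueing an individual delayed ref (see insert_delayed_ref()). */
static void sketch_account_one_ref(struct btrfs_trans_handle *trans)
{
	/* One more item to reserve space for in the delayed refs rsv. */
	trans->delayed_ref_updates++;
}

/*
 * A ref was run, merged or dropped (see drop_delayed_ref() and
 * btrfs_run_delayed_refs_for_head()).
 */
static void sketch_release_one_ref(struct btrfs_fs_info *fs_info)
{
	/* Give back the space reserved for that single ref. */
	btrfs_delayed_refs_rsv_release(fs_info, 1);
}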

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Filipe Manana authored and David Sterba committed Oct 12, 2023
1 parent 8a526c4 commit 3ee56a5
Showing 3 changed files with 38 additions and 24 deletions.
32 changes: 22 additions & 10 deletions fs/btrfs/delayed-ref.c
@@ -422,7 +422,8 @@ int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
return 0;
}

static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs,
static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_ref_node *ref)
{
@@ -433,9 +434,11 @@ static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs,
list_del(&ref->add_list);
btrfs_put_delayed_ref(ref);
atomic_dec(&delayed_refs->num_entries);
btrfs_delayed_refs_rsv_release(fs_info, 1);
}

static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs,
static bool merge_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_ref_node *ref,
u64 seq)
@@ -464,10 +467,10 @@ static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs,
mod = -next->ref_mod;
}

drop_delayed_ref(delayed_refs, head, next);
drop_delayed_ref(fs_info, delayed_refs, head, next);
ref->ref_mod += mod;
if (ref->ref_mod == 0) {
drop_delayed_ref(delayed_refs, head, ref);
drop_delayed_ref(fs_info, delayed_refs, head, ref);
done = true;
} else {
/*
@@ -505,7 +508,7 @@ void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info,
ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
if (seq && ref->seq >= seq)
continue;
if (merge_ref(delayed_refs, head, ref, seq))
if (merge_ref(fs_info, delayed_refs, head, ref, seq))
goto again;
}
}
@@ -584,10 +587,11 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
* Return true if the ref was merged into an existing one (and therefore can be
* freed by the caller).
*/
static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root,
static bool insert_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *ref)
{
struct btrfs_delayed_ref_root *root = &trans->transaction->delayed_refs;
struct btrfs_delayed_ref_node *exist;
int mod;

@@ -598,6 +602,7 @@ static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root,
list_add_tail(&ref->add_list, &href->ref_add_list);
atomic_inc(&root->num_entries);
spin_unlock(&href->lock);
trans->delayed_ref_updates++;
return false;
}

@@ -626,7 +631,7 @@ static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root,

/* remove existing tail if its ref_mod is zero */
if (exist->ref_mod == 0)
drop_delayed_ref(root, href, exist);
drop_delayed_ref(trans->fs_info, root, href, exist);
spin_unlock(&href->lock);
return true;
}
@@ -695,6 +700,8 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
/*
* If we are going from a positive ref mod to a negative or vice versa
* we need to make sure to adjust pending_csums accordingly.
* We reserve bytes for csum deletion when adding or updating a ref head,
* see add_delayed_ref_head() for more details.
*/
if (existing->is_data) {
u64 csum_leaves =
@@ -819,6 +826,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
head_ref = existing;
} else {
/*
* We reserve the amount of bytes needed to delete csums when
* adding the ref head and not when adding individual drop refs
* since the csum items are deleted only after running the last
* delayed drop ref (the data extent's ref count drops to 0).
*/
if (head_ref->is_data && head_ref->ref_mod < 0) {
delayed_refs->pending_csums += head_ref->num_bytes;
trans->delayed_ref_updates +=
@@ -828,7 +841,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
atomic_inc(&delayed_refs->num_entries);
trans->delayed_ref_updates++;
}
if (qrecord_inserted_ret)
*qrecord_inserted_ret = qrecord_inserted;
@@ -958,7 +970,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
head_ref = add_delayed_ref_head(trans, head_ref, record,
action, &qrecord_inserted);

merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node);
merged = insert_delayed_ref(trans, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);

/*
@@ -1050,7 +1062,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
head_ref = add_delayed_ref_head(trans, head_ref, record,
action, &qrecord_inserted);

merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node);
merged = insert_delayed_ref(trans, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);

/*
1 change: 1 addition & 0 deletions fs/btrfs/disk-io.c
@@ -4563,6 +4563,7 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
list_del(&ref->add_list);
atomic_dec(&delayed_refs->num_entries);
btrfs_put_delayed_ref(ref);
btrfs_delayed_refs_rsv_release(fs_info, 1);
}
if (head->must_insert_reserved)
pin_bytes = true;
29 changes: 15 additions & 14 deletions fs/btrfs/extent-tree.c
@@ -1819,22 +1819,24 @@ u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head)
{
int nr_items = 1; /* Dropping this ref head update. */

/*
* We had csum deletions accounted for in our delayed refs rsv, we need
* to drop the csum leaves for this update from our delayed_refs_rsv.
*/
if (head->total_ref_mod < 0 && head->is_data) {
int nr_items;

spin_lock(&delayed_refs->lock);
delayed_refs->pending_csums -= head->num_bytes;
spin_unlock(&delayed_refs->lock);
nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
}
nr_items = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);

btrfs_delayed_refs_rsv_release(fs_info, nr_items);

btrfs_delayed_refs_rsv_release(fs_info, nr_items);
return btrfs_calc_delayed_ref_bytes(fs_info, nr_items);
}

return btrfs_calc_delayed_ref_bytes(fs_info, nr_items);
return 0;
}

static int cleanup_ref_head(struct btrfs_trans_handle *trans,
@@ -1884,7 +1886,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
}
}

*bytes_released = btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
*bytes_released += btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);

trace_run_delayed_ref_head(fs_info, head, 0);
btrfs_delayed_ref_unlock(head);
@@ -1926,7 +1928,8 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
}

static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *locked_ref)
struct btrfs_delayed_ref_head *locked_ref,
u64 *bytes_released)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -1982,7 +1985,8 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,

ret = run_one_delayed_ref(trans, ref, extent_op,
must_insert_reserved);

btrfs_delayed_refs_rsv_release(fs_info, 1);
*bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1);
btrfs_free_delayed_extent_op(extent_op);
if (ret) {
unselect_delayed_ref_head(delayed_refs, locked_ref);
@@ -2048,22 +2052,19 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
spin_lock(&locked_ref->lock);
btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);

ret = btrfs_run_delayed_refs_for_head(trans, locked_ref);
ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &bytes_processed);
if (ret < 0 && ret != -EAGAIN) {
/*
* Error, btrfs_run_delayed_refs_for_head already
* unlocked everything so just bail out
*/
return ret;
} else if (!ret) {
u64 bytes_released = 0;

/*
* Success, perform the usual cleanup of a processed
* head
*/
ret = cleanup_ref_head(trans, locked_ref, &bytes_released);
bytes_processed += bytes_released;
ret = cleanup_ref_head(trans, locked_ref, &bytes_processed);
if (ret > 0 ) {
/* We dropped our lock, we need to loop. */
ret = 0;
