Skip to content

Commit

Permalink
Btrfs: allow us to overcommit our enospc reservations
Browse files Browse the repository at this point in the history
One of the things that kills us is the fact that our ENOSPC reservations are
horribly over the top in most normal cases.  There isn't too much that can be
done about this because when we are completely full we really need them to work
like this so we don't under reserve.  However, if there are plenty of unallocated
chunks on the disk we can use that to gauge how much we can overcommit.  So this
patch adds chunk free space accounting so we always know how much unallocated
space we have.  Then if we fail to make a reservation within our allocated
space, check to see if we can overcommit.  In the normal flushing case (like
with delalloc metadata reservations) we'll take the free space and divide it by
2 if our metadata profile is set up for DUP, RAID1 or RAID10, and then divide it
by 8 to make sure we don't overcommit too much.  Then if we're in a non-flushing
case (we really need this reservation now!) we only limit ourselves to half of
the free space.  This makes this fio test

[torrent]
filename=torrent-test
rw=randwrite
size=4g
ioengine=sync
directory=/mnt/btrfs-test

go from taking around 45 minutes to 10 seconds on my freshly formatted 3 TiB
file system.  This doesn't seem to break my other enospc tests, but could really
use some more testing as this is a super scary change.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
  • Loading branch information
Josef Bacik committed Oct 19, 2011
1 parent 8f6d7f4 commit 2bf6475
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 18 deletions.
4 changes: 4 additions & 0 deletions fs/btrfs/ctree.h
Original file line number Diff line number Diff line change
Expand Up @@ -893,6 +893,10 @@ struct btrfs_fs_info {
spinlock_t block_group_cache_lock;
struct rb_root block_group_cache_tree;

/* keep track of unallocated space */
spinlock_t free_chunk_lock;
u64 free_chunk_space;

struct extent_io_tree freed_extents[2];
struct extent_io_tree *pinned_extents;

Expand Down
2 changes: 2 additions & 0 deletions fs/btrfs/disk-io.c
Original file line number Diff line number Diff line change
Expand Up @@ -1648,6 +1648,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
spin_lock_init(&fs_info->free_chunk_lock);
mutex_init(&fs_info->reloc_mutex);

init_completion(&fs_info->kobj_unregister);
Expand Down Expand Up @@ -1675,6 +1676,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
fs_info->trans_no_join = 0;
fs_info->free_chunk_space = 0;

fs_info->thread_pool_size = min_t(unsigned long,
num_online_cpus() + 2, 8);
Expand Down
61 changes: 47 additions & 14 deletions fs/btrfs/extent-tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -3410,6 +3410,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
* @block_rsv - the block_rsv we're allocating for
* @orig_bytes - the number of bytes we want
* @flush - whether or not we can flush to make our reservation
* @check - whether this is just to check if we have enough space or not
*
* This will reserve orig_bytes number of bytes from the space info associated
* with the block_rsv. If there is not enough space it will make an attempt to
Expand All @@ -3420,11 +3421,11 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
*/
static int reserve_metadata_bytes(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 orig_bytes, int flush)
u64 orig_bytes, int flush, int check)
{
struct btrfs_space_info *space_info = block_rsv->space_info;
struct btrfs_trans_handle *trans;
u64 unused;
u64 used;
u64 num_bytes = orig_bytes;
int retries = 0;
int ret = 0;
Expand Down Expand Up @@ -3459,9 +3460,9 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
}

ret = -ENOSPC;
unused = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly +
space_info->bytes_may_use;
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly +
space_info->bytes_may_use;

/*
* The idea here is that we've not already over-reserved the block group
Expand All @@ -3470,9 +3471,8 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
* lets start flushing stuff first and then come back and try to make
* our reservation.
*/
if (unused <= space_info->total_bytes) {
unused = space_info->total_bytes - unused;
if (unused >= orig_bytes) {
if (used <= space_info->total_bytes) {
if (used + orig_bytes <= space_info->total_bytes) {
space_info->bytes_may_use += orig_bytes;
ret = 0;
} else {
Expand All @@ -3489,10 +3489,43 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
* amount plus the amount of bytes that we need for this
* reservation.
*/
num_bytes = unused - space_info->total_bytes +
num_bytes = used - space_info->total_bytes +
(orig_bytes * (retries + 1));
}

if (ret && !check) {
u64 profile = btrfs_get_alloc_profile(root, 0);
u64 avail;

spin_lock(&root->fs_info->free_chunk_lock);
avail = root->fs_info->free_chunk_space;

/*
* If we have dup, raid1 or raid10 then only half of the free
* space is actually useable.
*/
if (profile & (BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
avail >>= 1;

/*
* If we aren't flushing, let us overcommit up to 1/2 of the
* space. If we can flush, don't let us overcommit too much, say
* 1/8th of the space.
*/
if (flush)
avail >>= 3;
else
avail >>= 1;
spin_unlock(&root->fs_info->free_chunk_lock);

if (used + orig_bytes < space_info->total_bytes + avail) {
space_info->bytes_may_use += orig_bytes;
ret = 0;
}
}

/*
* Couldn't make our reservation, save our place so while we're trying
* to reclaim space we can actually use it instead of somebody else
Expand Down Expand Up @@ -3703,7 +3736,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
if (num_bytes == 0)
return 0;

ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1, 0);
if (!ret) {
block_rsv_add_bytes(block_rsv, num_bytes, 1);
return 0;
Expand Down Expand Up @@ -3737,7 +3770,7 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
if (!ret)
return 0;

ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush, !flush);
if (!ret) {
block_rsv_add_bytes(block_rsv, num_bytes, 0);
return 0;
Expand Down Expand Up @@ -4037,7 +4070,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
spin_unlock(&BTRFS_I(inode)->lock);

ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush, 0);
if (ret) {
u64 to_free = 0;
unsigned dropped;
Expand Down Expand Up @@ -5692,7 +5725,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
block_rsv = get_block_rsv(trans, root);

if (block_rsv->size == 0) {
ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0);
/*
* If we couldn't reserve metadata bytes try and use some from
* the global reserve.
Expand All @@ -5713,7 +5746,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
return block_rsv;
if (ret) {
WARN_ON(1);
ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0);
if (!ret) {
return block_rsv;
} else if (ret && block_rsv != global_rsv) {
Expand Down
39 changes: 35 additions & 4 deletions fs/btrfs/volumes.c
Original file line number Diff line number Diff line change
Expand Up @@ -1013,8 +1013,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
}
BUG_ON(ret);

if (device->bytes_used > 0)
device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
if (device->bytes_used > 0) {
u64 len = btrfs_dev_extent_length(leaf, extent);
device->bytes_used -= len;
spin_lock(&root->fs_info->free_chunk_lock);
root->fs_info->free_chunk_space += len;
spin_unlock(&root->fs_info->free_chunk_lock);
}
ret = btrfs_del_item(trans, root, path);

out:
Expand Down Expand Up @@ -1356,6 +1361,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (ret)
goto error_undo;

spin_lock(&root->fs_info->free_chunk_lock);
root->fs_info->free_chunk_space = device->total_bytes -
device->bytes_used;
spin_unlock(&root->fs_info->free_chunk_lock);

device->in_fs_metadata = 0;
btrfs_scrub_cancel_dev(root, device);

Expand Down Expand Up @@ -1691,6 +1701,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
root->fs_info->fs_devices->num_can_discard++;
root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;

spin_lock(&root->fs_info->free_chunk_lock);
root->fs_info->free_chunk_space += device->total_bytes;
spin_unlock(&root->fs_info->free_chunk_lock);

if (!blk_queue_nonrot(bdev_get_queue(bdev)))
root->fs_info->fs_devices->rotating = 1;

Expand Down Expand Up @@ -2192,8 +2206,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
lock_chunks(root);

device->total_bytes = new_size;
if (device->writeable)
if (device->writeable) {
device->fs_devices->total_rw_bytes -= diff;
spin_lock(&root->fs_info->free_chunk_lock);
root->fs_info->free_chunk_space -= diff;
spin_unlock(&root->fs_info->free_chunk_lock);
}
unlock_chunks(root);

again:
Expand Down Expand Up @@ -2257,6 +2275,9 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
device->total_bytes = old_size;
if (device->writeable)
device->fs_devices->total_rw_bytes += diff;
spin_lock(&root->fs_info->free_chunk_lock);
root->fs_info->free_chunk_space += diff;
spin_unlock(&root->fs_info->free_chunk_lock);
unlock_chunks(root);
goto done;
}
Expand Down Expand Up @@ -2615,6 +2636,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
index++;
}

spin_lock(&extent_root->fs_info->free_chunk_lock);
extent_root->fs_info->free_chunk_space -= (stripe_size *
map->num_stripes);
spin_unlock(&extent_root->fs_info->free_chunk_lock);

index = 0;
stripe = &chunk->stripe;
while (index < map->num_stripes) {
Expand Down Expand Up @@ -3616,8 +3642,13 @@ static int read_one_dev(struct btrfs_root *root,
fill_device_from_item(leaf, dev_item, device);
device->dev_root = root->fs_info->dev_root;
device->in_fs_metadata = 1;
if (device->writeable)
if (device->writeable) {
device->fs_devices->total_rw_bytes += device->total_bytes;
spin_lock(&root->fs_info->free_chunk_lock);
root->fs_info->free_chunk_space += device->total_bytes -
device->bytes_used;
spin_unlock(&root->fs_info->free_chunk_lock);
}
ret = 0;
return ret;
}
Expand Down

0 comments on commit 2bf6475

Please sign in to comment.