Skip to content

Commit

Permalink
btrfs: use a dedicated data structure for chunk maps
Browse files Browse the repository at this point in the history
Currently we abuse the extent_map structure for two purposes:

1) To actually represent extents for inodes;
2) To represent chunk mappings.

This is odd and has several disadvantages:

1) To create a chunk map, we need to do two memory allocations: one for
   an extent_map structure and another one for a map_lookup structure, so
   more potential for an allocation failure and more complicated code to
   manage and link two structures;

2) For a chunk map we actually only use 3 fields (24 bytes) of the
   respective extent map structure: the 'start' field to have the logical
   start address of the chunk, the 'len' field to have the chunk's size,
   and the 'orig_block_len' field to contain the chunk's stripe size.

   Besides wasting a memory, it's also odd and not intuitive at all to
   have the stripe size in a field named 'orig_block_len'.

   We are also using 'block_len' of the extent_map structure to contain
   the chunk size, so we have 2 fields for the same value, 'len' and
   'block_len', which is pointless;

3) When an extent map is associated to a chunk mapping, we set the bit
   EXTENT_FLAG_FS_MAPPING on its flags and then make its member named
   'map_lookup' point to the associated map_lookup structure. This means
   that for an extent map associated to an inode extent, we are not using
   this 'map_lookup' pointer, so wasting 8 bytes (on a 64 bits platform);

4) Extent maps associated to a chunk mapping are never merged or split so
   it's pointless to use the existing extent map infrastructure.

So add a dedicated data structure named 'btrfs_chunk_map' to represent
chunk mappings, this is basically the existing map_lookup structure with
some extra fields:

1) 'start' to contain the chunk logical address;
2) 'chunk_len' to contain the chunk's length;
3) 'stripe_size' for the stripe size;
4) 'rb_node' for insertion into a rb tree;
5) 'refs' for reference counting.

This way we do a single memory allocation for chunk mappings and we don't
waste memory for them with unused/unnecessary fields from an extent_map.

We also save 8 bytes from the extent_map structure by removing the
'map_lookup' pointer, so the size of struct extent_map is reduced from
144 bytes down to 136 bytes, and we can now have 30 extents map per 4K
page instead of 28.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
  • Loading branch information
Filipe Manana authored and David Sterba committed Dec 15, 2023
1 parent ebb0bec commit 7dc66ab
Show file tree
Hide file tree
Showing 17 changed files with 506 additions and 494 deletions.
165 changes: 68 additions & 97 deletions fs/btrfs/block-group.c
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
cache);

kfree(cache->free_space_ctl);
kfree(cache->physical_map);
btrfs_free_chunk_map(cache->physical_map);
kfree(cache);
}
}
Expand Down Expand Up @@ -1047,7 +1047,7 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans,
}

int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
u64 group_start, struct extent_map *em)
struct btrfs_chunk_map *map)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_path *path;
Expand All @@ -1059,10 +1059,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
int index;
int factor;
struct btrfs_caching_control *caching_ctl = NULL;
bool remove_em;
bool remove_map;
bool remove_rsv = false;

block_group = btrfs_lookup_block_group(fs_info, group_start);
block_group = btrfs_lookup_block_group(fs_info, map->start);
BUG_ON(!block_group);
BUG_ON(!block_group->ro);

Expand Down Expand Up @@ -1252,7 +1252,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* entries because we already removed them all when we called
* btrfs_remove_free_space_cache().
*
* And we must not remove the extent map from the fs_info->mapping_tree
* And we must not remove the chunk map from the fs_info->mapping_tree
* to prevent the same logical address range and physical device space
* ranges from being reused for a new block group. This is needed to
* avoid races with trimming and scrub.
Expand All @@ -1268,19 +1268,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* in place until the extents have been discarded completely when
* the transaction commit has completed.
*/
remove_em = (atomic_read(&block_group->frozen) == 0);
remove_map = (atomic_read(&block_group->frozen) == 0);
spin_unlock(&block_group->lock);

if (remove_em) {
struct extent_map_tree *em_tree;

em_tree = &fs_info->mapping_tree;
write_lock(&em_tree->lock);
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
/* once for the tree */
free_extent_map(em);
}
if (remove_map)
btrfs_remove_chunk_map(fs_info, map);

out:
/* Once for the lookup reference */
Expand All @@ -1295,16 +1287,12 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info, const u64 chunk_offset)
{
struct btrfs_root *root = btrfs_block_group_root(fs_info);
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
struct btrfs_chunk_map *map;
unsigned int num_items;

read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, chunk_offset, 1);
read_unlock(&em_tree->lock);
ASSERT(em != NULL);
ASSERT(em->start == chunk_offset);
map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
ASSERT(map != NULL);
ASSERT(map->start == chunk_offset);

/*
* We need to reserve 3 + N units from the metadata space info in order
Expand All @@ -1325,9 +1313,8 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
* more device items and remove one chunk item), but this is done at
* btrfs_remove_chunk() through a call to check_system_chunk().
*/
map = em->map_lookup;
num_items = 3 + map->num_stripes;
free_extent_map(em);
btrfs_free_chunk_map(map);

return btrfs_start_transaction_fallback_global_rsv(root, num_items);
}
Expand Down Expand Up @@ -1928,8 +1915,7 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
struct btrfs_path *path)
{
struct extent_map_tree *em_tree;
struct extent_map *em;
struct btrfs_chunk_map *map;
struct btrfs_block_group_item bg;
struct extent_buffer *leaf;
int slot;
Expand All @@ -1939,40 +1925,37 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
slot = path->slots[0];
leaf = path->nodes[0];

em_tree = &fs_info->mapping_tree;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
read_unlock(&em_tree->lock);
if (!em) {
map = btrfs_find_chunk_map(fs_info, key->objectid, key->offset);
if (!map) {
btrfs_err(fs_info,
"logical %llu len %llu found bg but no related chunk",
key->objectid, key->offset);
return -ENOENT;
}

if (em->start != key->objectid || em->len != key->offset) {
if (map->start != key->objectid || map->chunk_len != key->offset) {
btrfs_err(fs_info,
"block group %llu len %llu mismatch with chunk %llu len %llu",
key->objectid, key->offset, em->start, em->len);
key->objectid, key->offset, map->start, map->chunk_len);
ret = -EUCLEAN;
goto out_free_em;
goto out_free_map;
}

read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
sizeof(bg));
flags = btrfs_stack_block_group_flags(&bg) &
BTRFS_BLOCK_GROUP_TYPE_MASK;

if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(fs_info,
"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
key->objectid, key->offset, flags,
(BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
(BTRFS_BLOCK_GROUP_TYPE_MASK & map->type));
ret = -EUCLEAN;
}

out_free_em:
free_extent_map(em);
out_free_map:
btrfs_free_chunk_map(map);
return ret;
}

Expand Down Expand Up @@ -2025,23 +2008,21 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
u64 physical, u64 **logical, int *naddrs, int *stripe_len)
{
struct extent_map *em;
struct map_lookup *map;
struct btrfs_chunk_map *map;
u64 *buf;
u64 bytenr;
u64 data_stripe_length;
u64 io_stripe_size;
int i, nr = 0;
int ret = 0;

em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
if (IS_ERR(em))
map = btrfs_get_chunk_map(fs_info, chunk_start, 1);
if (IS_ERR(map))
return -EIO;

map = em->map_lookup;
data_stripe_length = em->orig_block_len;
data_stripe_length = map->stripe_size;
io_stripe_size = BTRFS_STRIPE_LEN;
chunk_start = em->start;
chunk_start = map->start;

/* For RAID5/6 adjust to a full IO stripe length */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
Expand Down Expand Up @@ -2095,7 +2076,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
*naddrs = nr;
*stripe_len = io_stripe_size;
out:
free_extent_map(em);
btrfs_free_chunk_map(map);
return ret;
}

Expand Down Expand Up @@ -2200,49 +2181,47 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
*/
static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
{
struct extent_map_tree *map_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct btrfs_block_group *bg;
u64 start = 0;
int ret = 0;

while (1) {
read_lock(&map_tree->lock);
struct btrfs_chunk_map *map;
struct btrfs_block_group *bg;

/*
* lookup_extent_mapping will return the first extent map
* intersecting the range, so setting @len to 1 is enough to
* btrfs_find_chunk_map() will return the first chunk map
* intersecting the range, so setting @length to 1 is enough to
* get the first chunk.
*/
em = lookup_extent_mapping(map_tree, start, 1);
read_unlock(&map_tree->lock);
if (!em)
map = btrfs_find_chunk_map(fs_info, start, 1);
if (!map)
break;

bg = btrfs_lookup_block_group(fs_info, em->start);
bg = btrfs_lookup_block_group(fs_info, map->start);
if (!bg) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu doesn't have corresponding block group",
em->start, em->len);
map->start, map->chunk_len);
ret = -EUCLEAN;
free_extent_map(em);
btrfs_free_chunk_map(map);
break;
}
if (bg->start != em->start || bg->length != em->len ||
if (bg->start != map->start || bg->length != map->chunk_len ||
(bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
(em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
(map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
em->start, em->len,
em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
map->start, map->chunk_len,
map->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
bg->start, bg->length,
bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
ret = -EUCLEAN;
free_extent_map(em);
btrfs_free_chunk_map(map);
btrfs_put_block_group(bg);
break;
}
start = em->start + em->len;
free_extent_map(em);
start = map->start + map->chunk_len;
btrfs_free_chunk_map(map);
btrfs_put_block_group(bg);
}
return ret;
Expand Down Expand Up @@ -2370,28 +2349,25 @@ static int read_one_block_group(struct btrfs_fs_info *info,

static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
{
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct rb_node *node;
int ret = 0;

for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
struct extent_map *em;
struct map_lookup *map;
for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
struct btrfs_chunk_map *map;
struct btrfs_block_group *bg;

em = rb_entry(node, struct extent_map, rb_node);
map = em->map_lookup;
bg = btrfs_create_block_group_cache(fs_info, em->start);
map = rb_entry(node, struct btrfs_chunk_map, rb_node);
bg = btrfs_create_block_group_cache(fs_info, map->start);
if (!bg) {
ret = -ENOMEM;
break;
}

/* Fill dummy cache as FULL */
bg->length = em->len;
bg->length = map->chunk_len;
bg->flags = map->type;
bg->cached = BTRFS_CACHE_FINISHED;
bg->used = em->len;
bg->used = map->chunk_len;
bg->flags = map->type;
ret = btrfs_add_block_group_cache(fs_info, bg);
/*
Expand Down Expand Up @@ -2619,19 +2595,17 @@ static int insert_dev_extents(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_device *device;
struct extent_map *em;
struct map_lookup *map;
struct btrfs_chunk_map *map;
u64 dev_offset;
u64 stripe_size;
int i;
int ret = 0;

em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
if (IS_ERR(em))
return PTR_ERR(em);
map = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
if (IS_ERR(map))
return PTR_ERR(map);

map = em->map_lookup;
stripe_size = em->orig_block_len;
stripe_size = map->stripe_size;

/*
* Take the device list mutex to prevent races with the final phase of
Expand All @@ -2654,7 +2628,7 @@ static int insert_dev_extents(struct btrfs_trans_handle *trans,
}
mutex_unlock(&fs_info->fs_devices->device_list_mutex);

free_extent_map(em);
btrfs_free_chunk_map(map);
return ret;
}

Expand Down Expand Up @@ -4407,8 +4381,6 @@ void btrfs_freeze_block_group(struct btrfs_block_group *cache)
void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct extent_map_tree *em_tree;
struct extent_map *em;
bool cleanup;

spin_lock(&block_group->lock);
Expand All @@ -4417,17 +4389,16 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
spin_unlock(&block_group->lock);

if (cleanup) {
em_tree = &fs_info->mapping_tree;
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, block_group->start,
1);
BUG_ON(!em); /* logic error, can't happen */
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);

/* once for us and once for the tree */
free_extent_map(em);
free_extent_map(em);
struct btrfs_chunk_map *map;

map = btrfs_find_chunk_map(fs_info, block_group->start, 1);
/* Logic error, can't happen. */
ASSERT(map);

btrfs_remove_chunk_map(fs_info, map);

/* Once for our lookup reference. */
btrfs_free_chunk_map(map);

/*
* We may have left one free space entry and other possible
Expand Down
6 changes: 4 additions & 2 deletions fs/btrfs/block-group.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

#include "free-space-cache.h"

struct btrfs_chunk_map;

enum btrfs_disk_cache_state {
BTRFS_DC_WRITTEN,
BTRFS_DC_ERROR,
Expand Down Expand Up @@ -243,7 +245,7 @@ struct btrfs_block_group {
u64 zone_unusable;
u64 zone_capacity;
u64 meta_write_pointer;
struct map_lookup *physical_map;
struct btrfs_chunk_map *physical_map;
struct list_head active_bg_list;
struct work_struct zone_finish_work;
struct extent_buffer *last_eb;
Expand Down Expand Up @@ -297,7 +299,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info,
const u64 chunk_offset);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
u64 group_start, struct extent_map *em);
struct btrfs_chunk_map *map);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
void btrfs_reclaim_bgs_work(struct work_struct *work);
Expand Down
Loading

0 comments on commit 7dc66ab

Please sign in to comment.