Skip to content

Commit

Permalink
Btrfs: New data=ordered implementation
Browse files Browse the repository at this point in the history
The old data=ordered code would force commit to wait until
all the data extents from the transaction were fully on disk.  This
introduced large latencies into the commit and stalled new writers
in the transaction for a long time.

The new code changes the way data allocations and extents work:

* When delayed allocation is filled, data extents are reserved, and
  the extent bit EXTENT_ORDERED is set on the entire range of the extent.
  A struct btrfs_ordered_extent is allocated an inserted into a per-inode
  rbtree to track the pending extents.

* As each page is written EXTENT_ORDERED is cleared on the bytes corresponding
  to that page.

* When all of the bytes corresponding to a single struct btrfs_ordered_extent
  are written, The previously reserved extent is inserted into the FS
  btree and into the extent allocation trees.  The checksums for the file
  data are also updated.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
  • Loading branch information
Chris Mason committed Sep 25, 2008
1 parent 77a41af commit e6dcd2d
Show file tree
Hide file tree
Showing 14 changed files with 910 additions and 502 deletions.
4 changes: 2 additions & 2 deletions fs/btrfs/btrfs_inode.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

#include "extent_map.h"
#include "extent_io.h"
#include "ordered-data.h"

/* in memory btrfs inode */
struct btrfs_inode {
Expand All @@ -32,9 +33,8 @@ struct btrfs_inode {
struct extent_io_tree io_failure_tree;
struct mutex csum_mutex;
struct inode vfs_inode;
atomic_t ordered_writeback;
struct btrfs_ordered_inode_tree ordered_tree;

u64 ordered_trans;
/*
* transid of the trans_handle that last modified this inode
*/
Expand Down
19 changes: 17 additions & 2 deletions fs/btrfs/ctree.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <linux/fs.h>
#include <linux/completion.h>
#include <linux/backing-dev.h>
#include <linux/wait.h>
#include <asm/kmap_types.h>
#include "bit-radix.h"
#include "extent_io.h"
Expand All @@ -37,6 +38,7 @@ extern struct kmem_cache *btrfs_trans_handle_cachep;
extern struct kmem_cache *btrfs_transaction_cachep;
extern struct kmem_cache *btrfs_bit_radix_cachep;
extern struct kmem_cache *btrfs_path_cachep;
struct btrfs_ordered_sum;

#define BTRFS_MAGIC "_B5RfS_M"

Expand Down Expand Up @@ -510,6 +512,7 @@ struct btrfs_fs_info {
u64 max_inline;
u64 alloc_start;
struct btrfs_transaction *running_transaction;
wait_queue_head_t transaction_throttle;
struct btrfs_super_block super_copy;
struct btrfs_super_block super_for_commit;
struct block_device *__bdev;
Expand Down Expand Up @@ -541,6 +544,7 @@ struct btrfs_fs_info {
*/
struct btrfs_workers workers;
struct btrfs_workers endio_workers;
struct btrfs_workers endio_write_workers;
struct btrfs_workers submit_workers;
struct task_struct *transaction_kthread;
struct task_struct *cleaner_kthread;
Expand Down Expand Up @@ -1384,6 +1388,17 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
u64 owner, u64 owner_offset,
u64 empty_size, u64 hint_byte,
u64 search_end, struct btrfs_key *ins, u64 data);
int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 root_objectid, u64 ref_generation,
u64 owner, u64 owner_offset,
struct btrfs_key *ins);
int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
u64 search_end, struct btrfs_key *ins,
u64 data);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf);
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
Expand Down Expand Up @@ -1556,9 +1571,9 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
u64 bytenr, int mod);
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
struct bio *bio, char *sums);
struct btrfs_ordered_sum *sums);
int btrfs_csum_one_bio(struct btrfs_root *root,
struct bio *bio, char **sums_ret);
struct bio *bio, struct btrfs_ordered_sum **sums_ret);
struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
Expand Down
13 changes: 12 additions & 1 deletion fs/btrfs/disk-io.c
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,11 @@ static int end_workqueue_bio(struct bio *bio,
end_io_wq->error = err;
end_io_wq->work.func = end_workqueue_fn;
end_io_wq->work.flags = 0;
btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);
if (bio->bi_rw & (1 << BIO_RW))
btrfs_queue_worker(&fs_info->endio_write_workers,
&end_io_wq->work);
else
btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);

#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
return 0;
Expand Down Expand Up @@ -1286,6 +1290,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
mutex_init(&fs_info->transaction_kthread_mutex);
mutex_init(&fs_info->cleaner_mutex);
mutex_init(&fs_info->volume_mutex);
init_waitqueue_head(&fs_info->transaction_throttle);

#if 0
ret = add_hasher(fs_info, "crc32c");
Expand Down Expand Up @@ -1325,9 +1330,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size);
btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size);
btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
btrfs_init_workers(&fs_info->endio_write_workers,
fs_info->thread_pool_size);
btrfs_start_workers(&fs_info->workers, 1);
btrfs_start_workers(&fs_info->submit_workers, 1);
btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
btrfs_start_workers(&fs_info->endio_write_workers,
fs_info->thread_pool_size);

err = -EINVAL;
if (btrfs_super_num_devices(disk_super) > fs_devices->open_devices) {
Expand Down Expand Up @@ -1447,6 +1456,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
btrfs_stop_workers(&fs_info->workers);
btrfs_stop_workers(&fs_info->endio_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->submit_workers);
fail_iput:
iput(fs_info->btree_inode);
Expand Down Expand Up @@ -1702,6 +1712,7 @@ int close_ctree(struct btrfs_root *root)

btrfs_stop_workers(&fs_info->workers);
btrfs_stop_workers(&fs_info->endio_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->submit_workers);

iput(fs_info->btree_inode);
Expand Down
132 changes: 93 additions & 39 deletions fs/btrfs/extent-tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -1895,36 +1895,17 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
return ret;
}

/*
* finds a free extent and does all the dirty work required for allocation
* returns the key for the extent through ins, and a tree buffer for
* the first block of the extent through buf.
*
* returns 0 if everything worked, non-zero otherwise.
*/
int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 num_bytes, u64 min_alloc_size,
u64 root_objectid, u64 ref_generation,
u64 owner, u64 owner_offset,
u64 empty_size, u64 hint_byte,
u64 search_end, struct btrfs_key *ins, u64 data)
static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
u64 search_end, struct btrfs_key *ins,
u64 data)
{
int ret;
int pending_ret;
u64 super_used;
u64 root_used;
u64 search_start = 0;
u64 alloc_profile;
u32 sizes[2];
struct btrfs_fs_info *info = root->fs_info;
struct btrfs_root *extent_root = info->extent_root;
struct btrfs_extent_item *extent_item;
struct btrfs_extent_ref *ref;
struct btrfs_path *path;
struct btrfs_key keys[2];

maybe_lock_mutex(root);

if (data) {
alloc_profile = info->avail_data_alloc_bits &
Expand Down Expand Up @@ -1974,11 +1955,48 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
}
if (ret) {
printk("allocation failed flags %Lu\n", data);
}
if (ret) {
BUG();
goto out;
}
clear_extent_dirty(&root->fs_info->free_space_cache,
ins->objectid, ins->objectid + ins->offset - 1,
GFP_NOFS);
return 0;
}

int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
u64 search_end, struct btrfs_key *ins,
u64 data)
{
int ret;
maybe_lock_mutex(root);
ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
empty_size, hint_byte, search_end, ins,
data);
maybe_unlock_mutex(root);
return ret;
}

static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 root_objectid, u64 ref_generation,
u64 owner, u64 owner_offset,
struct btrfs_key *ins)
{
int ret;
int pending_ret;
u64 super_used;
u64 root_used;
u64 num_bytes = ins->offset;
u32 sizes[2];
struct btrfs_fs_info *info = root->fs_info;
struct btrfs_root *extent_root = info->extent_root;
struct btrfs_extent_item *extent_item;
struct btrfs_extent_ref *ref;
struct btrfs_path *path;
struct btrfs_key keys[2];

/* block accounting for super block */
spin_lock_irq(&info->delalloc_lock);
Expand All @@ -1990,21 +2008,13 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
root_used = btrfs_root_used(&root->root_item);
btrfs_set_root_used(&root->root_item, root_used + num_bytes);

clear_extent_dirty(&root->fs_info->free_space_cache,
ins->objectid, ins->objectid + ins->offset - 1,
GFP_NOFS);

if (root == extent_root) {
set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
ins->objectid + ins->offset - 1,
EXTENT_LOCKED, GFP_NOFS);
goto update_block;
}

WARN_ON(trans->alloc_exclude_nr);
trans->alloc_exclude_start = ins->objectid;
trans->alloc_exclude_nr = ins->offset;

memcpy(&keys[0], ins, sizeof(*ins));
keys[1].offset = hash_extent_ref(root_objectid, ref_generation,
owner, owner_offset);
Expand Down Expand Up @@ -2054,6 +2064,51 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
BUG();
}
out:
return ret;
}

int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 root_objectid, u64 ref_generation,
u64 owner, u64 owner_offset,
struct btrfs_key *ins)
{
int ret;
maybe_lock_mutex(root);
ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
ref_generation, owner,
owner_offset, ins);
maybe_unlock_mutex(root);
return ret;
}
/*
* finds a free extent and does all the dirty work required for allocation
* returns the key for the extent through ins, and a tree buffer for
* the first block of the extent through buf.
*
* returns 0 if everything worked, non-zero otherwise.
*/
int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 num_bytes, u64 min_alloc_size,
u64 root_objectid, u64 ref_generation,
u64 owner, u64 owner_offset,
u64 empty_size, u64 hint_byte,
u64 search_end, struct btrfs_key *ins, u64 data)
{
int ret;

maybe_lock_mutex(root);

ret = __btrfs_reserve_extent(trans, root, num_bytes,
min_alloc_size, empty_size, hint_byte,
search_end, ins, data);
BUG_ON(ret);
ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
ref_generation, owner,
owner_offset, ins);
BUG_ON(ret);

maybe_unlock_mutex(root);
return ret;
}
Expand Down Expand Up @@ -2288,8 +2343,8 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
mutex_lock(&root->fs_info->alloc_mutex);

/* we've dropped the lock, double check */
ret = drop_snap_lookup_refcount(root, bytenr,
blocksize, &refs);
ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
&refs);
BUG_ON(ret);
if (refs != 1) {
parent = path->nodes[*level];
Expand Down Expand Up @@ -2584,7 +2639,6 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
kfree(ra);
trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
if (trans) {
btrfs_add_ordered_inode(inode);
btrfs_end_transaction(trans, BTRFS_I(inode)->root);
mark_inode_dirty(inode);
}
Expand Down
Loading

0 comments on commit e6dcd2d

Please sign in to comment.