Skip to content

Commit

Permalink
Btrfs: Fix streaming read performance with checksumming on
Browse files Browse the repository at this point in the history
Large streaming reads make for large bios, which means each entry on the
async work queue lists represents a large amount of data.  IO
congestion throttling on the device was kicking in before the async
worker threads decided a single thread was busy and needed some help.

The end result was that a streaming read would result in a single CPU
running at 100% instead of balancing the work off to other CPUs.

This patch also changes the pre-IO checksum lookup done by reads to
work on a per-bio basis instead of per-page.  The per-page lookups resulted
in many extra btree lookups on large streaming reads.  Doing the checksum lookup
right before bio submit allows us to reuse searches while processing
adjacent offsets.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
  • Loading branch information
Chris Mason committed Sep 25, 2008
1 parent 37d1aee commit 61b4944
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 54 deletions.
2 changes: 1 addition & 1 deletion fs/btrfs/async-thread.c
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, int max)
INIT_LIST_HEAD(&workers->idle_list);
spin_lock_init(&workers->lock);
workers->max_workers = max;
workers->idle_thresh = 64;
workers->idle_thresh = 32;
}

/*
Expand Down
2 changes: 2 additions & 0 deletions fs/btrfs/ctree.h
Original file line number Diff line number Diff line change
Expand Up @@ -1613,6 +1613,8 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
struct btrfs_key *location, int mod);

/* file-item.c */
/*
 * Look up the data checksum for each bio_vec of @bio (belonging to @inode)
 * and record it in the inode's io_tree with set_state_private(), keyed by
 * the file offset of that bio_vec.
 */
int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
struct bio *bio);
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 pos, u64 disk_offset,
Expand Down
15 changes: 15 additions & 0 deletions fs/btrfs/disk-io.c
Original file line number Diff line number Diff line change
Expand Up @@ -1357,10 +1357,25 @@ struct btrfs_root *open_ctree(struct super_block *sb,
*/
btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size);
btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size);

/* a higher idle thresh on the submit workers makes it much more
* likely that bios will be sent down in a sane order to the
* devices
*/
fs_info->submit_workers.idle_thresh = 64;

btrfs_init_workers(&fs_info->fixup_workers, 1);
btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
btrfs_init_workers(&fs_info->endio_write_workers,
fs_info->thread_pool_size);

/*
* endios are largely parallel and should have a very
* low idle thresh
*/
fs_info->endio_workers.idle_thresh = 4;
fs_info->endio_write_workers.idle_thresh = 4;

btrfs_start_workers(&fs_info->workers, 1);
btrfs_start_workers(&fs_info->submit_workers, 1);
btrfs_start_workers(&fs_info->fixup_workers, 1);
Expand Down
77 changes: 77 additions & 0 deletions fs/btrfs/file-item.c
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,83 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
return ret;
}

int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
struct bio *bio)
{
u32 sum;
struct bio_vec *bvec = bio->bi_io_vec;
int bio_index = 0;
u64 offset;
u64 item_start_offset = 0;
u64 item_last_offset = 0;
u32 diff;
int ret;
struct btrfs_path *path;
struct btrfs_csum_item *item = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;

path = btrfs_alloc_path();
path->reada = 2;

WARN_ON(bio->bi_vcnt <= 0);

while(bio_index < bio->bi_vcnt) {
offset = page_offset(bvec->bv_page) + bvec->bv_offset;
ret = btrfs_find_ordered_sum(inode, offset, &sum);
if (ret == 0)
goto found;

if (!item || offset < item_start_offset ||
offset >= item_last_offset) {
struct btrfs_key found_key;
u32 item_size;

if (item)
btrfs_release_path(root, path);
item = btrfs_lookup_csum(NULL, root, path,
inode->i_ino, offset, 0);
if (IS_ERR(item)) {
ret = PTR_ERR(item);
if (ret == -ENOENT || ret == -EFBIG)
ret = 0;
sum = 0;
printk("no csum found for inode %lu start "
"%llu\n", inode->i_ino,
(unsigned long long)offset);
goto found;
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);

item_start_offset = found_key.offset;
item_size = btrfs_item_size_nr(path->nodes[0],
path->slots[0]);
item_last_offset = item_start_offset +
(item_size / BTRFS_CRC32_SIZE) *
root->sectorsize;
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_csum_item);
}
/*
* this byte range must be able to fit inside
* a single leaf so it will also fit inside a u32
*/
diff = offset - item_start_offset;
diff = diff / root->sectorsize;
diff = diff * BTRFS_CRC32_SIZE;

read_extent_buffer(path->nodes[0], &sum,
(unsigned long)item + diff,
BTRFS_CRC32_SIZE);
found:
set_state_private(io_tree, offset, sum);
bio_index++;
bvec++;
}
btrfs_free_path(path);
return 0;
}

int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
struct bio *bio)
{
Expand Down
57 changes: 4 additions & 53 deletions fs/btrfs/inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,10 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
BUG_ON(ret);

if (!(rw & (1 << BIO_RW))) {
if (!btrfs_test_opt(root, NODATASUM) &&
!btrfs_test_flag(inode, NODATASUM)) {
btrfs_lookup_bio_sums(root, inode, bio);
}
goto mapit;
}

Expand Down Expand Up @@ -598,58 +602,6 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
return btrfs_finish_ordered_io(page->mapping->host, start, end);
}

/*
 * Pre-read hook: find the data checksum for the block starting at @start in
 * the file backing @page and record it in the inode's io_tree with
 * set_state_private().
 *
 * Does nothing (returns 0) when data checksumming is disabled for the root
 * or the inode.
 *
 * Returns 0 on success or when no csum exists for the range, -ENOMEM if the
 * btrfs_path cannot be allocated, or another negative error from
 * btrfs_lookup_csum().
 */
int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
{
	int ret = 0;
	struct inode *inode = page->mapping->host;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_csum_item *item;
	struct btrfs_path *path = NULL;
	u32 csum;

	if (btrfs_test_opt(root, NODATASUM) ||
	    btrfs_test_flag(inode, NODATASUM))
		return 0;

	/*
	 * It is possible there is an ordered extent that has
	 * not yet finished for this range in the file. If so,
	 * that extent will have a csum cached, and it will insert
	 * the sum after all the blocks in the extent are fully
	 * on disk. So, look for an ordered extent and use the
	 * sum if found. We have to do this before looking in the
	 * btree because csum items are pre-inserted based on
	 * the file size. btrfs_lookup_csum might find an item
	 * that still hasn't been fully filled.
	 */
	ret = btrfs_find_ordered_sum(inode, start, &csum);
	if (ret == 0)
		goto found;

	ret = 0;
	path = btrfs_alloc_path();
	/* the path is dereferenced below; bail out if allocation failed */
	if (!path)
		return -ENOMEM;
	item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, start, 0);
	if (IS_ERR(item)) {
		ret = PTR_ERR(item);
		/* a csum that isn't present is a preallocated region. */
		if (ret == -ENOENT || ret == -EFBIG)
			ret = 0;
		csum = 0;
		printk("no csum found for inode %lu start %Lu\n", inode->i_ino,
		       start);
		/* note: no state private is recorded on this path */
		goto out;
	}
	read_extent_buffer(path->nodes[0], &csum, (unsigned long)item,
			   BTRFS_CRC32_SIZE);
found:
	set_state_private(io_tree, start, csum);
out:
	if (path)
		btrfs_free_path(path);
	return ret;
}

struct io_failure_record {
struct page *page;
u64 start;
Expand Down Expand Up @@ -3613,7 +3565,6 @@ static struct extent_io_ops btrfs_extent_io_ops = {
.fill_delalloc = run_delalloc_range,
.submit_bio_hook = btrfs_submit_bio_hook,
.merge_bio_hook = btrfs_merge_bio_hook,
.readpage_io_hook = btrfs_readpage_io_hook,
.readpage_end_io_hook = btrfs_readpage_end_io_hook,
.writepage_end_io_hook = btrfs_writepage_end_io_hook,
.writepage_start_hook = btrfs_writepage_start_hook,
Expand Down

0 comments on commit 61b4944

Please sign in to comment.