Skip to content

Commit

Permalink
Btrfs: Optimize compressed writeback and reads
Browse files Browse the repository at this point in the history
When reading compressed extents, try to put pages into the page cache
for any pages covered by the compressed extent that readpages didn't already
preload.

Add an async work queue to handle transformations at delayed allocation processing
time.  Right now this is just compression.  The workflow is:

1) Find offsets in the file marked for delayed allocation
2) Lock the pages
3) Lock the state bits
4) Call the async delalloc code

The async delalloc code clears the state lock bits and delalloc bits.  It is
important this happens before the range goes into the work queue because
otherwise it might deadlock with other work queue items that try to lock
those extent bits.

The file pages are compressed, and if the compression doesn't work the
pages are written back directly.

An ordered work queue is used to make sure the inodes are written in the same
order that pdflush or writepages sent them down.

This changes extent_write_cache_pages to let the writepage function
update the wbc nr_written count.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
  • Loading branch information
Chris Mason committed Nov 7, 2008
1 parent 4a69a41 commit 771ed68
Show file tree
Hide file tree
Showing 11 changed files with 849 additions and 207 deletions.
150 changes: 145 additions & 5 deletions fs/btrfs/compression.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include <linux/writeback.h>
#include <linux/bit_spinlock.h>
#include <linux/version.h>
#include <linux/pagevec.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
Expand Down Expand Up @@ -145,9 +146,9 @@ static void end_compressed_bio_read(struct bio *bio, int err)
}

/* do io completion on the original bio */
if (cb->errors)
if (cb->errors) {
bio_io_error(cb->orig_bio);
else
} else
bio_endio(cb->orig_bio, 0);

/* finally free the cb struct */
Expand Down Expand Up @@ -333,6 +334,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
}
bytes_left -= PAGE_CACHE_SIZE;
first_byte += PAGE_CACHE_SIZE;
cond_resched();
}
bio_get(bio);

Expand All @@ -346,6 +348,130 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
return 0;
}

static noinline int add_ra_bio_pages(struct inode *inode,
u64 compressed_end,
struct compressed_bio *cb)
{
unsigned long end_index;
unsigned long page_index;
u64 last_offset;
u64 isize = i_size_read(inode);
int ret;
struct page *page;
unsigned long nr_pages = 0;
struct extent_map *em;
struct address_space *mapping = inode->i_mapping;
struct pagevec pvec;
struct extent_map_tree *em_tree;
struct extent_io_tree *tree;
u64 end;
int misses = 0;

page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page;
last_offset = (page_offset(page) + PAGE_CACHE_SIZE);
em_tree = &BTRFS_I(inode)->extent_tree;
tree = &BTRFS_I(inode)->io_tree;

if (isize == 0)
return 0;

end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;

pagevec_init(&pvec, 0);
while(last_offset < compressed_end) {
page_index = last_offset >> PAGE_CACHE_SHIFT;

if (page_index > end_index)
break;

rcu_read_lock();
page = radix_tree_lookup(&mapping->page_tree, page_index);
rcu_read_unlock();
if (page) {
misses++;
if (misses > 4)
break;
goto next;
}

page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS);
if (!page)
break;

page->index = page_index;
/*
* what we want to do here is call add_to_page_cache_lru,
* but that isn't exported, so we reproduce it here
*/
if (add_to_page_cache(page, mapping,
page->index, GFP_NOFS)) {
page_cache_release(page);
goto next;
}

/* open coding of lru_cache_add, also not exported */
page_cache_get(page);
if (!pagevec_add(&pvec, page))
__pagevec_lru_add(&pvec);

end = last_offset + PAGE_CACHE_SIZE - 1;
/*
* at this point, we have a locked page in the page cache
* for these bytes in the file. But, we have to make
* sure they map to this compressed extent on disk.
*/
set_page_extent_mapped(page);
lock_extent(tree, last_offset, end, GFP_NOFS);
spin_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, last_offset,
PAGE_CACHE_SIZE);
spin_unlock(&em_tree->lock);

if (!em || last_offset < em->start ||
(last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
(em->block_start >> 9) != cb->orig_bio->bi_sector) {
free_extent_map(em);
unlock_extent(tree, last_offset, end, GFP_NOFS);
unlock_page(page);
page_cache_release(page);
break;
}
free_extent_map(em);

if (page->index == end_index) {
char *userpage;
size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1);

if (zero_offset) {
int zeros;
zeros = PAGE_CACHE_SIZE - zero_offset;
userpage = kmap_atomic(page, KM_USER0);
memset(userpage + zero_offset, 0, zeros);
flush_dcache_page(page);
kunmap_atomic(userpage, KM_USER0);
}
}

ret = bio_add_page(cb->orig_bio, page,
PAGE_CACHE_SIZE, 0);

if (ret == PAGE_CACHE_SIZE) {
nr_pages++;
page_cache_release(page);
} else {
unlock_extent(tree, last_offset, end, GFP_NOFS);
unlock_page(page);
page_cache_release(page);
break;
}
next:
last_offset += PAGE_CACHE_SIZE;
}
if (pagevec_count(&pvec))
__pagevec_lru_add(&pvec);
return 0;
}

/*
* for a compressed read, the bio we get passed has all the inode pages
* in it. We don't actually do IO on those pages but allocate new ones
Expand Down Expand Up @@ -373,6 +499,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
struct block_device *bdev;
struct bio *comp_bio;
u64 cur_disk_byte = (u64)bio->bi_sector << 9;
u64 em_len;
struct extent_map *em;
int ret;

Expand All @@ -393,6 +520,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,

cb->start = em->start;
compressed_len = em->block_len;
em_len = em->len;
free_extent_map(em);

cb->len = uncompressed_len;
Expand All @@ -411,6 +539,17 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
}
cb->nr_pages = nr_pages;

add_ra_bio_pages(inode, cb->start + em_len, cb);

if (!btrfs_test_opt(root, NODATASUM) &&
!btrfs_test_flag(inode, NODATASUM)) {
btrfs_lookup_bio_sums(root, inode, cb->orig_bio);
}

/* include any pages we added in add_ra-bio_pages */
uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
cb->len = uncompressed_len;

comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
comp_bio->bi_private = cb;
comp_bio->bi_end_io = end_compressed_bio_read;
Expand Down Expand Up @@ -442,9 +581,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
GFP_NOFS);
atomic_inc(&cb->pending_bios);
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
comp_bio->bi_private = cb;
comp_bio->bi_end_io = end_compressed_bio_read;

bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
}
cur_disk_byte += PAGE_CACHE_SIZE;
}
Expand Down
4 changes: 3 additions & 1 deletion fs/btrfs/ctree.h
Original file line number Diff line number Diff line change
Expand Up @@ -625,8 +625,8 @@ struct btrfs_fs_info {
struct btrfs_transaction *running_transaction;
wait_queue_head_t transaction_throttle;
wait_queue_head_t transaction_wait;
wait_queue_head_t async_submit_wait;

wait_queue_head_t async_submit_wait;
wait_queue_head_t tree_log_wait;

struct btrfs_super_block super_copy;
Expand All @@ -653,6 +653,7 @@ struct btrfs_fs_info {
atomic_t nr_async_submits;
atomic_t async_submit_draining;
atomic_t nr_async_bios;
atomic_t async_delalloc_pages;
atomic_t tree_log_writers;
atomic_t tree_log_commit;
unsigned long tree_log_batch;
Expand All @@ -677,6 +678,7 @@ struct btrfs_fs_info {
* two
*/
struct btrfs_workers workers;
struct btrfs_workers delalloc_workers;
struct btrfs_workers endio_workers;
struct btrfs_workers endio_write_workers;
struct btrfs_workers submit_workers;
Expand Down
27 changes: 19 additions & 8 deletions fs/btrfs/disk-io.c
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,13 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
(atomic_read(&fs_info->nr_async_bios) < limit),
HZ/10);
}

while(atomic_read(&fs_info->async_submit_draining) &&
atomic_read(&fs_info->nr_async_submits)) {
wait_event(fs_info->async_submit_wait,
(atomic_read(&fs_info->nr_async_submits) == 0));
}

return 0;
}

Expand Down Expand Up @@ -1437,6 +1444,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->space_info);
btrfs_mapping_init(&fs_info->mapping_tree);
atomic_set(&fs_info->nr_async_submits, 0);
atomic_set(&fs_info->async_delalloc_pages, 0);
atomic_set(&fs_info->async_submit_draining, 0);
atomic_set(&fs_info->nr_async_bios, 0);
atomic_set(&fs_info->throttles, 0);
Expand Down Expand Up @@ -1550,6 +1558,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_init_workers(&fs_info->workers, "worker",
fs_info->thread_pool_size);

btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
fs_info->thread_pool_size);

btrfs_init_workers(&fs_info->submit_workers, "submit",
min_t(u64, fs_devices->num_devices,
fs_info->thread_pool_size));
Expand All @@ -1560,15 +1571,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
*/
fs_info->submit_workers.idle_thresh = 64;

/* fs_info->workers is responsible for checksumming file data
* blocks and metadata. Using a larger idle thresh allows each
* worker thread to operate on things in roughly the order they
* were sent by the writeback daemons, improving overall locality
* of the IO going down the pipe.
*/
fs_info->workers.idle_thresh = 8;
fs_info->workers.idle_thresh = 16;
fs_info->workers.ordered = 1;

fs_info->delalloc_workers.idle_thresh = 2;
fs_info->delalloc_workers.ordered = 1;

btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
btrfs_init_workers(&fs_info->endio_workers, "endio",
fs_info->thread_pool_size);
Expand All @@ -1584,6 +1592,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,

btrfs_start_workers(&fs_info->workers, 1);
btrfs_start_workers(&fs_info->submit_workers, 1);
btrfs_start_workers(&fs_info->delalloc_workers, 1);
btrfs_start_workers(&fs_info->fixup_workers, 1);
btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
btrfs_start_workers(&fs_info->endio_write_workers,
Expand Down Expand Up @@ -1732,6 +1741,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fail_sys_array:
fail_sb_buffer:
btrfs_stop_workers(&fs_info->fixup_workers);
btrfs_stop_workers(&fs_info->delalloc_workers);
btrfs_stop_workers(&fs_info->workers);
btrfs_stop_workers(&fs_info->endio_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
Expand Down Expand Up @@ -1988,6 +1998,7 @@ int close_ctree(struct btrfs_root *root)
truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);

btrfs_stop_workers(&fs_info->fixup_workers);
btrfs_stop_workers(&fs_info->delalloc_workers);
btrfs_stop_workers(&fs_info->workers);
btrfs_stop_workers(&fs_info->endio_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
Expand Down Expand Up @@ -2062,7 +2073,7 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
struct extent_io_tree *tree;
u64 num_dirty;
u64 start = 0;
unsigned long thresh = 96 * 1024 * 1024;
unsigned long thresh = 32 * 1024 * 1024;
tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;

if (current_is_pdflush() || current->flags & PF_MEMALLOC)
Expand Down
6 changes: 5 additions & 1 deletion fs/btrfs/extent-tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -768,7 +768,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
l = path->nodes[0];

btrfs_item_key_to_cpu(l, &key, path->slots[0]);
BUG_ON(key.objectid != bytenr);
if (key.objectid != bytenr) {
btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
printk("wanted %Lu found %Lu\n", bytenr, key.objectid);
BUG();
}
BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);

item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
Expand Down
Loading

0 comments on commit 771ed68

Please sign in to comment.