Skip to content

Commit

Permalink
Btrfs: use WRITE_SYNC for synchronous writes
Browse files Browse the repository at this point in the history
Part of reducing fsync/O_SYNC/O_DIRECT latencies is using WRITE_SYNC for
writes we plan on waiting on in the near future.  This patch
mirrors recent changes in other filesystems and the generic code to
use WRITE_SYNC when WB_SYNC_ALL is passed and to use WRITE_SYNC for
other latency critical writes.

Btrfs uses async worker threads for checksumming before the write is done,
and then again to actually submit the bios.  The bio submission code just
runs a per-device list of bios that need to be sent down the pipe.

This list is split into low priority and high priority lists so the
WRITE_SYNC IO happens first.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
  • Loading branch information
Chris Mason committed Apr 20, 2009
1 parent 0882e8d commit ffbd517
Show file tree
Hide file tree
Showing 5 changed files with 141 additions and 46 deletions.
4 changes: 2 additions & 2 deletions fs/btrfs/disk-io.c
Original file line number Diff line number Diff line change
Expand Up @@ -2095,10 +2095,10 @@ static int write_dev_supers(struct btrfs_device *device,
device->barriers = 0;
get_bh(bh);
lock_buffer(bh);
ret = submit_bh(WRITE, bh);
ret = submit_bh(WRITE_SYNC, bh);
}
} else {
ret = submit_bh(WRITE, bh);
ret = submit_bh(WRITE_SYNC, bh);
}

if (!ret && wait) {
Expand Down
44 changes: 30 additions & 14 deletions fs/btrfs/extent_io.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,10 @@ struct extent_page_data {
/* tells writepage not to lock the state bits for this range
* it still does the unlocking
*/
int extent_locked;
unsigned int extent_locked:1;

/* tells the submit_bio code to use a WRITE_SYNC */
unsigned int sync_io:1;
};

int __init extent_io_init(void)
Expand Down Expand Up @@ -2136,8 +2139,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
u64 delalloc_end;
int page_started;
int compressed;
int write_flags;
unsigned long nr_written = 0;

if (wbc->sync_mode == WB_SYNC_ALL)
write_flags = WRITE_SYNC_PLUG;
else
write_flags = WRITE;

WARN_ON(!PageLocked(page));
pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
if (page->index > end_index ||
Expand Down Expand Up @@ -2314,9 +2323,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
(unsigned long long)end);
}

ret = submit_extent_page(WRITE, tree, page, sector,
iosize, pg_offset, bdev,
&epd->bio, max_nr,
ret = submit_extent_page(write_flags, tree, page,
sector, iosize, pg_offset,
bdev, &epd->bio, max_nr,
end_bio_extent_writepage,
0, 0, 0);
if (ret)
Expand Down Expand Up @@ -2460,15 +2469,23 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
return ret;
}

static noinline void flush_write_bio(void *data)
static void flush_epd_write_bio(struct extent_page_data *epd)
{
struct extent_page_data *epd = data;
if (epd->bio) {
submit_one_bio(WRITE, epd->bio, 0, 0);
if (epd->sync_io)
submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
else
submit_one_bio(WRITE, epd->bio, 0, 0);
epd->bio = NULL;
}
}

/*
 * Flush hook with an opaque-pointer signature, handed to
 * extent_write_cache_pages() by the writepage entry points in this file.
 *
 * 'data' is interpreted as a struct extent_page_data; any bio still
 * attached to it is submitted through flush_epd_write_bio().
 */
static noinline void flush_write_bio(void *data)
{
	flush_epd_write_bio((struct extent_page_data *)data);
}

int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
get_extent_t *get_extent,
struct writeback_control *wbc)
Expand All @@ -2480,6 +2497,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
.tree = tree,
.get_extent = get_extent,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
struct writeback_control wbc_writepages = {
.bdi = wbc->bdi,
Expand All @@ -2490,13 +2508,11 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
.range_end = (loff_t)-1,
};


ret = __extent_writepage(page, wbc, &epd);

extent_write_cache_pages(tree, mapping, &wbc_writepages,
__extent_writepage, &epd, flush_write_bio);
if (epd.bio)
submit_one_bio(WRITE, epd.bio, 0, 0);
flush_epd_write_bio(&epd);
return ret;
}

Expand All @@ -2515,6 +2531,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
.tree = tree,
.get_extent = get_extent,
.extent_locked = 1,
.sync_io = mode == WB_SYNC_ALL,
};
struct writeback_control wbc_writepages = {
.bdi = inode->i_mapping->backing_dev_info,
Expand All @@ -2540,8 +2557,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
start += PAGE_CACHE_SIZE;
}

if (epd.bio)
submit_one_bio(WRITE, epd.bio, 0, 0);
flush_epd_write_bio(&epd);
return ret;
}

Expand All @@ -2556,13 +2572,13 @@ int extent_writepages(struct extent_io_tree *tree,
.tree = tree,
.get_extent = get_extent,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};

ret = extent_write_cache_pages(tree, mapping, wbc,
__extent_writepage, &epd,
flush_write_bio);
if (epd.bio)
submit_one_bio(WRITE, epd.bio, 0, 0);
flush_epd_write_bio(&epd);
return ret;
}

Expand Down
2 changes: 1 addition & 1 deletion fs/btrfs/ordered-data.c
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
/* start IO across the range first to instantiate any delalloc
* extents
*/
btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);

/* The compression code will leave pages locked but return from
* writepage without setting the page writeback. Starting again
Expand Down
124 changes: 97 additions & 27 deletions fs/btrfs/volumes.c
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,20 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
return NULL;
}

/*
 * Put an unprocessed chain of bios (head ... tail) back at the front of
 * a pending list.
 *
 * @pending_bios: list to requeue onto
 * @head:         first bio of the chain being put back
 * @tail:         last bio of the chain being put back
 *
 * 'head' always becomes the new list head.  If the list already has a
 * tail (bios were queued in the meantime), the previous head is linked
 * after the chain's tail so those bios follow the requeued ones.
 * Otherwise the list was empty and the chain's tail becomes the list tail.
 *
 * The callers visible in this file take device->io_lock before calling.
 */
static void requeue_list(struct btrfs_pending_bios *pending_bios,
			 struct bio *head, struct bio *tail)
{
	struct bio *prev_head = pending_bios->head;

	pending_bios->head = head;

	if (!pending_bios->tail) {
		/* empty list: the requeued chain is now the whole list */
		pending_bios->tail = tail;
		return;
	}

	/* newer bios were added: splice them in after the requeued chain */
	tail->bi_next = prev_head;
}

/*
* we try to collect pending bios for a device so we don't get a large
* number of procs sending bios down to the same device. This greatly
Expand All @@ -141,10 +155,12 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
struct bio *pending;
struct backing_dev_info *bdi;
struct btrfs_fs_info *fs_info;
struct btrfs_pending_bios *pending_bios;
struct bio *tail;
struct bio *cur;
int again = 0;
unsigned long num_run = 0;
unsigned long num_run;
unsigned long num_sync_run;
unsigned long limit;
unsigned long last_waited = 0;

Expand All @@ -153,20 +169,30 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
limit = btrfs_async_submit_limit(fs_info);
limit = limit * 2 / 3;

/* we want to make sure that every time we switch from the sync
* list to the normal list, we unplug
*/
num_sync_run = 0;

loop:
spin_lock(&device->io_lock);
num_run = 0;

loop_lock:

/* take all the bios off the list at once and process them
* later on (without the lock held). But, remember the
* tail and other pointers so the bios can be properly reinserted
* into the list if we hit congestion
*/
pending = device->pending_bios;
tail = device->pending_bio_tail;
if (device->pending_sync_bios.head)
pending_bios = &device->pending_sync_bios;
else
pending_bios = &device->pending_bios;

pending = pending_bios->head;
tail = pending_bios->tail;
WARN_ON(pending && !tail);
device->pending_bios = NULL;
device->pending_bio_tail = NULL;

/*
* if pending was null this time around, no bios need processing
Expand All @@ -176,16 +202,41 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
* device->running_pending is used to synchronize with the
* schedule_bio code.
*/
if (pending) {
again = 1;
device->running_pending = 1;
} else {
if (device->pending_sync_bios.head == NULL &&
device->pending_bios.head == NULL) {
again = 0;
device->running_pending = 0;
} else {
again = 1;
device->running_pending = 1;
}

pending_bios->head = NULL;
pending_bios->tail = NULL;

spin_unlock(&device->io_lock);

/*
* if we're doing the regular priority list, make sure we unplug
* for any high prio bios we've sent down
*/
if (pending_bios == &device->pending_bios && num_sync_run > 0) {
num_sync_run = 0;
blk_run_backing_dev(bdi, NULL);
}

while (pending) {

rmb();
if (pending_bios != &device->pending_sync_bios &&
device->pending_sync_bios.head &&
num_run > 16) {
cond_resched();
spin_lock(&device->io_lock);
requeue_list(pending_bios, pending, tail);
goto loop_lock;
}

cur = pending;
pending = pending->bi_next;
cur->bi_next = NULL;
Expand All @@ -196,10 +247,18 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
wake_up(&fs_info->async_submit_wait);

BUG_ON(atomic_read(&cur->bi_cnt) == 0);
bio_get(cur);
submit_bio(cur->bi_rw, cur);
bio_put(cur);
num_run++;
if (bio_sync(cur))
num_sync_run++;

if (need_resched()) {
if (num_sync_run) {
blk_run_backing_dev(bdi, NULL);
num_sync_run = 0;
}
cond_resched();
}

/*
* we made progress, there is more work to do and the bdi
Expand All @@ -208,7 +267,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
*/
if (pending && bdi_write_congested(bdi) && num_run > 16 &&
fs_info->fs_devices->open_devices > 1) {
struct bio *old_head;
struct io_context *ioc;

ioc = current->io_context;
Expand All @@ -233,29 +291,36 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
* against it before looping
*/
last_waited = ioc->last_waited;
if (need_resched()) {
if (num_sync_run) {
blk_run_backing_dev(bdi, NULL);
num_sync_run = 0;
}
cond_resched();
}
continue;
}
spin_lock(&device->io_lock);

old_head = device->pending_bios;
device->pending_bios = pending;
if (device->pending_bio_tail)
tail->bi_next = old_head;
else
device->pending_bio_tail = tail;

requeue_list(pending_bios, pending, tail);
device->running_pending = 1;

spin_unlock(&device->io_lock);
btrfs_requeue_work(&device->work);
goto done;
}
}

if (num_sync_run) {
num_sync_run = 0;
blk_run_backing_dev(bdi, NULL);
}

cond_resched();
if (again)
goto loop;

spin_lock(&device->io_lock);
if (device->pending_bios)
if (device->pending_bios.head || device->pending_sync_bios.head)
goto loop_lock;
spin_unlock(&device->io_lock);

Expand Down Expand Up @@ -2497,7 +2562,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
max_errors = 1;
}
}
if (multi_ret && rw == WRITE &&
if (multi_ret && (rw & (1 << BIO_RW)) &&
stripes_allocated < stripes_required) {
stripes_allocated = map->num_stripes;
free_extent_map(em);
Expand Down Expand Up @@ -2762,6 +2827,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
int rw, struct bio *bio)
{
int should_queue = 1;
struct btrfs_pending_bios *pending_bios;

/* don't bother with additional async steps for reads, right now */
if (!(rw & (1 << BIO_RW))) {
Expand All @@ -2783,13 +2849,17 @@ static noinline int schedule_bio(struct btrfs_root *root,
bio->bi_rw |= rw;

spin_lock(&device->io_lock);
if (bio_sync(bio))
pending_bios = &device->pending_sync_bios;
else
pending_bios = &device->pending_bios;

if (device->pending_bio_tail)
device->pending_bio_tail->bi_next = bio;
if (pending_bios->tail)
pending_bios->tail->bi_next = bio;

device->pending_bio_tail = bio;
if (!device->pending_bios)
device->pending_bios = bio;
pending_bios->tail = bio;
if (!pending_bios->head)
pending_bios->head = bio;
if (device->running_pending)
should_queue = 0;

Expand Down
Loading

0 comments on commit ffbd517

Please sign in to comment.