md: implement REQ_FLUSH/FUA support
This patch converts md to support REQ_FLUSH/FUA instead of the now
deprecated REQ_HARDBARRIER.  In the core part (md.c), the following
changes are notable.

* Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA don't interfere with
  processing of other requests and thus there is no reason to mark the
  queue congested while FLUSH/FUA is in progress.

* REQ_FLUSH/FUA failures are final, so their users don't need retry
  logic; the old retry logic is removed.

* A preflush needs to be issued to all member devices, but FUA writes
  can be handled the same way as other writes: their processing can be
  deferred to the request_queues of the member devices.
  md_barrier_request() is renamed to md_flush_request() and simplified
  accordingly (a condensed sketch of the resulting sequencing follows
  the sign-off tags below).

For linear, raid0 and multipath, the core changes are enough.  raid1,
5 and 10 need the following conversions.

* raid1: Handling of FLUSH/FUA bios can simply be deferred to the
  request_queues of the member devices.  Barrier-related logic is
  removed.

* raid5: Queue-draining logic is dropped.  The FUA bit is propagated
  through the bio drain and stripe reconstruction so that all the
  updated parts of the stripe are written out with FUA writes if any
  of the dirtying writes was FUA (an illustrative sketch follows the
  raid0.c diff below).  preread_active_stripes handling in
  make_request() is updated as suggested by Neil Brown.

* raid10: FUA bit needs to be propagated to write clones.

linear, raid0, 1, 5 and 10 tested.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Neil Brown <neilb@suse.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
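
The flush sequencing added to md.c (shown in full in the diff below) can be
summarized as a condensed sketch.  This is a paraphrase, not the verbatim
patch: locking and refcounting are elided, the spinlock-protected wait is
shown as a plain wait_event(), the empty-flush special case is omitted, and
submit_flushes()/md_end_flush() are summarized in comments.

/*
 * Condensed sketch of md's new flush sequencing (paraphrased from the
 * md.c hunks below; locking, error paths and the empty-bio case elided).
 */
void md_flush_request(mddev_t *mddev, struct bio *bio)
{
	/* Only one flush may be in flight; wait for any previous one. */
	wait_event(mddev->sb_wait, !mddev->flush_bio);
	mddev->flush_bio = bio;

	/* Step 1: issue an empty WRITE_FLUSH bio to every member device. */
	atomic_set(&mddev->flush_pending, 1);
	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
	submit_flushes(mddev);	/* each completion runs md_end_flush() */

	/* Step 2: the last preflush completion (here or in md_end_flush())
	 * schedules the worker that handles the data portion. */
	if (atomic_dec_and_test(&mddev->flush_pending))
		schedule_work(&mddev->flush_work);
}

static void md_submit_flush_data(struct work_struct *ws)
{
	mddev_t *mddev = container_of(ws, mddev_t, flush_work);
	struct bio *bio = mddev->flush_bio;

	/* Step 3: re-issue the data portion with REQ_FLUSH cleared.  A FUA
	 * bit, if set, stays on and is honored by the member devices'
	 * request_queues. */
	bio->bi_rw &= ~REQ_FLUSH;
	if (mddev->pers->make_request(mddev, bio))
		generic_make_request(bio);

	/* Done: allow the next flush in. */
	mddev->flush_bio = NULL;
	wake_up(&mddev->sb_wait);
}

Because failures of FLUSH/FUA are final, none of the old retry plumbing
(spare bios, biolist, BarriersNotsupp) survives in this path.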
Tejun Heo authored and Jens Axboe committed Sep 10, 2010
1 parent 7bc9fdd commit e9c7469
Showing 10 changed files with 122 additions and 259 deletions.
4 changes: 2 additions & 2 deletions drivers/md/linear.c
@@ -294,8 +294,8 @@ static int linear_make_request (mddev_t *mddev, struct bio *bio)
 	dev_info_t *tmp_dev;
 	sector_t start_sector;
 
-	if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
-		md_barrier_request(mddev, bio);
+	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+		md_flush_request(mddev, bio);
 		return 0;
 	}
 
117 changes: 25 additions & 92 deletions drivers/md/md.c
@@ -226,12 +226,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
 		return 0;
 	}
 	rcu_read_lock();
-	if (mddev->suspended || mddev->barrier) {
+	if (mddev->suspended) {
 		DEFINE_WAIT(__wait);
 		for (;;) {
 			prepare_to_wait(&mddev->sb_wait, &__wait,
 					TASK_UNINTERRUPTIBLE);
-			if (!mddev->suspended && !mddev->barrier)
+			if (!mddev->suspended)
 				break;
 			rcu_read_unlock();
 			schedule();
@@ -282,40 +282,29 @@ EXPORT_SYMBOL_GPL(mddev_resume);
 
 int mddev_congested(mddev_t *mddev, int bits)
 {
-	if (mddev->barrier)
-		return 1;
 	return mddev->suspended;
 }
 EXPORT_SYMBOL(mddev_congested);
 
 /*
- * Generic barrier handling for md
+ * Generic flush handling for md
  */
 
-#define POST_REQUEST_BARRIER ((void*)1)
-
-static void md_end_barrier(struct bio *bio, int err)
+static void md_end_flush(struct bio *bio, int err)
 {
 	mdk_rdev_t *rdev = bio->bi_private;
 	mddev_t *mddev = rdev->mddev;
-	if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
-		set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
 
 	rdev_dec_pending(rdev, mddev);
 
 	if (atomic_dec_and_test(&mddev->flush_pending)) {
-		if (mddev->barrier == POST_REQUEST_BARRIER) {
-			/* This was a post-request barrier */
-			mddev->barrier = NULL;
-			wake_up(&mddev->sb_wait);
-		} else
-			/* The pre-request barrier has finished */
-			schedule_work(&mddev->barrier_work);
+		/* The pre-request flush has finished */
+		schedule_work(&mddev->flush_work);
 	}
 	bio_put(bio);
 }
 
-static void submit_barriers(mddev_t *mddev)
+static void submit_flushes(mddev_t *mddev)
 {
 	mdk_rdev_t *rdev;
 
@@ -332,60 +321,56 @@ static void submit_barriers(mddev_t *mddev)
 			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
 			bi = bio_alloc(GFP_KERNEL, 0);
-			bi->bi_end_io = md_end_barrier;
+			bi->bi_end_io = md_end_flush;
 			bi->bi_private = rdev;
 			bi->bi_bdev = rdev->bdev;
 			atomic_inc(&mddev->flush_pending);
-			submit_bio(WRITE_BARRIER, bi);
+			submit_bio(WRITE_FLUSH, bi);
 			rcu_read_lock();
 			rdev_dec_pending(rdev, mddev);
 		}
 	rcu_read_unlock();
 }
 
-static void md_submit_barrier(struct work_struct *ws)
+static void md_submit_flush_data(struct work_struct *ws)
 {
-	mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
-	struct bio *bio = mddev->barrier;
+	mddev_t *mddev = container_of(ws, mddev_t, flush_work);
+	struct bio *bio = mddev->flush_bio;
 
 	atomic_set(&mddev->flush_pending, 1);
 
-	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
-		bio_endio(bio, -EOPNOTSUPP);
-	else if (bio->bi_size == 0)
+	if (bio->bi_size == 0)
 		/* an empty barrier - all done */
 		bio_endio(bio, 0);
 	else {
-		bio->bi_rw &= ~REQ_HARDBARRIER;
+		bio->bi_rw &= ~REQ_FLUSH;
 		if (mddev->pers->make_request(mddev, bio))
 			generic_make_request(bio);
-		mddev->barrier = POST_REQUEST_BARRIER;
-		submit_barriers(mddev);
 	}
 	if (atomic_dec_and_test(&mddev->flush_pending)) {
-		mddev->barrier = NULL;
+		mddev->flush_bio = NULL;
 		wake_up(&mddev->sb_wait);
 	}
 }
 
-void md_barrier_request(mddev_t *mddev, struct bio *bio)
+void md_flush_request(mddev_t *mddev, struct bio *bio)
 {
 	spin_lock_irq(&mddev->write_lock);
 	wait_event_lock_irq(mddev->sb_wait,
-			    !mddev->barrier,
+			    !mddev->flush_bio,
 			    mddev->write_lock, /*nothing*/);
-	mddev->barrier = bio;
+	mddev->flush_bio = bio;
 	spin_unlock_irq(&mddev->write_lock);
 
 	atomic_set(&mddev->flush_pending, 1);
-	INIT_WORK(&mddev->barrier_work, md_submit_barrier);
+	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
 
-	submit_barriers(mddev);
+	submit_flushes(mddev);
 
 	if (atomic_dec_and_test(&mddev->flush_pending))
-		schedule_work(&mddev->barrier_work);
+		schedule_work(&mddev->flush_work);
 }
-EXPORT_SYMBOL(md_barrier_request);
+EXPORT_SYMBOL(md_flush_request);
 
 /* Support for plugging.
  * This mirrors the plugging support in request_queue, but does not
@@ -696,31 +681,6 @@ static void super_written(struct bio *bio, int error)
 	bio_put(bio);
 }
 
-static void super_written_barrier(struct bio *bio, int error)
-{
-	struct bio *bio2 = bio->bi_private;
-	mdk_rdev_t *rdev = bio2->bi_private;
-	mddev_t *mddev = rdev->mddev;
-
-	if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
-	    error == -EOPNOTSUPP) {
-		unsigned long flags;
-		/* barriers don't appear to be supported :-( */
-		set_bit(BarriersNotsupp, &rdev->flags);
-		mddev->barriers_work = 0;
-		spin_lock_irqsave(&mddev->write_lock, flags);
-		bio2->bi_next = mddev->biolist;
-		mddev->biolist = bio2;
-		spin_unlock_irqrestore(&mddev->write_lock, flags);
-		wake_up(&mddev->sb_wait);
-		bio_put(bio);
-	} else {
-		bio_put(bio2);
-		bio->bi_private = rdev;
-		super_written(bio, error);
-	}
-}
-
 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 		   sector_t sector, int size, struct page *page)
 {
@@ -729,51 +689,28 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 	 * and decrement it on completion, waking up sb_wait
 	 * if zero is reached.
 	 * If an error occurred, call md_error
-	 *
-	 * As we might need to resubmit the request if REQ_HARDBARRIER
-	 * causes ENOTSUPP, we allocate a spare bio...
 	 */
 	struct bio *bio = bio_alloc(GFP_NOIO, 1);
-	int rw = REQ_WRITE | REQ_SYNC | REQ_UNPLUG;
 
 	bio->bi_bdev = rdev->bdev;
 	bio->bi_sector = sector;
 	bio_add_page(bio, page, size, 0);
 	bio->bi_private = rdev;
 	bio->bi_end_io = super_written;
-	bio->bi_rw = rw;
 
 	atomic_inc(&mddev->pending_writes);
-	if (!test_bit(BarriersNotsupp, &rdev->flags)) {
-		struct bio *rbio;
-		rw |= REQ_HARDBARRIER;
-		rbio = bio_clone(bio, GFP_NOIO);
-		rbio->bi_private = bio;
-		rbio->bi_end_io = super_written_barrier;
-		submit_bio(rw, rbio);
-	} else
-		submit_bio(rw, bio);
+	submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA,
+		   bio);
 }
 
 void md_super_wait(mddev_t *mddev)
 {
-	/* wait for all superblock writes that were scheduled to complete.
-	 * if any had to be retried (due to BARRIER problems), retry them
-	 */
+	/* wait for all superblock writes that were scheduled to complete */
 	DEFINE_WAIT(wq);
 	for(;;) {
 		prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
 		if (atomic_read(&mddev->pending_writes)==0)
 			break;
-		while (mddev->biolist) {
-			struct bio *bio;
-			spin_lock_irq(&mddev->write_lock);
-			bio = mddev->biolist;
-			mddev->biolist = bio->bi_next ;
-			bio->bi_next = NULL;
-			spin_unlock_irq(&mddev->write_lock);
-			submit_bio(bio->bi_rw, bio);
-		}
 		schedule();
 	}
 	finish_wait(&mddev->sb_wait, &wq);
@@ -1070,7 +1007,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		clear_bit(Faulty, &rdev->flags);
 		clear_bit(In_sync, &rdev->flags);
 		clear_bit(WriteMostly, &rdev->flags);
-		clear_bit(BarriersNotsupp, &rdev->flags);
 
 	if (mddev->raid_disks == 0) {
 		mddev->major_version = 0;
@@ -1485,7 +1421,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		clear_bit(Faulty, &rdev->flags);
 		clear_bit(In_sync, &rdev->flags);
 		clear_bit(WriteMostly, &rdev->flags);
-		clear_bit(BarriersNotsupp, &rdev->flags);
 
 	if (mddev->raid_disks == 0) {
 		mddev->major_version = 1;
@@ -4506,7 +4441,6 @@ int md_run(mddev_t *mddev)
 	/* may be over-ridden by personality */
 	mddev->resync_max_sectors = mddev->dev_sectors;
 
-	mddev->barriers_work = 1;
 	mddev->ok_start_degraded = start_dirty_degraded;
 
 	if (start_readonly && mddev->ro == 0)
@@ -4685,7 +4619,6 @@ static void md_clean(mddev_t *mddev)
 	mddev->recovery = 0;
 	mddev->in_sync = 0;
 	mddev->degraded = 0;
-	mddev->barriers_work = 0;
 	mddev->safemode = 0;
 	mddev->bitmap_info.offset = 0;
 	mddev->bitmap_info.default_offset = 0;
23 changes: 6 additions & 17 deletions drivers/md/md.h
@@ -87,7 +87,6 @@ struct mdk_rdev_s
 #define	Faulty		1	/* device is known to have a fault */
 #define	In_sync		2	/* device is in_sync with rest of array */
 #define	WriteMostly	4	/* Avoid reading if at all possible */
-#define	BarriersNotsupp	5	/* REQ_HARDBARRIER is not supported */
 #define	AllReserved	6	/* If whole device is reserved for
 				 * one array */
 #define	AutoDetected	7	/* added by auto-detect */
@@ -273,13 +272,6 @@ struct mddev_s
 	int				degraded;	/* whether md should consider
 							 * adding a spare
 							 */
-	int				barriers_work;	/* initialised to true, cleared as soon
-							 * as a barrier request to slave
-							 * fails.  Only supported
-							 */
-	struct bio			*biolist;	/* bios that need to be retried
-							 * because REQ_HARDBARRIER is not supported
-							 */
 
 	atomic_t			recovery_active; /* blocks scheduled, but not written */
 	wait_queue_head_t		recovery_wait;
@@ -339,16 +331,13 @@ struct mddev_s
 	struct attribute_group		*to_remove;
 	struct plug_handle		*plug; /* if used by personality */
 
-	/* Generic barrier handling.
-	 * If there is a pending barrier request, all other
-	 * writes are blocked while the devices are flushed.
-	 * The last to finish a flush schedules a worker to
-	 * submit the barrier request (without the barrier flag),
-	 * then submit more flush requests.
+	/* Generic flush handling.
+	 * The last to finish preflush schedules a worker to submit
+	 * the rest of the request (without the REQ_FLUSH flag).
 	 */
-	struct bio *barrier;
+	struct bio *flush_bio;
 	atomic_t flush_pending;
-	struct work_struct barrier_work;
+	struct work_struct flush_work;
 	struct work_struct event_work;	/* used by dm to report failure event */
 };
 
@@ -502,7 +491,7 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
 extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
 
 extern int mddev_congested(mddev_t *mddev, int bits);
-extern void md_barrier_request(mddev_t *mddev, struct bio *bio);
+extern void md_flush_request(mddev_t *mddev, struct bio *bio);
 extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 			   sector_t sector, int size, struct page *page);
 extern void md_super_wait(mddev_t *mddev);
4 changes: 2 additions & 2 deletions drivers/md/multipath.c
@@ -142,8 +142,8 @@ static int multipath_make_request(mddev_t *mddev, struct bio * bio)
 	struct multipath_bh * mp_bh;
 	struct multipath_info *multipath;
 
-	if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
-		md_barrier_request(mddev, bio);
+	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+		md_flush_request(mddev, bio);
 		return 0;
 	}
 
4 changes: 2 additions & 2 deletions drivers/md/raid0.c
@@ -483,8 +483,8 @@ static int raid0_make_request(mddev_t *mddev, struct bio *bio)
 	struct strip_zone *zone;
 	mdk_rdev_t *tmp_dev;
 
-	if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
-		md_barrier_request(mddev, bio);
+	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+		md_flush_request(mddev, bio);
 		return 0;
 	}
 
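
The raid1, raid5 and raid10 hunks did not render on this page, so the FUA
propagation described in the commit message is illustrated below.  This is
a hedged sketch only: the helper names (biodrain_sketch, stripe_dev_rw_sketch,
clone_rw_sketch) are invented for illustration and the bodies are simplified;
they may not match the actual patch.

/*
 * Illustrative sketch of the FUA propagation described in the commit
 * message above; names and bodies are simplified, not the verbatim patch.
 */

/* raid5: at bio-drain time, remember per stripe device that a dirtying
 * write carried FUA (R5_WantFUA used here as the per-device flag)... */
static void biodrain_sketch(struct stripe_head *sh, struct bio *wbi, int i)
{
	if (wbi->bi_rw & REQ_FUA)
		set_bit(R5_WantFUA, &sh->dev[i].flags);
}

/* ...so that, at write-out time, every updated part of the stripe goes
 * out as a FUA write if any of the dirtying writes was FUA. */
static int stripe_dev_rw_sketch(struct stripe_head *sh, int i)
{
	int rw = WRITE;

	if (test_bit(R5_WantFUA, &sh->dev[i].flags))
		rw |= REQ_FUA;
	return rw;
}

/* raid10: the FUA bit is simply copied from the master bio onto each
 * write clone sent to a member device. */
static void clone_rw_sketch(struct bio *master, struct bio *mbio)
{
	const unsigned long do_fua = (master->bi_rw & REQ_FUA);

	mbio->bi_rw = WRITE | do_fua;
}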