Skip to content

Commit

Permalink
[PATCH] md: add write-intent-bitmap support to raid5
Browse files Browse the repository at this point in the history
Most awkward part of this is delaying write requests until bitmap updates have
been flushed.

To achieve this, we have a sequence number (seq_flush) which is incremented
each time the raid5 is unplugged.

If the raid thread notices that this has changed, it flushes bitmap changes,
and assigned the value of seq_flush to seq_write.

When a write request arrives, it is given the number from seq_write, and that
write request may not complete until seq_flush is larger than the saved seq
number.

We have a new queue for storing stripes which are waiting for a bitmap flush
and an extra flag for stripes to record if the write was 'degraded' and so
should not clear the a bit in the bitmap.

Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
  • Loading branch information
NeilBrown authored and Linus Torvalds committed Sep 9, 2005
1 parent 0002b27 commit 7262668
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 13 deletions.
3 changes: 1 addition & 2 deletions drivers/md/md.c
Original file line number Diff line number Diff line change
Expand Up @@ -645,7 +645,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)

if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
mddev->bitmap_file == NULL) {
if (mddev->level != 1) {
if (mddev->level != 1 && mddev->level != 5) {
/* FIXME use a better test */
printk(KERN_WARNING "md: bitmaps only support for raid1\n");
return -EINVAL;
Expand Down Expand Up @@ -3517,7 +3517,6 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
*/
void md_write_start(mddev_t *mddev, struct bio *bi)
{
DEFINE_WAIT(w);
if (bio_data_dir(bi) != WRITE)
return;

Expand Down
133 changes: 123 additions & 10 deletions drivers/md/raid5.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
#include <linux/bitops.h>
#include <asm/atomic.h>

#include <linux/raid/bitmap.h>

/*
* Stripe cache
*/
Expand Down Expand Up @@ -79,8 +81,13 @@ static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
if (test_bit(STRIPE_HANDLE, &sh->state)) {
if (test_bit(STRIPE_DELAYED, &sh->state))
list_add_tail(&sh->lru, &conf->delayed_list);
else
else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
conf->seq_write == sh->bm_seq)
list_add_tail(&sh->lru, &conf->bitmap_list);
else {
clear_bit(STRIPE_BIT_DELAY, &sh->state);
list_add_tail(&sh->lru, &conf->handle_list);
}
md_wakeup_thread(conf->mddev->thread);
} else {
if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
Expand Down Expand Up @@ -244,6 +251,9 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
spin_lock_irq(&conf->device_lock);

do {
wait_event_lock_irq(conf->wait_for_stripe,
conf->quiesce == 0,
conf->device_lock, /* nothing */);
sh = __find_stripe(conf, sector);
if (!sh) {
if (!conf->inactive_blocked)
Expand Down Expand Up @@ -803,6 +813,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
{
struct bio **bip;
raid5_conf_t *conf = sh->raid_conf;
int firstwrite=0;

PRINTK("adding bh b#%llu to stripe s#%llu\n",
(unsigned long long)bi->bi_sector,
Expand All @@ -811,9 +822,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in

spin_lock(&sh->lock);
spin_lock_irq(&conf->device_lock);
if (forwrite)
if (forwrite) {
bip = &sh->dev[dd_idx].towrite;
else
if (*bip == NULL && sh->dev[dd_idx].written == NULL)
firstwrite = 1;
} else
bip = &sh->dev[dd_idx].toread;
while (*bip && (*bip)->bi_sector < bi->bi_sector) {
if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
Expand All @@ -836,6 +849,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
(unsigned long long)bi->bi_sector,
(unsigned long long)sh->sector, dd_idx);

if (conf->mddev->bitmap && firstwrite) {
sh->bm_seq = conf->seq_write;
bitmap_startwrite(conf->mddev->bitmap, sh->sector,
STRIPE_SECTORS, 0);
set_bit(STRIPE_BIT_DELAY, &sh->state);
}

if (forwrite) {
/* check if page is covered */
sector_t sector = sh->dev[dd_idx].sector;
Expand Down Expand Up @@ -958,12 +978,13 @@ static void handle_stripe(struct stripe_head *sh)
* need to be failed
*/
if (failed > 1 && to_read+to_write+written) {
spin_lock_irq(&conf->device_lock);
for (i=disks; i--; ) {
int bitmap_end = 0;
spin_lock_irq(&conf->device_lock);
/* fail all writes first */
bi = sh->dev[i].towrite;
sh->dev[i].towrite = NULL;
if (bi) to_write--;
if (bi) { to_write--; bitmap_end = 1; }

if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
Expand All @@ -981,6 +1002,7 @@ static void handle_stripe(struct stripe_head *sh)
/* and fail all 'written' */
bi = sh->dev[i].written;
sh->dev[i].written = NULL;
if (bi) bitmap_end = 1;
while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
clear_bit(BIO_UPTODATE, &bi->bi_flags);
Expand Down Expand Up @@ -1009,8 +1031,11 @@ static void handle_stripe(struct stripe_head *sh)
bi = nextbi;
}
}
spin_unlock_irq(&conf->device_lock);
if (bitmap_end)
bitmap_endwrite(conf->mddev->bitmap, sh->sector,
STRIPE_SECTORS, 0, 0);
}
spin_unlock_irq(&conf->device_lock);
}
if (failed > 1 && syncing) {
md_done_sync(conf->mddev, STRIPE_SECTORS,0);
Expand Down Expand Up @@ -1038,6 +1063,7 @@ static void handle_stripe(struct stripe_head *sh)
test_bit(R5_UPTODATE, &dev->flags) ) {
/* We can return any write requests */
struct bio *wbi, *wbi2;
int bitmap_end = 0;
PRINTK("Return write for disc %d\n", i);
spin_lock_irq(&conf->device_lock);
wbi = dev->written;
Expand All @@ -1051,7 +1077,13 @@ static void handle_stripe(struct stripe_head *sh)
}
wbi = wbi2;
}
if (dev->towrite == NULL)
bitmap_end = 1;
spin_unlock_irq(&conf->device_lock);
if (bitmap_end)
bitmap_endwrite(conf->mddev->bitmap, sh->sector,
STRIPE_SECTORS,
!test_bit(STRIPE_DEGRADED, &sh->state), 0);
}
}
}
Expand Down Expand Up @@ -1175,7 +1207,8 @@ static void handle_stripe(struct stripe_head *sh)
}
}
/* now if nothing is locked, and if we have enough data, we can start a write request */
if (locked == 0 && (rcw == 0 ||rmw == 0)) {
if (locked == 0 && (rcw == 0 ||rmw == 0) &&
!test_bit(STRIPE_BIT_DELAY, &sh->state)) {
PRINTK("Computing parity...\n");
compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
/* now every locked buffer is ready to be written */
Expand Down Expand Up @@ -1231,6 +1264,7 @@ static void handle_stripe(struct stripe_head *sh)
dev = &sh->dev[failed_num];
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantwrite, &dev->flags);
clear_bit(STRIPE_DEGRADED, &sh->state);
locked++;
set_bit(STRIPE_INSYNC, &sh->state);
set_bit(R5_Syncio, &dev->flags);
Expand Down Expand Up @@ -1298,6 +1332,8 @@ static void handle_stripe(struct stripe_head *sh)
bi->bi_next = NULL;
generic_make_request(bi);
} else {
if (rw == 1)
set_bit(STRIPE_DEGRADED, &sh->state);
PRINTK("skip op %ld on disc %d for sector %llu\n",
bi->bi_rw, i, (unsigned long long)sh->sector);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
Expand All @@ -1322,6 +1358,20 @@ static inline void raid5_activate_delayed(raid5_conf_t *conf)
}
}

static inline void activate_bit_delay(raid5_conf_t *conf)
{
/* device_lock is held */
struct list_head head;
list_add(&head, &conf->bitmap_list);
list_del_init(&conf->bitmap_list);
while (!list_empty(&head)) {
struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
list_del_init(&sh->lru);
atomic_inc(&sh->count);
__release_stripe(conf, sh);
}
}

static void unplug_slaves(mddev_t *mddev)
{
raid5_conf_t *conf = mddev_to_conf(mddev);
Expand Down Expand Up @@ -1354,8 +1404,10 @@ static void raid5_unplug_device(request_queue_t *q)

spin_lock_irqsave(&conf->device_lock, flags);

if (blk_remove_plug(q))
if (blk_remove_plug(q)) {
conf->seq_flush++;
raid5_activate_delayed(conf);
}
md_wakeup_thread(mddev->thread);

spin_unlock_irqrestore(&conf->device_lock, flags);
Expand Down Expand Up @@ -1493,10 +1545,20 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
sector_t first_sector;
int raid_disks = conf->raid_disks;
int data_disks = raid_disks-1;
sector_t max_sector = mddev->size << 1;
int sync_blocks;

if (sector_nr >= mddev->size <<1) {
if (sector_nr >= max_sector) {
/* just being told to finish up .. nothing much to do */
unplug_slaves(mddev);

if (mddev->curr_resync < max_sector) /* aborted */
bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
&sync_blocks, 1);
else /* compelted sync */
conf->fullsync = 0;
bitmap_close_sync(mddev->bitmap);

return 0;
}
/* if there is 1 or more failed drives and we are trying
Expand All @@ -1508,6 +1570,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
*skipped = 1;
return rv;
}
if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
!conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
/* we can skip this block, and probably more */
sync_blocks /= STRIPE_SECTORS;
*skipped = 1;
return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
}

x = sector_nr;
chunk_offset = sector_div(x, sectors_per_chunk);
Expand All @@ -1525,6 +1594,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_timeout(1);
}
bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0);
spin_lock(&sh->lock);
set_bit(STRIPE_SYNCING, &sh->state);
clear_bit(STRIPE_INSYNC, &sh->state);
Expand Down Expand Up @@ -1558,6 +1628,13 @@ static void raid5d (mddev_t *mddev)
while (1) {
struct list_head *first;

if (conf->seq_flush - conf->seq_write > 0) {
int seq = conf->seq_flush;
bitmap_unplug(mddev->bitmap);
conf->seq_write = seq;
activate_bit_delay(conf);
}

if (list_empty(&conf->handle_list) &&
atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
!blk_queue_plugged(mddev->queue) &&
Expand Down Expand Up @@ -1591,7 +1668,7 @@ static void raid5d (mddev_t *mddev)
PRINTK("--- raid5d inactive\n");
}

static int run (mddev_t *mddev)
static int run(mddev_t *mddev)
{
raid5_conf_t *conf;
int raid_disk, memory;
Expand Down Expand Up @@ -1621,6 +1698,7 @@ static int run (mddev_t *mddev)
init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
INIT_LIST_HEAD(&conf->delayed_list);
INIT_LIST_HEAD(&conf->bitmap_list);
INIT_LIST_HEAD(&conf->inactive_list);
atomic_set(&conf->active_stripes, 0);
atomic_set(&conf->preread_active_stripes, 0);
Expand Down Expand Up @@ -1732,6 +1810,9 @@ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +

/* Ok, everything is just fine now */

if (mddev->bitmap)
mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;

mddev->queue->unplug_fn = raid5_unplug_device;
mddev->queue->issue_flush_fn = raid5_issue_flush;

Expand Down Expand Up @@ -1912,6 +1993,8 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
rdev->in_sync = 0;
rdev->raid_disk = disk;
found = 1;
if (rdev->saved_raid_disk != disk)
conf->fullsync = 1;
p->rdev = rdev;
break;
}
Expand Down Expand Up @@ -1941,6 +2024,35 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
return 0;
}

static void raid5_quiesce(mddev_t *mddev, int state)
{
raid5_conf_t *conf = mddev_to_conf(mddev);

switch(state) {
case 1: /* stop all writes */
spin_lock_irq(&conf->device_lock);
conf->quiesce = 1;
wait_event_lock_irq(conf->wait_for_stripe,
atomic_read(&conf->active_stripes) == 0,
conf->device_lock, /* nothing */);
spin_unlock_irq(&conf->device_lock);
break;

case 0: /* re-enable writes */
spin_lock_irq(&conf->device_lock);
conf->quiesce = 0;
wake_up(&conf->wait_for_stripe);
spin_unlock_irq(&conf->device_lock);
break;
}
if (mddev->thread) {
if (mddev->bitmap)
mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
else
mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
md_wakeup_thread(mddev->thread);
}
}
static mdk_personality_t raid5_personality=
{
.name = "raid5",
Expand All @@ -1955,6 +2067,7 @@ static mdk_personality_t raid5_personality=
.spare_active = raid5_spare_active,
.sync_request = sync_request,
.resize = raid5_resize,
.quiesce = raid5_quiesce,
};

static int __init raid5_init (void)
Expand Down
14 changes: 13 additions & 1 deletion include/linux/raid/raid5.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ struct stripe_head {
unsigned long state; /* state flags */
atomic_t count; /* nr of active thread/requests */
spinlock_t lock;
int bm_seq; /* sequence number for bitmap flushes */
struct r5dev {
struct bio req;
struct bio_vec vec;
Expand Down Expand Up @@ -165,12 +166,13 @@ struct stripe_head {
/*
* Stripe state
*/
#define STRIPE_ERROR 1
#define STRIPE_HANDLE 2
#define STRIPE_SYNCING 3
#define STRIPE_INSYNC 4
#define STRIPE_PREREAD_ACTIVE 5
#define STRIPE_DELAYED 6
#define STRIPE_DEGRADED 7
#define STRIPE_BIT_DELAY 8

/*
* Plugging:
Expand Down Expand Up @@ -210,10 +212,20 @@ struct raid5_private_data {

struct list_head handle_list; /* stripes needing handling */
struct list_head delayed_list; /* stripes that have plugged requests */
struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */
atomic_t preread_active_stripes; /* stripes with scheduled io */

char cache_name[20];
kmem_cache_t *slab_cache; /* for allocating stripes */

int seq_flush, seq_write;
int quiesce;

int fullsync; /* set to 1 if a full sync is needed,
* (fresh device added).
* Cleared when a sync completes.
*/

/*
* Free stripes pool
*/
Expand Down

0 comments on commit 7262668

Please sign in to comment.