Skip to content

Commit

Permalink
md/raid1: avoid writing to known-bad blocks on known-bad drives.
Browse files Browse the repository at this point in the history
If we have seen any write error on a drive, then don't write to
any known-bad blocks on that drive.
If necessary, we divide the write request up into pieces, just
as we do for reads, so that each piece is either written in full
or not written at all to any given drive.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
  • Loading branch information
NeilBrown committed Jul 28, 2011
1 parent 0b7d838 commit 1f68f0c
Showing 1 changed file with 115 additions and 38 deletions.
153 changes: 115 additions & 38 deletions drivers/md/raid1.c
Original file line number Diff line number Diff line change
Expand Up @@ -764,14 +764,17 @@ static int make_request(mddev_t *mddev, struct bio * bio)
mirror_info_t *mirror;
r1bio_t *r1_bio;
struct bio *read_bio;
int i, targets = 0, disks;
int i, disks;
struct bitmap *bitmap;
unsigned long flags;
const int rw = bio_data_dir(bio);
const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
mdk_rdev_t *blocked_rdev;
int plugged;
int first_clone;
int sectors_handled;
int max_sectors;

/*
* Register the new request and wait if the reconstruction
Expand Down Expand Up @@ -832,7 +835,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
/*
* read balancing logic:
*/
int max_sectors;
int rdisk;

read_again:
Expand Down Expand Up @@ -872,7 +874,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
/* could not read all from this device, so we will
* need another r1_bio.
*/
int sectors_handled;

sectors_handled = (r1_bio->sector + max_sectors
- bio->bi_sector);
Expand Down Expand Up @@ -906,34 +907,80 @@ static int make_request(mddev_t *mddev, struct bio * bio)
/*
* WRITE:
*/
/* first select target devices under spinlock and
/* first select target devices under rcu_lock and
* inc refcount on their rdev. Record them by setting
* bios[x] to bio
* If there are known/acknowledged bad blocks on any device on
* which we have seen a write error, we want to avoid writing those
* blocks.
* This potentially requires several writes to write around
* the bad blocks. Each set of writes gets its own r1bio
* with a set of bios attached.
*/
plugged = mddev_check_plugged(mddev);

disks = conf->raid_disks;
retry_write:
blocked_rdev = NULL;
rcu_read_lock();
max_sectors = r1_bio->sectors;
for (i = 0; i < disks; i++) {
mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
atomic_inc(&rdev->nr_pending);
blocked_rdev = rdev;
break;
}
if (rdev && !test_bit(Faulty, &rdev->flags)) {
atomic_inc(&rdev->nr_pending);
if (test_bit(Faulty, &rdev->flags)) {
r1_bio->bios[i] = NULL;
if (!rdev || test_bit(Faulty, &rdev->flags)) {
set_bit(R1BIO_Degraded, &r1_bio->state);
continue;
}

atomic_inc(&rdev->nr_pending);
if (test_bit(WriteErrorSeen, &rdev->flags)) {
sector_t first_bad;
int bad_sectors;
int is_bad;

is_bad = is_badblock(rdev, r1_bio->sector,
max_sectors,
&first_bad, &bad_sectors);
if (is_bad < 0) {
/* mustn't write here until the bad block is
* acknowledged*/
set_bit(BlockedBadBlocks, &rdev->flags);
blocked_rdev = rdev;
break;
}
if (is_bad && first_bad <= r1_bio->sector) {
/* Cannot write here at all */
bad_sectors -= (r1_bio->sector - first_bad);
if (bad_sectors < max_sectors)
/* mustn't write more than bad_sectors
* to other devices yet
*/
max_sectors = bad_sectors;
rdev_dec_pending(rdev, mddev);
r1_bio->bios[i] = NULL;
} else {
r1_bio->bios[i] = bio;
targets++;
/* We don't set R1BIO_Degraded as that
* only applies if the disk is
* missing, so it might be re-added,
* and we want to know to recover this
* chunk.
* In this case the device is here,
* and the fact that this chunk is not
* in-sync is recorded in the bad
* block log
*/
continue;
}
} else
r1_bio->bios[i] = NULL;
if (is_bad) {
int good_sectors = first_bad - r1_bio->sector;
if (good_sectors < max_sectors)
max_sectors = good_sectors;
}
}
r1_bio->bios[i] = bio;
}
rcu_read_unlock();

Expand All @@ -944,48 +991,56 @@ static int make_request(mddev_t *mddev, struct bio * bio)
for (j = 0; j < i; j++)
if (r1_bio->bios[j])
rdev_dec_pending(conf->mirrors[j].rdev, mddev);

r1_bio->state = 0;
allow_barrier(conf);
md_wait_for_blocked_rdev(blocked_rdev, mddev);
wait_barrier(conf);
goto retry_write;
}

if (targets < conf->raid_disks) {
/* array is degraded, we will not clear the bitmap
* on I/O completion (see raid1_end_write_request) */
set_bit(R1BIO_Degraded, &r1_bio->state);
if (max_sectors < r1_bio->sectors) {
/* We are splitting this write into multiple parts, so
* we need to prepare for allocating another r1_bio.
*/
r1_bio->sectors = max_sectors;
spin_lock_irq(&conf->device_lock);
if (bio->bi_phys_segments == 0)
bio->bi_phys_segments = 2;
else
bio->bi_phys_segments++;
spin_unlock_irq(&conf->device_lock);
}

/* do behind I/O ?
* Not if there are too many, or cannot allocate memory,
* or a reader on WriteMostly is waiting for behind writes
* to flush */
if (bitmap &&
(atomic_read(&bitmap->behind_writes)
< mddev->bitmap_info.max_write_behind) &&
!waitqueue_active(&bitmap->behind_wait))
alloc_behind_pages(bio, r1_bio);
sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector;

atomic_set(&r1_bio->remaining, 1);
atomic_set(&r1_bio->behind_remaining, 0);

bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
test_bit(R1BIO_BehindIO, &r1_bio->state));
first_clone = 1;
for (i = 0; i < disks; i++) {
struct bio *mbio;
if (!r1_bio->bios[i])
continue;

mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
r1_bio->bios[i] = mbio;

mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
mbio->bi_end_io = raid1_end_write_request;
mbio->bi_rw = WRITE | do_flush_fua | do_sync;
mbio->bi_private = r1_bio;

md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors);

if (first_clone) {
/* do behind I/O ?
* Not if there are too many, or cannot
* allocate memory, or a reader on WriteMostly
* is waiting for behind writes to flush */
if (bitmap &&
(atomic_read(&bitmap->behind_writes)
< mddev->bitmap_info.max_write_behind) &&
!waitqueue_active(&bitmap->behind_wait))
alloc_behind_pages(mbio, r1_bio);

bitmap_startwrite(bitmap, r1_bio->sector,
r1_bio->sectors,
test_bit(R1BIO_BehindIO,
&r1_bio->state));
first_clone = 0;
}
if (r1_bio->behind_pages) {
struct bio_vec *bvec;
int j;
Expand All @@ -1003,6 +1058,15 @@ static int make_request(mddev_t *mddev, struct bio * bio)
atomic_inc(&r1_bio->behind_remaining);
}

r1_bio->bios[i] = mbio;

mbio->bi_sector = (r1_bio->sector +
conf->mirrors[i].rdev->data_offset);
mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
mbio->bi_end_io = raid1_end_write_request;
mbio->bi_rw = WRITE | do_flush_fua | do_sync;
mbio->bi_private = r1_bio;

atomic_inc(&r1_bio->remaining);
spin_lock_irqsave(&conf->device_lock, flags);
bio_list_add(&conf->pending_bio_list, mbio);
Expand All @@ -1013,6 +1077,19 @@ static int make_request(mddev_t *mddev, struct bio * bio)
/* In case raid1d snuck in to freeze_array */
wake_up(&conf->wait_barrier);

if (sectors_handled < (bio->bi_size >> 9)) {
/* We need another r1_bio. It has already been counted
* in bio->bi_phys_segments
*/
r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
r1_bio->master_bio = bio;
r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
r1_bio->state = 0;
r1_bio->mddev = mddev;
r1_bio->sector = bio->bi_sector + sectors_handled;
goto retry_write;
}

if (do_sync || !bitmap || !plugged)
md_wakeup_thread(mddev->thread);

Expand Down

0 comments on commit 1f68f0c

Please sign in to comment.