Skip to content

Commit

Permalink
Merge tag 'md/4.12-rc2' of git://git.kernel.org/pub/scm/linux/kernel/…
Browse files Browse the repository at this point in the history
…git/shli/md

Pull MD fixes from Shaohua Li:

 - Several bug fixes for raid5-cache from Song Liu, mainly handle
   journal disk error

 - Fix bad block handling in choosing raid1 disk from Tomasz Majchrzak

 - Simplify external metadata array sysfs handling from Artur
   Paszkiewicz

 - Optimize raid0 discard handling from me, now raid0 will dispatch
   large discard IO directly to underlayer disks.

* tag 'md/4.12-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md:
  raid1: prefer disk without bad blocks
  md/r5cache: handle sync with data in write back cache
  md/r5cache: gracefully handle journal device errors for writeback mode
  md/raid1/10: avoid unnecessary locking
  md/raid5-cache: in r5l_do_submit_io(), submit io->split_bio first
  md/md0: optimize raid0 discard handling
  md: don't return -EAGAIN in md_allow_write for external metadata arrays
  md/raid5: make use of spin_lock_irq over local_irq_disable + spin_lock
  • Loading branch information
Linus Torvalds committed May 18, 2017
2 parents 667f867 + d82dd0e commit 8b4822d
Show file tree
Hide file tree
Showing 8 changed files with 209 additions and 86 deletions.
20 changes: 8 additions & 12 deletions drivers/md/md.c
Original file line number Diff line number Diff line change
Expand Up @@ -8022,18 +8022,15 @@ EXPORT_SYMBOL(md_write_end);
* may proceed without blocking. It is important to call this before
* attempting a GFP_KERNEL allocation while holding the mddev lock.
* Must be called with mddev_lock held.
*
* In the ->external case MD_SB_CHANGE_PENDING can not be cleared until mddev->lock
* is dropped, so return -EAGAIN after notifying userspace.
*/
int md_allow_write(struct mddev *mddev)
void md_allow_write(struct mddev *mddev)
{
if (!mddev->pers)
return 0;
return;
if (mddev->ro)
return 0;
return;
if (!mddev->pers->sync_request)
return 0;
return;

spin_lock(&mddev->lock);
if (mddev->in_sync) {
Expand All @@ -8046,13 +8043,12 @@ int md_allow_write(struct mddev *mddev)
spin_unlock(&mddev->lock);
md_update_sb(mddev, 0);
sysfs_notify_dirent_safe(mddev->sysfs_state);
/* wait for the dirty state to be recorded in the metadata */
wait_event(mddev->sb_wait,
!test_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags) &&
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
} else
spin_unlock(&mddev->lock);

if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
return -EAGAIN;
else
return 0;
}
EXPORT_SYMBOL_GPL(md_allow_write);

Expand Down
2 changes: 1 addition & 1 deletion drivers/md/md.h
Original file line number Diff line number Diff line change
Expand Up @@ -665,7 +665,7 @@ extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
bool metadata_op);
extern void md_do_sync(struct md_thread *thread);
extern void md_new_event(struct mddev *mddev);
extern int md_allow_write(struct mddev *mddev);
extern void md_allow_write(struct mddev *mddev);
extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev);
extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors);
extern int md_check_no_bitmap(struct mddev *mddev);
Expand Down
116 changes: 102 additions & 14 deletions drivers/md/raid0.c
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@ static int raid0_run(struct mddev *mddev)
blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_discard_sectors(mddev->queue, UINT_MAX);

blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
blk_queue_io_opt(mddev->queue,
Expand Down Expand Up @@ -459,6 +459,95 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
}
}

static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
{
struct r0conf *conf = mddev->private;
struct strip_zone *zone;
sector_t start = bio->bi_iter.bi_sector;
sector_t end;
unsigned int stripe_size;
sector_t first_stripe_index, last_stripe_index;
sector_t start_disk_offset;
unsigned int start_disk_index;
sector_t end_disk_offset;
unsigned int end_disk_index;
unsigned int disk;

zone = find_zone(conf, &start);

if (bio_end_sector(bio) > zone->zone_end) {
struct bio *split = bio_split(bio,
zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
mddev->bio_set);
bio_chain(split, bio);
generic_make_request(bio);
bio = split;
end = zone->zone_end;
} else
end = bio_end_sector(bio);

if (zone != conf->strip_zone)
end = end - zone[-1].zone_end;

/* Now start and end is the offset in zone */
stripe_size = zone->nb_dev * mddev->chunk_sectors;

first_stripe_index = start;
sector_div(first_stripe_index, stripe_size);
last_stripe_index = end;
sector_div(last_stripe_index, stripe_size);

start_disk_index = (int)(start - first_stripe_index * stripe_size) /
mddev->chunk_sectors;
start_disk_offset = ((int)(start - first_stripe_index * stripe_size) %
mddev->chunk_sectors) +
first_stripe_index * mddev->chunk_sectors;
end_disk_index = (int)(end - last_stripe_index * stripe_size) /
mddev->chunk_sectors;
end_disk_offset = ((int)(end - last_stripe_index * stripe_size) %
mddev->chunk_sectors) +
last_stripe_index * mddev->chunk_sectors;

for (disk = 0; disk < zone->nb_dev; disk++) {
sector_t dev_start, dev_end;
struct bio *discard_bio = NULL;
struct md_rdev *rdev;

if (disk < start_disk_index)
dev_start = (first_stripe_index + 1) *
mddev->chunk_sectors;
else if (disk > start_disk_index)
dev_start = first_stripe_index * mddev->chunk_sectors;
else
dev_start = start_disk_offset;

if (disk < end_disk_index)
dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
else if (disk > end_disk_index)
dev_end = last_stripe_index * mddev->chunk_sectors;
else
dev_end = end_disk_offset;

if (dev_end <= dev_start)
continue;

rdev = conf->devlist[(zone - conf->strip_zone) *
conf->strip_zone[0].nb_dev + disk];
if (__blkdev_issue_discard(rdev->bdev,
dev_start + zone->dev_start + rdev->data_offset,
dev_end - dev_start, GFP_NOIO, 0, &discard_bio) ||
!discard_bio)
continue;
bio_chain(discard_bio, bio);
if (mddev->gendisk)
trace_block_bio_remap(bdev_get_queue(rdev->bdev),
discard_bio, disk_devt(mddev->gendisk),
bio->bi_iter.bi_sector);
generic_make_request(discard_bio);
}
bio_endio(bio);
}

static void raid0_make_request(struct mddev *mddev, struct bio *bio)
{
struct strip_zone *zone;
Expand All @@ -473,6 +562,11 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
return;
}

if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) {
raid0_handle_discard(mddev, bio);
return;
}

bio_sector = bio->bi_iter.bi_sector;
sector = bio_sector;
chunk_sects = mddev->chunk_sectors;
Expand All @@ -498,19 +592,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
bio->bi_iter.bi_sector = sector + zone->dev_start +
tmp_dev->data_offset;

if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
!blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
/* Just ignore it */
bio_endio(bio);
} else {
if (mddev->gendisk)
trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
bio, disk_devt(mddev->gendisk),
bio_sector);
mddev_check_writesame(mddev, bio);
mddev_check_write_zeroes(mddev, bio);
generic_make_request(bio);
}
if (mddev->gendisk)
trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
bio, disk_devt(mddev->gendisk),
bio_sector);
mddev_check_writesame(mddev, bio);
mddev_check_write_zeroes(mddev, bio);
generic_make_request(bio);
}

static void raid0_status(struct seq_file *seq, struct mddev *mddev)
Expand Down
21 changes: 10 additions & 11 deletions drivers/md/raid1.c
Original file line number Diff line number Diff line change
Expand Up @@ -666,8 +666,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
break;
}
continue;
} else
} else {
if ((sectors > best_good_sectors) && (best_disk >= 0))
best_disk = -1;
best_good_sectors = sectors;
}

if (best_disk >= 0)
/* At least two disks to choose from so failfast is OK */
Expand Down Expand Up @@ -1529,17 +1532,16 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
plug = container_of(cb, struct raid1_plug_cb, cb);
else
plug = NULL;
spin_lock_irqsave(&conf->device_lock, flags);
if (plug) {
bio_list_add(&plug->pending, mbio);
plug->pending_cnt++;
} else {
spin_lock_irqsave(&conf->device_lock, flags);
bio_list_add(&conf->pending_bio_list, mbio);
conf->pending_count++;
}
spin_unlock_irqrestore(&conf->device_lock, flags);
if (!plug)
spin_unlock_irqrestore(&conf->device_lock, flags);
md_wakeup_thread(mddev->thread);
}
}

r1_bio_write_done(r1_bio);
Expand Down Expand Up @@ -3197,7 +3199,7 @@ static int raid1_reshape(struct mddev *mddev)
struct r1conf *conf = mddev->private;
int cnt, raid_disks;
unsigned long flags;
int d, d2, err;
int d, d2;

/* Cannot change chunk_size, layout, or level */
if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
Expand All @@ -3209,11 +3211,8 @@ static int raid1_reshape(struct mddev *mddev)
return -EINVAL;
}

if (!mddev_is_clustered(mddev)) {
err = md_allow_write(mddev);
if (err)
return err;
}
if (!mddev_is_clustered(mddev))
md_allow_write(mddev);

raid_disks = mddev->raid_disks + mddev->delta_disks;

Expand Down
7 changes: 3 additions & 4 deletions drivers/md/raid10.c
Original file line number Diff line number Diff line change
Expand Up @@ -1282,17 +1282,16 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
plug = container_of(cb, struct raid10_plug_cb, cb);
else
plug = NULL;
spin_lock_irqsave(&conf->device_lock, flags);
if (plug) {
bio_list_add(&plug->pending, mbio);
plug->pending_cnt++;
} else {
spin_lock_irqsave(&conf->device_lock, flags);
bio_list_add(&conf->pending_bio_list, mbio);
conf->pending_count++;
}
spin_unlock_irqrestore(&conf->device_lock, flags);
if (!plug)
spin_unlock_irqrestore(&conf->device_lock, flags);
md_wakeup_thread(mddev->thread);
}
}

static void raid10_write_request(struct mddev *mddev, struct bio *bio,
Expand Down
47 changes: 35 additions & 12 deletions drivers/md/raid5-cache.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "md.h"
#include "raid5.h"
#include "bitmap.h"
#include "raid5-log.h"

/*
* metadata/data stored in disk with 4k size unit (a block) regardless
Expand Down Expand Up @@ -622,20 +623,30 @@ static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
spin_unlock_irqrestore(&log->io_list_lock, flags);

/*
* In case of journal device failures, submit_bio will get error
* and calls endio, then active stripes will continue write
* process. Therefore, it is not necessary to check Faulty bit
* of journal device here.
*
* We can't check split_bio after current_bio is submitted. If
* io->split_bio is null, after current_bio is submitted, current_bio
* might already be completed and the io_unit is freed. We submit
* split_bio first to avoid the issue.
*/
if (io->split_bio) {
if (io->has_flush)
io->split_bio->bi_opf |= REQ_PREFLUSH;
if (io->has_fua)
io->split_bio->bi_opf |= REQ_FUA;
submit_bio(io->split_bio);
}

if (io->has_flush)
io->current_bio->bi_opf |= REQ_PREFLUSH;
if (io->has_fua)
io->current_bio->bi_opf |= REQ_FUA;
submit_bio(io->current_bio);

if (!io->split_bio)
return;

if (io->has_flush)
io->split_bio->bi_opf |= REQ_PREFLUSH;
if (io->has_fua)
io->split_bio->bi_opf |= REQ_FUA;
submit_bio(io->split_bio);
}

/* deferred io_unit will be dispatched here */
Expand Down Expand Up @@ -670,6 +681,11 @@ static void r5c_disable_writeback_async(struct work_struct *work)
return;
pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
mdname(mddev));

/* wait superblock change before suspend */
wait_event(mddev->sb_wait,
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));

mddev_suspend(mddev);
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
mddev_resume(mddev);
Expand Down Expand Up @@ -2621,8 +2637,11 @@ int r5c_try_caching_write(struct r5conf *conf,
* When run in degraded mode, array is set to write-through mode.
* This check helps drain pending write safely in the transition to
* write-through mode.
*
* When a stripe is syncing, the write is also handled in write
* through mode.
*/
if (s->failed) {
if (s->failed || test_bit(STRIPE_SYNCING, &sh->state)) {
r5c_make_stripe_write_out(sh);
return -EAGAIN;
}
Expand Down Expand Up @@ -2825,6 +2844,9 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
}

r5l_append_flush_payload(log, sh->sector);
/* stripe is flused to raid disks, we can do resync now */
if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
set_bit(STRIPE_HANDLE, &sh->state);
}

int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
Expand Down Expand Up @@ -2973,15 +2995,16 @@ static int r5l_load_log(struct r5l_log *log)
return ret;
}

void r5c_update_on_rdev_error(struct mddev *mddev)
void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
{
struct r5conf *conf = mddev->private;
struct r5l_log *log = conf->log;

if (!log)
return;

if (raid5_calc_degraded(conf) > 0 &&
if ((raid5_calc_degraded(conf) > 0 ||
test_bit(Journal, &rdev->flags)) &&
conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
schedule_work(&log->disable_writeback_work);
}
Expand Down
3 changes: 2 additions & 1 deletion drivers/md/raid5-log.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ extern void r5c_flush_cache(struct r5conf *conf, int num);
extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
extern void r5c_check_cached_full_stripe(struct r5conf *conf);
extern struct md_sysfs_entry r5c_journal_mode;
extern void r5c_update_on_rdev_error(struct mddev *mddev);
extern void r5c_update_on_rdev_error(struct mddev *mddev,
struct md_rdev *rdev);
extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);

extern struct dma_async_tx_descriptor *
Expand Down
Loading

0 comments on commit 8b4822d

Please sign in to comment.