diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index c01d41198f5e6..b092c7b5282f9 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1790,6 +1790,8 @@ void md_bitmap_destroy(struct mddev *mddev) return; md_bitmap_wait_behind_writes(mddev); + mempool_destroy(mddev->wb_info_pool); + mddev->wb_info_pool = NULL; mutex_lock(&mddev->bitmap_info.mutex); spin_lock(&mddev->lock); @@ -1900,10 +1902,14 @@ int md_bitmap_load(struct mddev *mddev) sector_t start = 0; sector_t sector = 0; struct bitmap *bitmap = mddev->bitmap; + struct md_rdev *rdev; if (!bitmap) goto out; + rdev_for_each(rdev, mddev) + mddev_create_wb_pool(mddev, rdev, true); + if (mddev_is_clustered(mddev)) md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); @@ -2462,12 +2468,26 @@ static ssize_t backlog_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long backlog; + unsigned long old_mwb = mddev->bitmap_info.max_write_behind; int rv = kstrtoul(buf, 10, &backlog); if (rv) return rv; if (backlog > COUNTER_MAX) return -EINVAL; mddev->bitmap_info.max_write_behind = backlog; + if (!backlog && mddev->wb_info_pool) { + /* wb_info_pool is not needed if backlog is zero */ + mempool_destroy(mddev->wb_info_pool); + mddev->wb_info_pool = NULL; + } else if (backlog && !mddev->wb_info_pool) { + /* wb_info_pool is needed since backlog is not zero */ + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) + mddev_create_wb_pool(mddev, rdev, false); + } + if (old_mwb != backlog) + md_bitmap_update_sb(mddev->bitmap); return len; } diff --git a/drivers/md/md.c b/drivers/md/md.c index 1f37a1adc9266..692fc365e73c8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -37,6 +37,7 @@ */ +#include #include #include #include @@ -124,6 +125,77 @@ static inline int speed_max(struct mddev *mddev) mddev->sync_speed_max : sysctl_speed_limit_max; } +static int rdev_init_wb(struct md_rdev *rdev) +{ + if (rdev->bdev->bd_queue->nr_hw_queues == 1) + return 0; + + spin_lock_init(&rdev->wb_list_lock); + INIT_LIST_HEAD(&rdev->wb_list); + init_waitqueue_head(&rdev->wb_io_wait); + set_bit(WBCollisionCheck, &rdev->flags); + + return 1; +} + +/* + * Create wb_info_pool if rdev is the first multi-queue device flaged + * with writemostly, also write-behind mode is enabled. + */ +void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend) +{ + if (mddev->bitmap_info.max_write_behind == 0) + return; + + if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_wb(rdev)) + return; + + if (mddev->wb_info_pool == NULL) { + unsigned int noio_flag; + + if (!is_suspend) + mddev_suspend(mddev); + noio_flag = memalloc_noio_save(); + mddev->wb_info_pool = mempool_create_kmalloc_pool(NR_WB_INFOS, + sizeof(struct wb_info)); + memalloc_noio_restore(noio_flag); + if (!mddev->wb_info_pool) + pr_err("can't alloc memory pool for writemostly\n"); + if (!is_suspend) + mddev_resume(mddev); + } +} +EXPORT_SYMBOL_GPL(mddev_create_wb_pool); + +/* + * destroy wb_info_pool if rdev is the last device flaged with WBCollisionCheck. + */ +static void mddev_destroy_wb_pool(struct mddev *mddev, struct md_rdev *rdev) +{ + if (!test_and_clear_bit(WBCollisionCheck, &rdev->flags)) + return; + + if (mddev->wb_info_pool) { + struct md_rdev *temp; + int num = 0; + + /* + * Check if other rdevs need wb_info_pool. + */ + rdev_for_each(temp, mddev) + if (temp != rdev && + test_bit(WBCollisionCheck, &temp->flags)) + num++; + if (!num) { + mddev_suspend(rdev->mddev); + mempool_destroy(mddev->wb_info_pool); + mddev->wb_info_pool = NULL; + mddev_resume(rdev->mddev); + } + } +} + static struct ctl_table_header *raid_table_header; static struct ctl_table raid_table[] = { @@ -2210,6 +2282,9 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) rdev->mddev = mddev; pr_debug("md: bind<%s>\n", b); + if (mddev->raid_disks) + mddev_create_wb_pool(mddev, rdev, false); + if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) goto fail; @@ -2246,6 +2321,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev) bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b)); + mddev_destroy_wb_pool(rdev->mddev, rdev); rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); @@ -2758,8 +2834,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) } } else if (cmd_match(buf, "writemostly")) { set_bit(WriteMostly, &rdev->flags); + mddev_create_wb_pool(rdev->mddev, rdev, false); err = 0; } else if (cmd_match(buf, "-writemostly")) { + mddev_destroy_wb_pool(rdev->mddev, rdev); clear_bit(WriteMostly, &rdev->flags); err = 0; } else if (cmd_match(buf, "blocked")) { @@ -5588,15 +5666,28 @@ int md_run(struct mddev *mddev) mddev->bitmap = bitmap; } - if (err) { - mddev_detach(mddev); - if (mddev->private) - pers->free(mddev, mddev->private); - mddev->private = NULL; - module_put(pers->owner); - md_bitmap_destroy(mddev); - goto abort; + if (err) + goto bitmap_abort; + + if (mddev->bitmap_info.max_write_behind > 0) { + bool creat_pool = false; + + rdev_for_each(rdev, mddev) { + if (test_bit(WriteMostly, &rdev->flags) && + rdev_init_wb(rdev)) + creat_pool = true; + } + if (creat_pool && mddev->wb_info_pool == NULL) { + mddev->wb_info_pool = + mempool_create_kmalloc_pool(NR_WB_INFOS, + sizeof(struct wb_info)); + if (!mddev->wb_info_pool) { + err = -ENOMEM; + goto bitmap_abort; + } + } } + if (mddev->queue) { bool nonrot = true; @@ -5657,6 +5748,13 @@ int md_run(struct mddev *mddev) sysfs_notify(&mddev->kobj, NULL, "degraded"); return 0; +bitmap_abort: + mddev_detach(mddev); + if (mddev->private) + pers->free(mddev, mddev->private); + mddev->private = NULL; + module_put(pers->owner); + md_bitmap_destroy(mddev); abort: bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); @@ -5825,6 +5923,8 @@ static void __md_stop_writes(struct mddev *mddev) mddev->in_sync = 1; md_update_sb(mddev, 1); } + mempool_destroy(mddev->wb_info_pool); + mddev->wb_info_pool = NULL; } void md_stop_writes(struct mddev *mddev) diff --git a/drivers/md/md.h b/drivers/md/md.h index 7c930c0911932..10f98200e2f8a 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -109,6 +109,14 @@ struct md_rdev { * for reporting to userspace and storing * in superblock. */ + + /* + * The members for check collision of write behind IOs. + */ + struct list_head wb_list; + spinlock_t wb_list_lock; + wait_queue_head_t wb_io_wait; + struct work_struct del_work; /* used for delayed sysfs removal */ struct kernfs_node *sysfs_state; /* handle for 'state' @@ -193,6 +201,10 @@ enum flag_bits { * it didn't fail, so don't use FailFast * any more for metadata */ + WBCollisionCheck, /* + * multiqueue device should check if there + * is collision between write behind bios. + */ }; static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, @@ -245,6 +257,14 @@ enum mddev_sb_flags { MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */ }; +#define NR_WB_INFOS 8 +/* record current range of write behind IOs */ +struct wb_info { + sector_t lo; + sector_t hi; + struct list_head list; +}; + struct mddev { void *private; struct md_personality *pers; @@ -461,6 +481,7 @@ struct mddev { */ struct work_struct flush_work; struct work_struct event_work; /* used by dm to report failure event */ + mempool_t *wb_info_pool; void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); struct md_cluster_info *cluster_info; unsigned int good_device_nr; /* good device num within cluster raid */ @@ -709,6 +730,8 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void md_kick_rdev_from_array(struct md_rdev * rdev); +extern void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a7860b5f33f2d..3d44da6637973 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -50,6 +50,57 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr); #include "raid1-10.c" +static int check_and_add_wb(struct md_rdev *rdev, sector_t lo, sector_t hi) +{ + struct wb_info *wi, *temp_wi; + unsigned long flags; + int ret = 0; + struct mddev *mddev = rdev->mddev; + + wi = mempool_alloc(mddev->wb_info_pool, GFP_NOIO); + + spin_lock_irqsave(&rdev->wb_list_lock, flags); + list_for_each_entry(temp_wi, &rdev->wb_list, list) { + /* collision happened */ + if (hi > temp_wi->lo && lo < temp_wi->hi) { + ret = -EBUSY; + break; + } + } + + if (!ret) { + wi->lo = lo; + wi->hi = hi; + list_add(&wi->list, &rdev->wb_list); + } else + mempool_free(wi, mddev->wb_info_pool); + spin_unlock_irqrestore(&rdev->wb_list_lock, flags); + + return ret; +} + +static void remove_wb(struct md_rdev *rdev, sector_t lo, sector_t hi) +{ + struct wb_info *wi; + unsigned long flags; + int found = 0; + struct mddev *mddev = rdev->mddev; + + spin_lock_irqsave(&rdev->wb_list_lock, flags); + list_for_each_entry(wi, &rdev->wb_list, list) + if (hi == wi->hi && lo == wi->lo) { + list_del(&wi->list); + mempool_free(wi, mddev->wb_info_pool); + found = 1; + break; + } + + if (!found) + WARN_ON("The write behind IO is not recorded\n"); + spin_unlock_irqrestore(&rdev->wb_list_lock, flags); + wake_up(&rdev->wb_io_wait); +} + /* * for resync bio, r1bio pointer can be retrieved from the per-bio * 'struct resync_pages'. @@ -446,6 +497,12 @@ static void raid1_end_write_request(struct bio *bio) } if (behind) { + if (test_bit(WBCollisionCheck, &rdev->flags)) { + sector_t lo = r1_bio->sector; + sector_t hi = r1_bio->sector + r1_bio->sectors; + + remove_wb(rdev, lo, hi); + } if (test_bit(WriteMostly, &rdev->flags)) atomic_dec(&r1_bio->behind_remaining); @@ -1443,7 +1500,16 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); if (r1_bio->behind_master_bio) { - if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) + struct md_rdev *rdev = conf->mirrors[i].rdev; + + if (test_bit(WBCollisionCheck, &rdev->flags)) { + sector_t lo = r1_bio->sector; + sector_t hi = r1_bio->sector + r1_bio->sectors; + + wait_event(rdev->wb_io_wait, + check_and_add_wb(rdev, lo, hi) == 0); + } + if (test_bit(WriteMostly, &rdev->flags)) atomic_inc(&r1_bio->behind_remaining); }