From e2218350465e7e0931676b4849b594c978437bce Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 12 May 2010 08:25:37 +1000 Subject: [PATCH 01/45] md: set mddev readonly flag on blkdev BLKROSET ioctl When the user sets the block device to readwrite then the mddev should follow suit. Otherwise, the BUG_ON in md_write_start() will be set to trigger. The reverse direction, setting mddev->ro to match a set readonly request, can be ignored because the blkdev level readonly flag precludes the need to have mddev->ro set correctly. Nevermind the fact that setting mddev->ro to 1 may fail if the array is in use. Cc: Signed-off-by: Dan Williams Signed-off-by: NeilBrown --- drivers/md/md.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index a20a71e5efd3c..08f665178c3b0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5489,6 +5489,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, int err = 0; void __user *argp = (void __user *)arg; mddev_t *mddev = NULL; + int ro; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -5624,6 +5625,34 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, err = do_md_stop(mddev, 1, 1); goto done_unlock; + case BLKROSET: + if (get_user(ro, (int __user *)(arg))) { + err = -EFAULT; + goto done_unlock; + } + err = -EINVAL; + + /* if the bdev is going readonly the value of mddev->ro + * does not matter, no writes are coming + */ + if (ro) + goto done_unlock; + + /* are we are already prepared for writes? */ + if (mddev->ro != 1) + goto done_unlock; + + /* transitioning to readauto need only happen for + * arrays that call md_write_start + */ + if (mddev->pers) { + err = restart_array(mddev); + if (err == 0) { + mddev->ro = 2; + set_disk_ro(mddev->gendisk, 0); + } + } + goto done_unlock; } /* From ef2f80ff7325b2c1888ff02ead28957b5840bf51 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 17 May 2010 11:27:00 +1000 Subject: [PATCH 02/45] md/linear: avoid possible oops and array stop Since commit ef286f6fa673cd7fb367e1b145069d8dbfcc6081 it has been important that each personality clears ->private in the ->stop() function, or sets it to a attribute group to be removed. linear.c doesn't. This can sometimes lead to an oops, though it doesn't always. Suitable for 2.6.33-stable and 2.6.34. Signed-off-by: NeilBrown Cc: stable@kernel.org --- drivers/md/linear.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index bb2a23159b214..9db8ee0614a41 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -281,6 +281,7 @@ static int linear_stop (mddev_t *mddev) rcu_barrier(); blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ kfree(conf); + mddev->private = NULL; return 0; } From b6eb127d274385d81ce8dd45c98190f097bce1b4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 15 Apr 2010 10:13:47 +1000 Subject: [PATCH 03/45] md: remove unneeded sysfs files more promptly When an array is stopped we need to remove some sysfs files which are dependent on the type of array. We need to delay that deletion as deleting them while holding reconfig_mutex can lead to deadlocks. We currently delay them until the array is completely destroyed. However it is possible to deactivate and then reactivate the array. It is also possible to need to remove sysfs files when changing level, which can potentially happen several times before an array is destroyed. So we need to delete these files more promptly: as soon as reconfig_mutex is dropped. We need to ensure this happens before do_md_run can restart the array, so we use open_mutex for some extra locking. This is not deadlock prone. Cc: stable@kernel.org Signed-off-by: NeilBrown --- drivers/md/md.c | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 08f665178c3b0..edf777f6fe561 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -507,9 +507,32 @@ static inline int mddev_trylock(mddev_t * mddev) return mutex_trylock(&mddev->reconfig_mutex); } +static struct attribute_group md_redundancy_group; + static inline void mddev_unlock(mddev_t * mddev) { - mutex_unlock(&mddev->reconfig_mutex); + if (mddev->pers == NULL && mddev->private) { + /* These cannot be removed under reconfig_mutex as + * an access to the files will try to take reconfig_mutex + * while holding the file unremovable, which leads to + * a deadlock. + * So hold open_mutex instead - we are allowed to take + * it while holding reconfig_mutex, and md_run can + * use it to wait for the remove to complete. + */ + mutex_lock(&mddev->open_mutex); + mutex_unlock(&mddev->reconfig_mutex); + + sysfs_remove_group(&mddev->kobj, &md_redundancy_group); + if (mddev->private != (void*)1) + sysfs_remove_group(&mddev->kobj, mddev->private); + if (mddev->sysfs_action) + sysfs_put(mddev->sysfs_action); + mddev->sysfs_action = NULL; + mddev->private = NULL; + mutex_unlock(&mddev->open_mutex); + } else + mutex_unlock(&mddev->reconfig_mutex); md_wakeup_thread(mddev->thread); } @@ -4075,15 +4098,6 @@ static void mddev_delayed_delete(struct work_struct *ws) { mddev_t *mddev = container_of(ws, mddev_t, del_work); - if (mddev->private) { - sysfs_remove_group(&mddev->kobj, &md_redundancy_group); - if (mddev->private != (void*)1) - sysfs_remove_group(&mddev->kobj, mddev->private); - if (mddev->sysfs_action) - sysfs_put(mddev->sysfs_action); - mddev->sysfs_action = NULL; - mddev->private = NULL; - } sysfs_remove_group(&mddev->kobj, &md_bitmap_group); kobject_del(&mddev->kobj); kobject_put(&mddev->kobj); @@ -4241,6 +4255,13 @@ static int do_md_run(mddev_t * mddev) if (mddev->pers) return -EBUSY; + /* These two calls synchronise us with the + * sysfs_remove_group calls in mddev_unlock, + * so they must have completed. + */ + mutex_lock(&mddev->open_mutex); + mutex_unlock(&mddev->open_mutex); + /* * Analyze all RAID superblock(s) */ From a64c876fd357906a1f7193723866562ad290654c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 14 Apr 2010 17:15:37 +1000 Subject: [PATCH 04/45] md: manage redundancy group in sysfs when changing level. Some levels expect the 'redundancy group' to be present, others don't. So when we change level of an array we might need to add or remove this group. This requires fixing up the current practice of overloading ->private to indicate (when ->pers == NULL) that something needs to be removed. So create a new ->to_remove to fill that role. When changing levels, we may need to add or remove attributes. When changing RAID5 -> RAID6, we both add and remove the same thing. It is important to catch this and optimise it out as the removal is delayed until a lock is released, so trying to add immediately would cause problems. Cc: stable@kernel.org Signed-off-by: NeilBrown --- drivers/md/md.c | 43 ++++++++++++++++++++++++++++++++----------- drivers/md/md.h | 1 + drivers/md/raid5.c | 7 +++++-- 3 files changed, 38 insertions(+), 13 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index edf777f6fe561..e8d238885cd2e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -509,9 +509,9 @@ static inline int mddev_trylock(mddev_t * mddev) static struct attribute_group md_redundancy_group; -static inline void mddev_unlock(mddev_t * mddev) +static void mddev_unlock(mddev_t * mddev) { - if (mddev->pers == NULL && mddev->private) { + if (mddev->to_remove) { /* These cannot be removed under reconfig_mutex as * an access to the files will try to take reconfig_mutex * while holding the file unremovable, which leads to @@ -520,16 +520,20 @@ static inline void mddev_unlock(mddev_t * mddev) * it while holding reconfig_mutex, and md_run can * use it to wait for the remove to complete. */ + struct attribute_group *to_remove = mddev->to_remove; + mddev->to_remove = NULL; mutex_lock(&mddev->open_mutex); mutex_unlock(&mddev->reconfig_mutex); - sysfs_remove_group(&mddev->kobj, &md_redundancy_group); - if (mddev->private != (void*)1) - sysfs_remove_group(&mddev->kobj, mddev->private); - if (mddev->sysfs_action) - sysfs_put(mddev->sysfs_action); - mddev->sysfs_action = NULL; - mddev->private = NULL; + if (to_remove != &md_redundancy_group) + sysfs_remove_group(&mddev->kobj, to_remove); + if (mddev->pers == NULL || + mddev->pers->sync_request == NULL) { + sysfs_remove_group(&mddev->kobj, &md_redundancy_group); + if (mddev->sysfs_action) + sysfs_put(mddev->sysfs_action); + mddev->sysfs_action = NULL; + } mutex_unlock(&mddev->open_mutex); } else mutex_unlock(&mddev->reconfig_mutex); @@ -2996,6 +3000,23 @@ level_store(mddev_t *mddev, const char *buf, size_t len) /* Looks like we have a winner */ mddev_suspend(mddev); mddev->pers->stop(mddev); + + if (mddev->pers->sync_request == NULL && + pers->sync_request != NULL) { + /* need to add the md_redundancy_group */ + if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) + printk(KERN_WARNING + "md: cannot register extra attributes for %s\n", + mdname(mddev)); + mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); + } + if (mddev->pers->sync_request != NULL && + pers->sync_request == NULL) { + /* need to remove the md_redundancy_group */ + if (mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; + } + module_put(mddev->pers->owner); /* Invalidate devices that are now superfluous */ list_for_each_entry(rdev, &mddev->disks, same_set) @@ -4550,8 +4571,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) mddev->queue->unplug_fn = NULL; mddev->queue->backing_dev_info.congested_fn = NULL; module_put(mddev->pers->owner); - if (mddev->pers->sync_request && mddev->private == NULL) - mddev->private = (void*)1; + if (mddev->pers->sync_request && mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; mddev->pers = NULL; /* tell userspace to handle 'inactive' */ sysfs_notify_dirent(mddev->sysfs_state); diff --git a/drivers/md/md.h b/drivers/md/md.h index 8e4c75c00d462..722f5dfe1953e 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -305,6 +305,7 @@ struct mddev_s atomic_t max_corr_read_errors; /* max read retries */ struct list_head all_mddevs; + struct attribute_group *to_remove; /* Generic barrier handling. * If there is a pending barrier request, all other * writes are blocked while the devices are flushed. diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 70ffbd071b2e7..a361398875d0f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5090,7 +5090,9 @@ static int run(mddev_t *mddev) } /* Ok, everything is just fine now */ - if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) + if (mddev->to_remove == &raid5_attrs_group) + mddev->to_remove = NULL; + else if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) printk(KERN_WARNING "raid5: failed to create sysfs attributes for %s\n", mdname(mddev)); @@ -5137,7 +5139,8 @@ static int stop(mddev_t *mddev) mddev->queue->backing_dev_info.congested_fn = NULL; blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ free_conf(conf); - mddev->private = &raid5_attrs_group; + mddev->private = NULL; + mddev->to_remove = &raid5_attrs_group; return 0; } From 964147d5c86d63be79b442c30f3783d49860c078 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 18 May 2010 15:27:13 +1000 Subject: [PATCH 05/45] md/raid1: fix counting of write targets. There is a very small race window when writing to a RAID1 such that if a device is marked faulty at exactly the wrong time, the write-in-progress will not be sent to the device, but the bitmap (if present) will be updated to say that the write was sent. Then if the device turned out to still be usable as was re-added to the array, the bitmap-based-resync would skip resyncing that block, possibly leading to corruption. This would only be a problem if no further writes were issued to that area of the device (i.e. that bitmap chunk). Suitable for any pending -stable kernel. Cc: stable@kernel.org Signed-off-by: NeilBrown --- drivers/md/raid1.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f741f77eeb2b4..1ab30f64848f7 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -911,9 +911,10 @@ static int make_request(struct request_queue *q, struct bio * bio) if (test_bit(Faulty, &rdev->flags)) { rdev_dec_pending(rdev, mddev); r1_bio->bios[i] = NULL; - } else + } else { r1_bio->bios[i] = bio; - targets++; + targets++; + } } else r1_bio->bios[i] = NULL; } From ee8b81b03dffa1c0075553d01c557714aedb85a1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 8 Mar 2010 16:02:36 +1100 Subject: [PATCH 06/45] md: remove some dead fields from mddev_s These fields have never been used. commit 4b6d287f627b5fb6a49f78f9e81649ff98c62bb7 added them, but also added identical files to bitmap_super_s, and only used the latter. So remove these unused fields. Signed-off-by: NeilBrown --- drivers/md/md.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/md/md.h b/drivers/md/md.h index 722f5dfe1953e..05145786b50fb 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -279,9 +279,6 @@ struct mddev_s atomic_t writes_pending; struct request_queue *queue; /* for plugging ... */ - atomic_t write_behind; /* outstanding async IO */ - unsigned int max_write_behind; /* 0 = sync */ - struct bitmap *bitmap; /* the bitmap for the device */ struct { struct file *file; /* the bitmap file */ From 696fcd535b5a8cfc0617e9cf1d9d69a13895cc1e Mon Sep 17 00:00:00 2001 From: Paul Clements Date: Mon, 8 Mar 2010 16:02:37 +1100 Subject: [PATCH 07/45] md: expose max value of behind writes counter Keep track of the maximum number of concurrent write-behind requests for an md array and exposed this number in sysfs at md/bitmap/max_backlog_used Writing any value to this file will clear it. This allows userspace to be involved in tuning bitmap/backlog. Signed-off-by: Paul Clements Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 29 ++++++++++++++++++++++++++++- drivers/md/bitmap.h | 1 + 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 26ac8aad0b199..6279393db64d0 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1292,9 +1292,14 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect if (!bitmap) return 0; if (behind) { + int bw; atomic_inc(&bitmap->behind_writes); + bw = atomic_read(&bitmap->behind_writes); + if (bw > bitmap->behind_writes_used) + bitmap->behind_writes_used = bw; + PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n", - atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); + bw, bitmap->max_write_behind); } while (sectors) { @@ -2006,6 +2011,27 @@ static ssize_t can_clear_store(mddev_t *mddev, const char *buf, size_t len) static struct md_sysfs_entry bitmap_can_clear = __ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store); +static ssize_t +behind_writes_used_show(mddev_t *mddev, char *page) +{ + if (mddev->bitmap == NULL) + return sprintf(page, "0\n"); + return sprintf(page, "%lu\n", + mddev->bitmap->behind_writes_used); +} + +static ssize_t +behind_writes_used_reset(mddev_t *mddev, const char *buf, size_t len) +{ + if (mddev->bitmap) + mddev->bitmap->behind_writes_used = 0; + return len; +} + +static struct md_sysfs_entry max_backlog_used = +__ATTR(max_backlog_used, S_IRUGO | S_IWUSR, + behind_writes_used_show, behind_writes_used_reset); + static struct attribute *md_bitmap_attrs[] = { &bitmap_location.attr, &bitmap_timeout.attr, @@ -2013,6 +2039,7 @@ static struct attribute *md_bitmap_attrs[] = { &bitmap_chunksize.attr, &bitmap_metadata.attr, &bitmap_can_clear.attr, + &max_backlog_used.attr, NULL }; struct attribute_group md_bitmap_group = { diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index cb821d76d1b4e..aa82b7caa85f0 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -227,6 +227,7 @@ struct bitmap { int allclean; atomic_t behind_writes; + unsigned long behind_writes_used; /* highest actual value at runtime */ /* * the bitmap daemon - periodically wakes up and sweeps the bitmap From 7b92813c3c0b6990f14838e3985fb385d2655d0c Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Mon, 8 Mar 2010 16:02:40 +1100 Subject: [PATCH 08/45] drivers/md: Remove unnecessary casts of void * void pointers do not need to be cast to other pointer types. Signed-off-by: H Hartley Sweeten Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 8 ++++---- drivers/md/faulty.c | 6 +++--- drivers/md/multipath.c | 2 +- drivers/md/raid1.c | 8 ++++---- drivers/md/raid10.c | 8 ++++---- drivers/md/raid5.c | 10 +++++----- 6 files changed, 21 insertions(+), 21 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 6279393db64d0..49d6080387c88 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -505,7 +505,7 @@ void bitmap_update_sb(struct bitmap *bitmap) return; } spin_unlock_irqrestore(&bitmap->lock, flags); - sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(bitmap->sb_page, KM_USER0); sb->events = cpu_to_le64(bitmap->mddev->events); if (bitmap->mddev->events < bitmap->events_cleared) { /* rocking back to read-only */ @@ -526,7 +526,7 @@ void bitmap_print_sb(struct bitmap *bitmap) if (!bitmap || !bitmap->sb_page) return; - sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(bitmap->sb_page, KM_USER0); printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); @@ -575,7 +575,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) return err; } - sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(bitmap->sb_page, KM_USER0); chunksize = le32_to_cpu(sb->chunksize); daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; @@ -661,7 +661,7 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, return 0; } spin_unlock_irqrestore(&bitmap->lock, flags); - sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(bitmap->sb_page, KM_USER0); old = le32_to_cpu(sb->state) & bits; switch (op) { case MASK_SET: sb->state |= cpu_to_le32(bits); diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 713acd02ab39f..608a8d3736e21 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -171,7 +171,7 @@ static void add_sector(conf_t *conf, sector_t start, int mode) static int make_request(struct request_queue *q, struct bio *bio) { mddev_t *mddev = q->queuedata; - conf_t *conf = (conf_t*)mddev->private; + conf_t *conf = mddev->private; int failit = 0; if (bio_data_dir(bio) == WRITE) { @@ -224,7 +224,7 @@ static int make_request(struct request_queue *q, struct bio *bio) static void status(struct seq_file *seq, mddev_t *mddev) { - conf_t *conf = (conf_t*)mddev->private; + conf_t *conf = mddev->private; int n; if ((n=atomic_read(&conf->counters[WriteTransient])) != 0) @@ -327,7 +327,7 @@ static int run(mddev_t *mddev) static int stop(mddev_t *mddev) { - conf_t *conf = (conf_t *)mddev->private; + conf_t *conf = mddev->private; kfree(conf); mddev->private = NULL; diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 5558ebc705c87..97befd5cc0e30 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -84,7 +84,7 @@ static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) static void multipath_end_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private); + struct multipath_bh *mp_bh = bio->bi_private; multipath_conf_t *conf = mp_bh->mddev->private; mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 1ab30f64848f7..23a7516abbfd4 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -262,7 +262,7 @@ static inline void update_head_pos(int disk, r1bio_t *r1_bio) static void raid1_end_read_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); + r1bio_t *r1_bio = bio->bi_private; int mirror; conf_t *conf = r1_bio->mddev->private; @@ -307,7 +307,7 @@ static void raid1_end_read_request(struct bio *bio, int error) static void raid1_end_write_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); + r1bio_t *r1_bio = bio->bi_private; int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); conf_t *conf = r1_bio->mddev->private; struct bio *to_put = NULL; @@ -1223,7 +1223,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number) static void end_sync_read(struct bio *bio, int error) { - r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); + r1bio_t *r1_bio = bio->bi_private; int i; for (i=r1_bio->mddev->raid_disks; i--; ) @@ -1246,7 +1246,7 @@ static void end_sync_read(struct bio *bio, int error) static void end_sync_write(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); + r1bio_t *r1_bio = bio->bi_private; mddev_t *mddev = r1_bio->mddev; conf_t *conf = mddev->private; int i; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index b4ba41ecbd203..b90fef607f63f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -254,7 +254,7 @@ static inline void update_head_pos(int slot, r10bio_t *r10_bio) static void raid10_end_read_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); + r10bio_t *r10_bio = bio->bi_private; int slot, dev; conf_t *conf = r10_bio->mddev->private; @@ -295,7 +295,7 @@ static void raid10_end_read_request(struct bio *bio, int error) static void raid10_end_write_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); + r10bio_t *r10_bio = bio->bi_private; int slot, dev; conf_t *conf = r10_bio->mddev->private; @@ -1223,7 +1223,7 @@ static int raid10_remove_disk(mddev_t *mddev, int number) static void end_sync_read(struct bio *bio, int error) { - r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); + r10bio_t *r10_bio = bio->bi_private; conf_t *conf = r10_bio->mddev->private; int i,d; @@ -1260,7 +1260,7 @@ static void end_sync_read(struct bio *bio, int error) static void end_sync_write(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); + r10bio_t *r10_bio = bio->bi_private; mddev_t *mddev = r10_bio->mddev; conf_t *conf = mddev->private; int i,d; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a361398875d0f..10af3715b1fca 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1618,7 +1618,7 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous) static void error(mddev_t *mddev, mdk_rdev_t *rdev) { char b[BDEVNAME_SIZE]; - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + raid5_conf_t *conf = mddev->private; pr_debug("raid5: error called\n"); if (!test_bit(Faulty, &rdev->flags)) { @@ -4057,7 +4057,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped * As the reads complete, handle_stripe will copy the data * into the destination stripe and release that stripe. */ - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + raid5_conf_t *conf = mddev->private; struct stripe_head *sh; sector_t first_sector, last_sector; int raid_disks = conf->previous_raid_disks; @@ -4266,7 +4266,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped /* FIXME go_faster isn't used */ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) { - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + raid5_conf_t *conf = mddev->private; struct stripe_head *sh; sector_t max_sector = mddev->dev_sectors; int sync_blocks; @@ -5132,7 +5132,7 @@ static int run(mddev_t *mddev) static int stop(mddev_t *mddev) { - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + raid5_conf_t *conf = mddev->private; md_unregister_thread(mddev->thread); mddev->thread = NULL; @@ -5181,7 +5181,7 @@ static void printall(struct seq_file *seq, raid5_conf_t *conf) static void status(struct seq_file *seq, mddev_t *mddev) { - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + raid5_conf_t *conf = mddev->private; int i; seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, From c0cc75f84e0e413bce2dcabea74ef418da45c7c1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 22 Mar 2010 10:28:51 +1100 Subject: [PATCH 09/45] md: discard StateChanged device flag. This was needed when sysfs files could only be 'notified' from process context. Now that we have sys_notify_direct, we can call it directly from an interrupt. Signed-off-by: NeilBrown --- drivers/md/md.c | 7 +------ drivers/md/md.h | 3 --- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index e8d238885cd2e..2a64cba9ea72f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5970,7 +5970,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) mddev->pers->error_handler(mddev,rdev); if (mddev->degraded) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - set_bit(StateChanged, &rdev->flags); + sysfs_notify_dirent(rdev->sysfs_state); set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -6962,11 +6962,6 @@ void md_check_recovery(mddev_t *mddev) if (mddev->flags) md_update_sb(mddev, 0); - list_for_each_entry(rdev, &mddev->disks, same_set) - if (test_and_clear_bit(StateChanged, &rdev->flags)) - sysfs_notify_dirent(rdev->sysfs_state); - - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { /* resync/recovery still happening */ diff --git a/drivers/md/md.h b/drivers/md/md.h index 05145786b50fb..e4836c68b73e0 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -74,9 +74,6 @@ struct mdk_rdev_s #define Blocked 8 /* An error occured on an externally * managed array, don't allow writes * until it is cleared */ -#define StateChanged 9 /* Faulty or Blocked has changed during - * interrupt, so it needs to be - * notified by the thread */ wait_queue_head_t blocked_wait; int desc_nr; /* descriptor index in the superblock */ From 84707f38e767ac470fd82af6c45a8cafe2bd1b9a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 16 Mar 2010 17:23:35 +1100 Subject: [PATCH 10/45] md: don't use mddev->raid_disks in raid0 or raid10 while array is active. In a subsequent patch we will make it possible to change mddev->raid_disks while a RAID0 or RAID10 array is active. This is part of the process of reshaping such an array. This means that we cannot use this value while processes requests (it is OK to use it during initialisation as we are locked against changes then). Both RAID0 and RAID10 have the same value stored in the private data structure, so use that value instead. Signed-off-by: NeilBrown --- drivers/md/raid0.c | 15 ++++++++++----- drivers/md/raid10.c | 16 ++++++++-------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 377cf2a3c3331..c2e0d1d281023 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -28,9 +28,10 @@ static void raid0_unplug(struct request_queue *q) mddev_t *mddev = q->queuedata; raid0_conf_t *conf = mddev->private; mdk_rdev_t **devlist = conf->devlist; + int raid_disks = conf->strip_zone[0].nb_dev; int i; - for (i=0; iraid_disks; i++) { + for (i=0; i < raid_disks; i++) { struct request_queue *r_queue = bdev_get_queue(devlist[i]->bdev); blk_unplug(r_queue); @@ -42,12 +43,13 @@ static int raid0_congested(void *data, int bits) mddev_t *mddev = data; raid0_conf_t *conf = mddev->private; mdk_rdev_t **devlist = conf->devlist; + int raid_disks = conf->strip_zone[0].nb_dev; int i, ret = 0; if (mddev_congested(mddev, bits)) return 1; - for (i = 0; i < mddev->raid_disks && !ret ; i++) { + for (i = 0; i < raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(devlist[i]->bdev); ret |= bdi_congested(&q->backing_dev_info, bits); @@ -65,6 +67,7 @@ static void dump_zones(mddev_t *mddev) sector_t zone_start = 0; char b[BDEVNAME_SIZE]; raid0_conf_t *conf = mddev->private; + int raid_disks = conf->strip_zone[0].nb_dev; printk(KERN_INFO "******* %s configuration *********\n", mdname(mddev)); h = 0; @@ -72,7 +75,7 @@ static void dump_zones(mddev_t *mddev) printk(KERN_INFO "zone%d=[", j); for (k = 0; k < conf->strip_zone[j].nb_dev; k++) printk("%s/", - bdevname(conf->devlist[j*mddev->raid_disks + bdevname(conf->devlist[j*raid_disks + k]->bdev, b)); printk("]\n"); @@ -401,6 +404,7 @@ static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone, unsigned int sect_in_chunk; sector_t chunk; raid0_conf_t *conf = mddev->private; + int raid_disks = conf->strip_zone[0].nb_dev; unsigned int chunk_sects = mddev->chunk_sectors; if (is_power_of_2(chunk_sects)) { @@ -423,7 +427,7 @@ static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone, * + the position in the chunk */ *sector_offset = (chunk * chunk_sects) + sect_in_chunk; - return conf->devlist[(zone - conf->strip_zone)*mddev->raid_disks + return conf->devlist[(zone - conf->strip_zone)*raid_disks + sector_div(sector, zone->nb_dev)]; } @@ -518,6 +522,7 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev) int j, k, h; char b[BDEVNAME_SIZE]; raid0_conf_t *conf = mddev->private; + int raid_disks = conf->strip_zone[0].nb_dev; sector_t zone_size; sector_t zone_start = 0; @@ -528,7 +533,7 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev) seq_printf(seq, "=["); for (k = 0; k < conf->strip_zone[j].nb_dev; k++) seq_printf(seq, "%s/", bdevname( - conf->devlist[j*mddev->raid_disks + k] + conf->devlist[j*raid_disks + k] ->bdev, b)); zone_size = conf->strip_zone[j].zone_end - zone_start; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index b90fef607f63f..044c1157d98d8 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -600,7 +600,7 @@ static void unplug_slaves(mddev_t *mddev) int i; rcu_read_lock(); - for (i=0; iraid_disks; i++) { + for (i=0; i < conf->raid_disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { struct request_queue *r_queue = bdev_get_queue(rdev->bdev); @@ -634,7 +634,7 @@ static int raid10_congested(void *data, int bits) if (mddev_congested(mddev, bits)) return 1; rcu_read_lock(); - for (i = 0; i < mddev->raid_disks && ret == 0; i++) { + for (i = 0; i < conf->raid_disks && ret == 0; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { struct request_queue *q = bdev_get_queue(rdev->bdev); @@ -1131,7 +1131,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) int mirror; mirror_info_t *p; int first = 0; - int last = mddev->raid_disks - 1; + int last = conf->raid_disks - 1; if (mddev->recovery_cp < MaxSector) /* only hot-add to in-sync arrays, as recovery is @@ -2139,7 +2139,7 @@ raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) conf_t *conf = mddev->private; if (!raid_disks) - raid_disks = mddev->raid_disks; + raid_disks = conf->raid_disks; if (!sectors) sectors = mddev->dev_sectors; @@ -2250,7 +2250,7 @@ static int run(mddev_t *mddev) list_for_each_entry(rdev, &mddev->disks, same_set) { disk_idx = rdev->raid_disk; - if (disk_idx >= mddev->raid_disks + if (disk_idx >= conf->raid_disks || disk_idx < 0) continue; disk = conf->mirrors + disk_idx; @@ -2311,8 +2311,8 @@ static int run(mddev_t *mddev) mdname(mddev)); printk(KERN_INFO "raid10: raid set %s active with %d out of %d devices\n", - mdname(mddev), mddev->raid_disks - mddev->degraded, - mddev->raid_disks); + mdname(mddev), conf->raid_disks - mddev->degraded, + conf->raid_disks); /* * Ok, everything is just fine now */ @@ -2335,7 +2335,7 @@ static int run(mddev_t *mddev) mddev->queue->backing_dev_info.ra_pages = 2* stripe; } - if (conf->near_copies < mddev->raid_disks) + if (conf->near_copies < conf->raid_disks) blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); md_integrity_register(mddev); return 0; From 54071b3808ee3dc8624d9d6f1b06c4fd5308fa3b Mon Sep 17 00:00:00 2001 From: Trela Maciej Date: Mon, 8 Mar 2010 16:02:42 +1100 Subject: [PATCH 11/45] md:Add support for Raid0->Raid5 takeover Signed-off-by: Maciej Trela Signed-off-by: NeilBrown --- drivers/md/md.c | 14 ++++++++++++++ drivers/md/raid5.c | 26 ++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 2a64cba9ea72f..22c630b7ba6ca 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3017,6 +3017,20 @@ level_store(mddev_t *mddev, const char *buf, size_t len) mddev->to_remove = &md_redundancy_group; } + if (mddev->pers->sync_request == NULL && + mddev->external) { + /* We are converting from a no-redundancy array + * to a redundancy array and metadata is managed + * externally so we need to be sure that writes + * won't block due to a need to transition + * clean->dirty + * until external management is started. + */ + mddev->in_sync = 0; + mddev->safemode_delay = 0; + mddev->safemode = 0; + } + module_put(mddev->pers->owner); /* Invalidate devices that are now superfluous */ list_for_each_entry(rdev, &mddev->disks, same_set) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 10af3715b1fca..bb28fd6b44fee 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -52,6 +52,7 @@ #include #include "md.h" #include "raid5.h" +#include "raid0.h" #include "bitmap.h" /* @@ -5619,6 +5620,21 @@ static void raid5_quiesce(mddev_t *mddev, int state) } +static void *raid5_takeover_raid0(mddev_t *mddev) +{ + + mddev->new_level = 5; + mddev->new_layout = ALGORITHM_PARITY_N; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->raid_disks += 1; + mddev->delta_disks = 1; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + + return setup_conf(mddev); +} + + static void *raid5_takeover_raid1(mddev_t *mddev) { int chunksect; @@ -5748,6 +5764,16 @@ static void *raid5_takeover(mddev_t *mddev) * raid4 - trivial - just use a raid4 layout. * raid6 - Providing it is a *_6 layout */ + if (mddev->level == 0) { + /* for raid0 takeover only one zone is supported */ + struct raid0_private_data *raid0_priv + = mddev->private; + if (raid0_priv->nr_strip_zones > 1) { + printk(KERN_ERR "md: cannot takeover raid 0 with more than one zone.\n"); + return ERR_PTR(-EINVAL); + } + return raid5_takeover_raid0(mddev); + } if (mddev->level == 1) return raid5_takeover_raid1(mddev); From 9af204cf720cedf369cf823bbd806c350201f7ea Mon Sep 17 00:00:00 2001 From: "Trela, Maciej" Date: Mon, 8 Mar 2010 16:02:44 +1100 Subject: [PATCH 12/45] md: Add support for Raid5->Raid0 and Raid10->Raid0 takeover Signed-off-by: Maciej Trela Signed-off-by: NeilBrown --- drivers/md/md.c | 7 +++ drivers/md/raid0.c | 125 ++++++++++++++++++++++++++++++++++++++++++--- drivers/md/raid0.h | 3 ++ 3 files changed, 129 insertions(+), 6 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 22c630b7ba6ca..7dcc740895507 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3045,6 +3045,13 @@ level_store(mddev_t *mddev, const char *buf, size_t len) mddev->layout = mddev->new_layout; mddev->chunk_sectors = mddev->new_chunk_sectors; mddev->delta_disks = 0; + if (mddev->pers->sync_request == NULL) { + /* this is now an array without redundancy, so + * it must always be in_sync + */ + mddev->in_sync = 1; + del_timer_sync(&mddev->safemode_timer); + } pers->run(mddev); mddev_resume(mddev); set_bit(MD_CHANGE_DEVS, &mddev->flags); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c2e0d1d281023..afddf624bad30 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -22,6 +22,7 @@ #include #include "md.h" #include "raid0.h" +#include "raid5.h" static void raid0_unplug(struct request_queue *q) { @@ -90,7 +91,7 @@ static void dump_zones(mddev_t *mddev) printk(KERN_INFO "**********************************\n\n"); } -static int create_strip_zones(mddev_t *mddev) +static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf) { int i, c, err; sector_t curr_zone_end, sectors; @@ -164,6 +165,10 @@ static int create_strip_zones(mddev_t *mddev) list_for_each_entry(rdev1, &mddev->disks, same_set) { int j = rdev1->raid_disk; + if (mddev->level == 10) + /* taking over a raid10-n2 array */ + j /= 2; + if (j < 0 || j >= mddev->raid_disks) { printk(KERN_ERR "raid0: bad disk number %d - " "aborting!\n", j); @@ -264,13 +269,14 @@ static int create_strip_zones(mddev_t *mddev) (mddev->chunk_sectors << 9) * mddev->raid_disks); printk(KERN_INFO "raid0: done.\n"); - mddev->private = conf; + *private_conf = conf; + return 0; abort: kfree(conf->strip_zone); kfree(conf->devlist); kfree(conf); - mddev->private = NULL; + *private_conf = NULL; return err; } @@ -321,6 +327,7 @@ static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks) static int raid0_run(mddev_t *mddev) { + raid0_conf_t *conf; int ret; if (mddev->chunk_sectors == 0) { @@ -332,9 +339,20 @@ static int raid0_run(mddev_t *mddev) blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); mddev->queue->queue_lock = &mddev->queue->__queue_lock; - ret = create_strip_zones(mddev); - if (ret < 0) - return ret; + /* if private is not null, we are here after takeover */ + if (mddev->private == NULL) { + ret = create_strip_zones(mddev, &conf); + if (ret < 0) + return ret; + mddev->private = conf; + } + conf = mddev->private; + if (conf->scale_raid_disks) { + int i; + for (i=0; i < conf->strip_zone[0].nb_dev; i++) + conf->devlist[i]->raid_disk /= conf->scale_raid_disks; + /* FIXME update sysfs rd links */ + } /* calculate array device size */ md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); @@ -548,6 +566,99 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev) return; } +static void *raid0_takeover_raid5(mddev_t *mddev) +{ + mdk_rdev_t *rdev; + raid0_conf_t *priv_conf; + + if (mddev->degraded != 1) { + printk(KERN_ERR "md: raid5 must be degraded! Degraded disks: %d\n", + mddev->degraded); + return ERR_PTR(-EINVAL); + } + + list_for_each_entry(rdev, &mddev->disks, same_set) { + /* check slot number for a disk */ + if (rdev->raid_disk == mddev->raid_disks-1) { + printk(KERN_ERR "md: raid5 must have missing parity disk!\n"); + return ERR_PTR(-EINVAL); + } + } + + /* Set new parameters */ + mddev->new_level = 0; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->raid_disks--; + mddev->delta_disks = -1; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + + create_strip_zones(mddev, &priv_conf); + return priv_conf; +} + +static void *raid0_takeover_raid10(mddev_t *mddev) +{ + raid0_conf_t *priv_conf; + + /* Check layout: + * - far_copies must be 1 + * - near_copies must be 2 + * - disks number must be even + * - all mirrors must be already degraded + */ + if (mddev->layout != ((1 << 8) + 2)) { + printk(KERN_ERR "md: Raid0 cannot takover layout: %x\n", + mddev->layout); + return ERR_PTR(-EINVAL); + } + if (mddev->raid_disks & 1) { + printk(KERN_ERR "md: Raid0 cannot takover Raid10 with odd disk number.\n"); + return ERR_PTR(-EINVAL); + } + if (mddev->degraded != (mddev->raid_disks>>1)) { + printk(KERN_ERR "md: All mirrors must be already degraded!\n"); + return ERR_PTR(-EINVAL); + } + + /* Set new parameters */ + mddev->new_level = 0; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->delta_disks = - mddev->raid_disks / 2; + mddev->raid_disks += mddev->delta_disks; + mddev->degraded = 0; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + + create_strip_zones(mddev, &priv_conf); + priv_conf->scale_raid_disks = 2; + return priv_conf; +} + +static void *raid0_takeover(mddev_t *mddev) +{ + /* raid0 can take over: + * raid5 - providing it is Raid4 layout and one disk is faulty + * raid10 - assuming we have all necessary active disks + */ + if (mddev->level == 5) { + if (mddev->layout == ALGORITHM_PARITY_N) + return raid0_takeover_raid5(mddev); + + printk(KERN_ERR "md: Raid can only takeover Raid5 with layout: %d\n", + ALGORITHM_PARITY_N); + } + + if (mddev->level == 10) + return raid0_takeover_raid10(mddev); + + return ERR_PTR(-EINVAL); +} + +static void raid0_quiesce(mddev_t *mddev, int state) +{ +} + static struct mdk_personality raid0_personality= { .name = "raid0", @@ -558,6 +669,8 @@ static struct mdk_personality raid0_personality= .stop = raid0_stop, .status = raid0_status, .size = raid0_size, + .takeover = raid0_takeover, + .quiesce = raid0_quiesce, }; static int __init raid0_init (void) diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index 91f8e876ee645..d724e664ca4da 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -13,6 +13,9 @@ struct raid0_private_data struct strip_zone *strip_zone; mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ int nr_strip_zones; + int scale_raid_disks; /* divide rdev->raid_disks by this in run() + * to handle conversion from raid10 + */ }; typedef struct raid0_private_data raid0_conf_t; From dab8b29248b3f14f456651a2a6ee9b8fd16d1b3c Mon Sep 17 00:00:00 2001 From: "Trela, Maciej" Date: Mon, 8 Mar 2010 16:02:45 +1100 Subject: [PATCH 13/45] md: Add support for Raid0->Raid10 takeover Signed-off-by: Maciej Trela Signed-off-by: NeilBrown --- drivers/md/raid10.c | 194 ++++++++++++++++++++++++++++++++------------ drivers/md/raid10.h | 12 +++ 2 files changed, 155 insertions(+), 51 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 044c1157d98d8..57d71d5d88f45 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -23,6 +23,7 @@ #include #include "md.h" #include "raid10.h" +#include "raid0.h" #include "bitmap.h" /* @@ -2141,7 +2142,7 @@ raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) if (!raid_disks) raid_disks = conf->raid_disks; if (!sectors) - sectors = mddev->dev_sectors; + sectors = conf->dev_sectors; size = sectors >> conf->chunk_shift; sector_div(size, conf->far_copies); @@ -2151,62 +2152,60 @@ raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) return size << conf->chunk_shift; } -static int run(mddev_t *mddev) + +static conf_t *setup_conf(mddev_t *mddev) { - conf_t *conf; - int i, disk_idx, chunk_size; - mirror_info_t *disk; - mdk_rdev_t *rdev; + conf_t *conf = NULL; int nc, fc, fo; sector_t stride, size; + int err = -EINVAL; if (mddev->chunk_sectors < (PAGE_SIZE >> 9) || !is_power_of_2(mddev->chunk_sectors)) { printk(KERN_ERR "md/raid10: chunk size must be " "at least PAGE_SIZE(%ld) and be a power of 2.\n", PAGE_SIZE); - return -EINVAL; + goto out; } nc = mddev->layout & 255; fc = (mddev->layout >> 8) & 255; fo = mddev->layout & (1<<16); + if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || (mddev->layout >> 17)) { printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n", mdname(mddev), mddev->layout); goto out; } - /* - * copy the already verified devices into our private RAID10 - * bookkeeping area. [whatever we allocate in run(), - * should be freed in stop()] - */ + + err = -ENOMEM; conf = kzalloc(sizeof(conf_t), GFP_KERNEL); - mddev->private = conf; - if (!conf) { - printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", - mdname(mddev)); + if (!conf) goto out; - } + conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, - GFP_KERNEL); - if (!conf->mirrors) { - printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", - mdname(mddev)); - goto out_free_conf; - } + GFP_KERNEL); + if (!conf->mirrors) + goto out; conf->tmppage = alloc_page(GFP_KERNEL); if (!conf->tmppage) - goto out_free_conf; + goto out; + conf->raid_disks = mddev->raid_disks; conf->near_copies = nc; conf->far_copies = fc; conf->copies = nc*fc; conf->far_offset = fo; - conf->chunk_mask = mddev->chunk_sectors - 1; - conf->chunk_shift = ffz(~mddev->chunk_sectors); + conf->chunk_mask = mddev->new_chunk_sectors - 1; + conf->chunk_shift = ffz(~mddev->new_chunk_sectors); + + conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, + r10bio_pool_free, conf); + if (!conf->r10bio_pool) + goto out; + size = mddev->dev_sectors >> conf->chunk_shift; sector_div(size, fc); size = size * conf->raid_disks; @@ -2220,7 +2219,8 @@ static int run(mddev_t *mddev) */ stride += conf->raid_disks - 1; sector_div(stride, conf->raid_disks); - mddev->dev_sectors = stride << conf->chunk_shift; + + conf->dev_sectors = stride << conf->chunk_shift; if (fo) stride = 1; @@ -2228,18 +2228,63 @@ static int run(mddev_t *mddev) sector_div(stride, fc); conf->stride = stride << conf->chunk_shift; - conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, - r10bio_pool_free, conf); - if (!conf->r10bio_pool) { - printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", - mdname(mddev)); - goto out_free_conf; - } - conf->mddev = mddev; spin_lock_init(&conf->device_lock); + INIT_LIST_HEAD(&conf->retry_list); + + spin_lock_init(&conf->resync_lock); + init_waitqueue_head(&conf->wait_barrier); + + conf->thread = md_register_thread(raid10d, mddev, NULL); + if (!conf->thread) + goto out; + + conf->scale_disks = 0; + conf->mddev = mddev; + return conf; + + out: + printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", + mdname(mddev)); + if (conf) { + if (conf->r10bio_pool) + mempool_destroy(conf->r10bio_pool); + kfree(conf->mirrors); + safe_put_page(conf->tmppage); + kfree(conf); + } + return ERR_PTR(err); +} + +static int run(mddev_t *mddev) +{ + conf_t *conf; + int i, disk_idx, chunk_size; + mirror_info_t *disk; + mdk_rdev_t *rdev; + sector_t size; + + /* + * copy the already verified devices into our private RAID10 + * bookkeeping area. [whatever we allocate in run(), + * should be freed in stop()] + */ + + if (mddev->private == NULL) { + conf = setup_conf(mddev); + if (IS_ERR(conf)) + return PTR_ERR(conf); + mddev->private = conf; + } + conf = mddev->private; + if (!conf) + goto out; + mddev->queue->queue_lock = &conf->device_lock; + mddev->thread = conf->thread; + conf->thread = NULL; + chunk_size = mddev->chunk_sectors << 9; blk_queue_io_min(mddev->queue, chunk_size); if (conf->raid_disks % conf->near_copies) @@ -2253,6 +2298,11 @@ static int run(mddev_t *mddev) if (disk_idx >= conf->raid_disks || disk_idx < 0) continue; + if (conf->scale_disks) { + disk_idx *= conf->scale_disks; + rdev->raid_disk = disk_idx; + /* MOVE 'rd%d' link !! */ + } disk = conf->mirrors + disk_idx; disk->rdev = rdev; @@ -2270,11 +2320,6 @@ static int run(mddev_t *mddev) disk->head_position = 0; } - INIT_LIST_HEAD(&conf->retry_list); - - spin_lock_init(&conf->resync_lock); - init_waitqueue_head(&conf->wait_barrier); - /* need to check that every block has at least one working mirror */ if (!enough(conf)) { printk(KERN_ERR "raid10: not enough operational mirrors for %s\n", @@ -2296,15 +2341,6 @@ static int run(mddev_t *mddev) } } - - mddev->thread = md_register_thread(raid10d, mddev, NULL); - if (!mddev->thread) { - printk(KERN_ERR - "raid10: couldn't allocate thread for %s\n", - mdname(mddev)); - goto out_free_conf; - } - if (mddev->recovery_cp != MaxSector) printk(KERN_NOTICE "raid10: %s is not clean" " -- starting background reconstruction\n", @@ -2316,8 +2352,10 @@ static int run(mddev_t *mddev) /* * Ok, everything is just fine now */ - md_set_array_sectors(mddev, raid10_size(mddev, 0, 0)); - mddev->resync_max_sectors = raid10_size(mddev, 0, 0); + mddev->dev_sectors = conf->dev_sectors; + size = raid10_size(mddev, 0, 0); + md_set_array_sectors(mddev, size); + mddev->resync_max_sectors = size; mddev->queue->unplug_fn = raid10_unplug; mddev->queue->backing_dev_info.congested_fn = raid10_congested; @@ -2347,6 +2385,7 @@ static int run(mddev_t *mddev) kfree(conf->mirrors); kfree(conf); mddev->private = NULL; + md_unregister_thread(mddev->thread); out: return -EIO; } @@ -2383,6 +2422,58 @@ static void raid10_quiesce(mddev_t *mddev, int state) } } +static void *raid10_takeover_raid0(mddev_t *mddev) +{ + mdk_rdev_t *rdev; + conf_t *conf; + + if (mddev->degraded > 0) { + printk(KERN_ERR "error: degraded raid0!\n"); + return ERR_PTR(-EINVAL); + } + + /* Update slot numbers to obtain + * degraded raid10 with missing mirrors + */ + list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev->raid_disk *= 2; + } + + /* Set new parameters */ + mddev->new_level = 10; + /* new layout: far_copies = 1, near_copies = 2 */ + mddev->new_layout = (1<<8) + 2; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->delta_disks = mddev->raid_disks; + mddev->degraded = mddev->raid_disks; + mddev->raid_disks *= 2; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + + conf = setup_conf(mddev); + conf->scale_disks = 2; + return conf; +} + +static void *raid10_takeover(mddev_t *mddev) +{ + struct raid0_private_data *raid0_priv; + + /* raid10 can take over: + * raid0 - providing it has only two drives + */ + if (mddev->level == 0) { + /* for raid0 takeover only one zone is supported */ + raid0_priv = mddev->private; + if (raid0_priv->nr_strip_zones > 1) { + printk(KERN_ERR "md: cannot takeover raid 0 with more than one zone.\n"); + return ERR_PTR(-EINVAL); + } + return raid10_takeover_raid0(mddev); + } + return ERR_PTR(-EINVAL); +} + static struct mdk_personality raid10_personality = { .name = "raid10", @@ -2399,6 +2490,7 @@ static struct mdk_personality raid10_personality = .sync_request = sync_request, .quiesce = raid10_quiesce, .size = raid10_size, + .takeover = raid10_takeover, }; static int __init raid_init(void) diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 59cd1efb8d304..3824a087e17c6 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -33,9 +33,16 @@ struct r10_private_data_s { * 1 stripe. */ + sector_t dev_sectors; /* temp copy of mddev->dev_sectors */ + int chunk_shift; /* shift from chunks to sectors */ sector_t chunk_mask; + int scale_disks; /* When starting array, multiply + * each ->raid_disk by this. + * Need for raid0->raid10 migration + */ + struct list_head retry_list; /* queue pending writes and submit them on unplug */ struct bio_list pending_bio_list; @@ -57,6 +64,11 @@ struct r10_private_data_s { mempool_t *r10bio_pool; mempool_t *r10buf_pool; struct page *tmppage; + + /* When taking over an array from a different personality, we store + * the new thread here until we fully activate the array. + */ + struct mdk_thread_s *thread; }; typedef struct r10_private_data_s conf_t; From b71031076e1169e89bdac1b245ad1488587e4730 Mon Sep 17 00:00:00 2001 From: Maciej Trela Date: Wed, 14 Apr 2010 16:58:16 +1000 Subject: [PATCH 14/45] md: Correctly handle device removal via sysfs Writing "none" to "../md/dev-xx/slot" removes that device from being an active part of the array, but it didn't set ->raid_disk to -1 to record this fact. Signed-off-by: Maciej Trela Signed-off-by: NeilBrown --- drivers/md/md.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 7dcc740895507..766be8701281e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2385,6 +2385,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) return err; sprintf(nm, "rd%d", rdev->raid_disk); sysfs_remove_link(&rdev->mddev->kobj, nm); + rdev->raid_disk = -1; set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); md_wakeup_thread(rdev->mddev->thread); } else if (rdev->mddev->pers) { From 233fca36bb439eadcad28500b5139fed7c64a0ae Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 14 Apr 2010 17:02:09 +1000 Subject: [PATCH 15/45] md: Relax checks on ->max_disks when external metadata handling is used. When metadata is being managed by user-space, md doesn't know what the maximum number of devices allowed in an array is so ->max_disks is 0. In this case we should allow any (+ve) number of disks. Signed-off-by: NeilBrown --- drivers/md/md.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 766be8701281e..46bdf4b38be88 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2801,8 +2801,9 @@ static void analyze_sbs(mddev_t * mddev) i = 0; rdev_for_each(rdev, tmp, mddev) { - if (rdev->desc_nr >= mddev->max_disks || - i > mddev->max_disks) { + if (mddev->max_disks && + (rdev->desc_nr >= mddev->max_disks || + i > mddev->max_disks)) { printk(KERN_WARNING "md: %s: %s: only %d devices permitted\n", mdname(mddev), bdevname(rdev->bdev, b), @@ -5406,7 +5407,7 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks) if (mddev->pers->check_reshape == NULL) return -EINVAL; if (raid_disks <= 0 || - raid_disks >= mddev->max_disks) + (mddev->max_disks && raid_disks >= mddev->max_disks)) return -EINVAL; if (mddev->sync_thread || mddev->reshape_position != MaxSector) return -EBUSY; From 5cac7861b2de95a1f714ebdc652813abd0afcc73 Mon Sep 17 00:00:00 2001 From: Maciej Trela Date: Wed, 14 Apr 2010 17:17:39 +1000 Subject: [PATCH 16/45] md: notify level changes through sysfs. Level changes can be very significant, so make sure to notify them via sysfs. Signed-off-by: Maciej Trela Signed-off-by: NeilBrown --- drivers/md/md.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 46bdf4b38be88..c5a1b0725c9fc 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3059,6 +3059,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) set_bit(MD_CHANGE_DEVS, &mddev->flags); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); + sysfs_notify(&mddev->kobj, NULL, "level"); return rv; } From a78d38a1a16c8e009aa512caa71d483757fefc1c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 22 Mar 2010 16:53:49 +1100 Subject: [PATCH 17/45] md: add support for raid5 to raid4 conversion This is unlikely to be wanted, but we may as well provide it for completeness. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index bb28fd6b44fee..020143dec180e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5788,6 +5788,18 @@ static void *raid5_takeover(mddev_t *mddev) return ERR_PTR(-EINVAL); } +static void *raid4_takeover(mddev_t *mddev) +{ + /* raid4 can take over raid5 if layout is right. + */ + if (mddev->level == 5 && + mddev->layout == ALGORITHM_PARITY_N) { + mddev->new_layout = 0; + mddev->new_level = 4; + return setup_conf(mddev); + } + return ERR_PTR(-EINVAL); +} static struct mdk_personality raid5_personality; @@ -5903,6 +5915,7 @@ static struct mdk_personality raid4_personality = .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, + .takeover = raid4_takeover, }; static int __init raid5_init(void) From 2b7f22284d71975e37a82db154386348eec0e52c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 25 Mar 2010 16:06:03 +1100 Subject: [PATCH 18/45] md/raid5: small tidyup in raid5_align_endio Diving through ->queue to find mddev is unnecessarily complex - there is an easier path to finding mddev, so use that. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 020143dec180e..7bfeba3ce1e0a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3713,10 +3713,10 @@ static void raid5_align_endio(struct bio *bi, int error) bio_put(bi); - mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata; - conf = mddev->private; rdev = (void*)raid_bi->bi_next; raid_bi->bi_next = NULL; + mddev = rdev->mddev; + conf = mddev->private; rdev_dec_pending(rdev, conf->mddev); From 490773268cf64f68da2470e07b52c7944da6312d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 25 Mar 2010 16:20:56 +1100 Subject: [PATCH 19/45] md: move io accounting out of personalities into md_make_request While I generally prefer letting personalities do as much as possible, given that we have a central md_make_request anyway we may as well use it to simplify code. Also this centralises knowledge of ->gendisk which will help later. Signed-off-by: NeilBrown --- drivers/md/linear.c | 8 -------- drivers/md/md.c | 11 +++++++++++ drivers/md/multipath.c | 8 -------- drivers/md/raid0.c | 8 -------- drivers/md/raid1.c | 7 ------- drivers/md/raid10.c | 7 ------- drivers/md/raid5.c | 8 +------- 7 files changed, 12 insertions(+), 45 deletions(-) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 9db8ee0614a41..3048c1704f405 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -288,23 +288,15 @@ static int linear_stop (mddev_t *mddev) static int linear_make_request (struct request_queue *q, struct bio *bio) { - const int rw = bio_data_dir(bio); mddev_t *mddev = q->queuedata; dev_info_t *tmp_dev; sector_t start_sector; - int cpu; if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { md_barrier_request(mddev, bio); return 0; } - cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], - bio_sectors(bio)); - part_stat_unlock(); - rcu_read_lock(); tmp_dev = which_dev(mddev, bio->bi_sector); start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; diff --git a/drivers/md/md.c b/drivers/md/md.c index c5a1b0725c9fc..117663d2a4e52 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -214,8 +214,11 @@ static DEFINE_SPINLOCK(all_mddevs_lock); */ static int md_make_request(struct request_queue *q, struct bio *bio) { + const int rw = bio_data_dir(bio); mddev_t *mddev = q->queuedata; int rv; + int cpu; + if (mddev == NULL || mddev->pers == NULL) { bio_io_error(bio); return 0; @@ -236,7 +239,15 @@ static int md_make_request(struct request_queue *q, struct bio *bio) } atomic_inc(&mddev->active_io); rcu_read_unlock(); + rv = mddev->pers->make_request(q, bio); + + cpu = part_stat_lock(); + part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); + part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], + bio_sectors(bio)); + part_stat_unlock(); + if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) wake_up(&mddev->sb_wait); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 97befd5cc0e30..5b4e2918663a4 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -141,8 +141,6 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio) multipath_conf_t *conf = mddev->private; struct multipath_bh * mp_bh; struct multipath_info *multipath; - const int rw = bio_data_dir(bio); - int cpu; if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { md_barrier_request(mddev, bio); @@ -154,12 +152,6 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio) mp_bh->master_bio = bio; mp_bh->mddev = mddev; - cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], - bio_sectors(bio)); - part_stat_unlock(); - mp_bh->path = multipath_map(conf); if (mp_bh->path < 0) { bio_endio(bio, -EIO); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index afddf624bad30..d535f9be39f4d 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -472,20 +472,12 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio) sector_t sector_offset; struct strip_zone *zone; mdk_rdev_t *tmp_dev; - const int rw = bio_data_dir(bio); - int cpu; if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { md_barrier_request(mddev, bio); return 0; } - cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], - bio_sectors(bio)); - part_stat_unlock(); - chunk_sects = mddev->chunk_sectors; if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) { sector_t sector = bio->bi_sector; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 23a7516abbfd4..e277013ac8082 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -787,7 +787,6 @@ static int make_request(struct request_queue *q, struct bio * bio) struct page **behind_pages = NULL; const int rw = bio_data_dir(bio); const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); - int cpu; bool do_barriers; mdk_rdev_t *blocked_rdev; @@ -833,12 +832,6 @@ static int make_request(struct request_queue *q, struct bio * bio) bitmap = mddev->bitmap; - cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], - bio_sectors(bio)); - part_stat_unlock(); - /* * make_request() can abort the operation when READA is being * used and no empty request is available. diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 57d71d5d88f45..ca313d646fd14 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -795,7 +795,6 @@ static int make_request(struct request_queue *q, struct bio * bio) mirror_info_t *mirror; r10bio_t *r10_bio; struct bio *read_bio; - int cpu; int i; int chunk_sects = conf->chunk_mask + 1; const int rw = bio_data_dir(bio); @@ -850,12 +849,6 @@ static int make_request(struct request_queue *q, struct bio * bio) */ wait_barrier(conf); - cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], - bio_sectors(bio)); - part_stat_unlock(); - r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); r10_bio->master_bio = bio; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7bfeba3ce1e0a..c6ae7c1949158 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3879,7 +3879,7 @@ static int make_request(struct request_queue *q, struct bio * bi) sector_t logical_sector, last_sector; struct stripe_head *sh; const int rw = bio_data_dir(bi); - int cpu, remaining; + int remaining; if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) { /* Drain all pending writes. We only really need @@ -3894,12 +3894,6 @@ static int make_request(struct request_queue *q, struct bio * bi) md_write_start(mddev, bi); - cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], - bio_sectors(bi)); - part_stat_unlock(); - if (rw == READ && mddev->reshape_position == MaxSector && chunk_aligned_read(q,bi)) From 49ce6cea85fb8d25ee59486c919406e9cecf1762 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Mar 2010 10:51:42 +1100 Subject: [PATCH 20/45] md: don't reference gendisk in getgeo Using ->array_sectors rather than get_capacity() is more direct and is a step towards relaxing the tight connection between mddev and gendisk. Signed-off-by: NeilBrown --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 117663d2a4e52..69f2a8e6ccdfa 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5556,7 +5556,7 @@ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) geo->heads = 2; geo->sectors = 4; - geo->cylinders = get_capacity(mddev->gendisk) / 8; + geo->cylinders = mddev->array_sectors / 8; return 0; } From b821eaa572fd737faaf6928ba046e571526c36c6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Mar 2010 11:18:15 +1100 Subject: [PATCH 21/45] md: remove ->changed and related code. We set ->changed to 1 and call check_disk_change at the end of md_open so that bd_invalidated would be set and thus partition rescan would happen appropriately. Now that we call revalidate_disk directly, which sets bd_invalidates, that indirection is no longer needed and can be removed. Signed-off-by: NeilBrown --- drivers/md/md.c | 22 +--------------------- drivers/md/md.h | 1 - drivers/md/raid1.c | 1 - drivers/md/raid5.c | 2 -- 4 files changed, 1 insertion(+), 25 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 69f2a8e6ccdfa..f2b30019b1cb4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4501,7 +4501,6 @@ static int do_md_run(mddev_t * mddev) md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ revalidate_disk(mddev->gendisk); - mddev->changed = 1; md_new_event(mddev); sysfs_notify_dirent(mddev->sysfs_state); if (mddev->sysfs_action) @@ -4620,7 +4619,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) } set_capacity(disk, 0); - mddev->changed = 1; + revalidate_disk(disk); if (mddev->ro) mddev->ro = 0; @@ -4686,7 +4685,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) mddev->sync_speed_min = mddev->sync_speed_max = 0; mddev->recovery = 0; mddev->in_sync = 0; - mddev->changed = 0; mddev->degraded = 0; mddev->barriers_work = 0; mddev->safemode = 0; @@ -5850,7 +5848,6 @@ static int md_open(struct block_device *bdev, fmode_t mode) atomic_inc(&mddev->openers); mutex_unlock(&mddev->open_mutex); - check_disk_change(bdev); out: return err; } @@ -5865,21 +5862,6 @@ static int md_release(struct gendisk *disk, fmode_t mode) return 0; } - -static int md_media_changed(struct gendisk *disk) -{ - mddev_t *mddev = disk->private_data; - - return mddev->changed; -} - -static int md_revalidate(struct gendisk *disk) -{ - mddev_t *mddev = disk->private_data; - - mddev->changed = 0; - return 0; -} static const struct block_device_operations md_fops = { .owner = THIS_MODULE, @@ -5890,8 +5872,6 @@ static const struct block_device_operations md_fops = .compat_ioctl = md_compat_ioctl, #endif .getgeo = md_getgeo, - .media_changed = md_media_changed, - .revalidate_disk= md_revalidate, }; static int md_thread(void * arg) diff --git a/drivers/md/md.h b/drivers/md/md.h index e4836c68b73e0..3225e25f3c2ad 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -237,7 +237,6 @@ struct mddev_s atomic_t active; /* general refcount */ atomic_t openers; /* number of active opens */ - int changed; /* true if we might need to reread partition info */ int degraded; /* whether md should consider * adding a spare */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e277013ac8082..eebce166dafef 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2184,7 +2184,6 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) if (mddev->array_sectors > raid1_size(mddev, sectors, 0)) return -EINVAL; set_capacity(mddev->gendisk, mddev->array_sectors); - mddev->changed = 1; revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c6ae7c1949158..231afda1697f1 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5335,7 +5335,6 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) raid5_size(mddev, sectors, mddev->raid_disks)) return -EINVAL; set_capacity(mddev->gendisk, mddev->array_sectors); - mddev->changed = 1; revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { mddev->recovery_cp = mddev->dev_sectors; @@ -5549,7 +5548,6 @@ static void raid5_finish_reshape(mddev_t *mddev) if (mddev->delta_disks > 0) { md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); - mddev->changed = 1; revalidate_disk(mddev->gendisk); } else { int d; From fe60b0142813002be16dfae28780d9779ee22473 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Mar 2010 11:10:42 +1100 Subject: [PATCH 22/45] md: factor do_md_run to separate accesses to ->gendisk As part of relaxing the binding between an mddev and gendisk, we separate do_md_run into two functions. md_run does all the work internal to md do_md_run calls md_run and makes and changes to gendisk that are required. Signed-off-by: NeilBrown --- drivers/md/md.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index f2b30019b1cb4..e752332268d2f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4297,11 +4297,10 @@ static void md_safemode_timeout(unsigned long data) static int start_dirty_degraded; -static int do_md_run(mddev_t * mddev) +static int md_run(mddev_t *mddev) { int err; mdk_rdev_t *rdev; - struct gendisk *disk; struct mdk_personality *pers; if (list_empty(&mddev->disks)) @@ -4366,8 +4365,6 @@ static int do_md_run(mddev_t * mddev) sysfs_notify_dirent(rdev->sysfs_state); } - disk = mddev->gendisk; - spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); if (!pers || !try_module_get(pers->owner)) { @@ -4495,21 +4492,32 @@ static int do_md_run(mddev_t * mddev) if (mddev->flags) md_update_sb(mddev, 0); - set_capacity(disk, mddev->array_sectors); - md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ - revalidate_disk(mddev->gendisk); md_new_event(mddev); sysfs_notify_dirent(mddev->sysfs_state); if (mddev->sysfs_action) sysfs_notify_dirent(mddev->sysfs_action); sysfs_notify(&mddev->kobj, NULL, "degraded"); - kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); return 0; } +static int do_md_run(mddev_t *mddev) +{ + int err; + + err = md_run(mddev); + if (err) + goto out; + + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); +out: + return err; +} + static int restart_array(mddev_t *mddev) { struct gendisk *disk = mddev->gendisk; From 6177b472ab14e1ac88896960370dd54ba577d926 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Mar 2010 11:37:13 +1100 Subject: [PATCH 23/45] md: start to refactor do_md_stop do_md_stop is large and clunky, so hard to understand. This is a first step of refactoring, pulling two simple sub-functions out. Signed-off-by: NeilBrown --- drivers/md/md.c | 98 +++++++++++++++++++++++++++---------------------- 1 file changed, 55 insertions(+), 43 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index e752332268d2f..002d0a34d6ea8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4568,6 +4568,58 @@ void restore_bitmap_write_access(struct file *file) spin_unlock(&inode->i_lock); } +static void md_clean(mddev_t *mddev) +{ + mddev->array_sectors = 0; + mddev->external_size = 0; + mddev->dev_sectors = 0; + mddev->raid_disks = 0; + mddev->recovery_cp = 0; + mddev->resync_min = 0; + mddev->resync_max = MaxSector; + mddev->reshape_position = MaxSector; + mddev->external = 0; + mddev->persistent = 0; + mddev->level = LEVEL_NONE; + mddev->clevel[0] = 0; + mddev->flags = 0; + mddev->ro = 0; + mddev->metadata_type[0] = 0; + mddev->chunk_sectors = 0; + mddev->ctime = mddev->utime = 0; + mddev->layout = 0; + mddev->max_disks = 0; + mddev->events = 0; + mddev->delta_disks = 0; + mddev->new_level = LEVEL_NONE; + mddev->new_layout = 0; + mddev->new_chunk_sectors = 0; + mddev->curr_resync = 0; + mddev->resync_mismatches = 0; + mddev->suspend_lo = mddev->suspend_hi = 0; + mddev->sync_speed_min = mddev->sync_speed_max = 0; + mddev->recovery = 0; + mddev->in_sync = 0; + mddev->degraded = 0; + mddev->barriers_work = 0; + mddev->safemode = 0; + mddev->bitmap_info.offset = 0; + mddev->bitmap_info.default_offset = 0; + mddev->bitmap_info.chunksize = 0; + mddev->bitmap_info.daemon_sleep = 0; + mddev->bitmap_info.max_write_behind = 0; +} + +static void md_stop(mddev_t *mddev) +{ + mddev->pers->stop(mddev); + if (mddev->pers->sync_request && mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; + module_put(mddev->pers->owner); + mddev->pers = NULL; + +} + /* mode: * 0 - completely stop and dis-assemble array * 1 - switch to readonly @@ -4608,14 +4660,11 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) if (mddev->ro) set_disk_ro(disk, 0); - mddev->pers->stop(mddev); + md_stop(mddev); mddev->queue->merge_bvec_fn = NULL; mddev->queue->unplug_fn = NULL; mddev->queue->backing_dev_info.congested_fn = NULL; - module_put(mddev->pers->owner); - if (mddev->pers->sync_request && mddev->to_remove == NULL) - mddev->to_remove = &md_redundancy_group; - mddev->pers = NULL; + /* tell userspace to handle 'inactive' */ sysfs_notify_dirent(mddev->sysfs_state); @@ -4663,44 +4712,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) export_array(mddev); - mddev->array_sectors = 0; - mddev->external_size = 0; - mddev->dev_sectors = 0; - mddev->raid_disks = 0; - mddev->recovery_cp = 0; - mddev->resync_min = 0; - mddev->resync_max = MaxSector; - mddev->reshape_position = MaxSector; - mddev->external = 0; - mddev->persistent = 0; - mddev->level = LEVEL_NONE; - mddev->clevel[0] = 0; - mddev->flags = 0; - mddev->ro = 0; - mddev->metadata_type[0] = 0; - mddev->chunk_sectors = 0; - mddev->ctime = mddev->utime = 0; - mddev->layout = 0; - mddev->max_disks = 0; - mddev->events = 0; - mddev->delta_disks = 0; - mddev->new_level = LEVEL_NONE; - mddev->new_layout = 0; - mddev->new_chunk_sectors = 0; - mddev->curr_resync = 0; - mddev->resync_mismatches = 0; - mddev->suspend_lo = mddev->suspend_hi = 0; - mddev->sync_speed_min = mddev->sync_speed_max = 0; - mddev->recovery = 0; - mddev->in_sync = 0; - mddev->degraded = 0; - mddev->barriers_work = 0; - mddev->safemode = 0; - mddev->bitmap_info.offset = 0; - mddev->bitmap_info.default_offset = 0; - mddev->bitmap_info.chunksize = 0; - mddev->bitmap_info.daemon_sleep = 0; - mddev->bitmap_info.max_write_behind = 0; + md_clean(mddev); kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); if (mddev->hold_active == UNTIL_STOP) mddev->hold_active = 0; From a047e125403112ceb4d41e68307a2e7498ddba4e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Mar 2010 12:07:53 +1100 Subject: [PATCH 24/45] md: factor md_stop_writes out of do_md_stop. Further refactoring of do_md_stop. This one requires some explanation as it takes code from different places in do_md_stop, so some re-ordering happens. We only get into this part of do_md_stop if there are no active opens of the device, so no writes can be happening and the device must have been flushed. In md_stop_writes we want to stop any internal sources of writes - i.e. resync - and flush out the metadata. The only code that was previously before some of this code is code to clean up the queue, the mddev, the gendisk, or sysfs, all of which is probably better after code that makes active changes (i.e. triggers writes). Signed-off-by: NeilBrown --- drivers/md/md.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 002d0a34d6ea8..86dfbc361cc03 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4610,6 +4610,27 @@ static void md_clean(mddev_t *mddev) mddev->bitmap_info.max_write_behind = 0; } +static void md_stop_writes(mddev_t *mddev) +{ + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(mddev->sync_thread); + mddev->sync_thread = NULL; + } + + del_timer_sync(&mddev->safemode_timer); + + bitmap_flush(mddev); + md_super_wait(mddev); + + if (!mddev->in_sync || mddev->flags) { + /* mark array as shutdown cleanly */ + mddev->in_sync = 1; + md_update_sb(mddev, 1); + } +} + static void md_stop(mddev_t *mddev) { mddev->pers->stop(mddev); @@ -4637,14 +4658,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) err = -EBUSY; } else if (mddev->pers) { - if (mddev->sync_thread) { - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(mddev->sync_thread); - mddev->sync_thread = NULL; - } - - del_timer_sync(&mddev->safemode_timer); + md_stop_writes(mddev); switch(mode) { case 1: /* readonly */ @@ -4655,8 +4669,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) break; case 0: /* disassemble */ case 2: /* stop */ - bitmap_flush(mddev); - md_super_wait(mddev); if (mddev->ro) set_disk_ro(disk, 0); @@ -4681,11 +4693,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) if (mddev->ro) mddev->ro = 0; } - if (!mddev->in_sync || mddev->flags) { - /* mark array as shutdown cleanly */ - mddev->in_sync = 1; - md_update_sb(mddev, 1); - } if (mode == 1) set_disk_ro(disk, 1); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); From a4bd82d0d03b1485975579f131ccfd0aad6b7d6e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Mar 2010 13:23:10 +1100 Subject: [PATCH 25/45] md: split md_set_readonly out of do_md_stop Using do_md_stop to set an array to read-only is a little confusing. Now most of the common code has been factored out, split md_set_readonly off in to a separate function. Signed-off-by: NeilBrown --- drivers/md/md.c | 90 ++++++++++++++++++++++++++++--------------------- 1 file changed, 51 insertions(+), 39 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 86dfbc361cc03..3a2710a2e104f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3309,6 +3309,7 @@ array_state_show(mddev_t *mddev, char *page) } static int do_md_stop(mddev_t * mddev, int ro, int is_open); +static int md_set_readonly(mddev_t * mddev, int is_open); static int do_md_run(mddev_t * mddev); static int restart_array(mddev_t *mddev); @@ -3339,7 +3340,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) break; /* not supported yet */ case readonly: if (mddev->pers) - err = do_md_stop(mddev, 1, 0); + err = md_set_readonly(mddev, 0); else { mddev->ro = 1; set_disk_ro(mddev->gendisk, 1); @@ -3349,7 +3350,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) case read_auto: if (mddev->pers) { if (mddev->ro == 0) - err = do_md_stop(mddev, 1, 0); + err = md_set_readonly(mddev, 0); else if (mddev->ro == 1) err = restart_array(mddev); if (err == 0) { @@ -4641,9 +4642,34 @@ static void md_stop(mddev_t *mddev) } +static int md_set_readonly(mddev_t *mddev, int is_open) +{ + int err = 0; + mutex_lock(&mddev->open_mutex); + if (atomic_read(&mddev->openers) > is_open) { + printk("md: %s still in use.\n",mdname(mddev)); + err = -EBUSY; + goto out; + } + if (mddev->pers) { + md_stop_writes(mddev); + + err = -ENXIO; + if (mddev->ro==1) + goto out; + mddev->ro = 1; + set_disk_ro(mddev->gendisk, 1); + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + sysfs_notify_dirent(mddev->sysfs_state); + err = 0; + } +out: + mutex_unlock(&mddev->open_mutex); + return err; +} + /* mode: * 0 - completely stop and dis-assemble array - * 1 - switch to readonly * 2 - stop but do not disassemble array */ static int do_md_stop(mddev_t * mddev, int mode, int is_open) @@ -4660,45 +4686,33 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) md_stop_writes(mddev); - switch(mode) { - case 1: /* readonly */ - err = -ENXIO; - if (mddev->ro==1) - goto out; - mddev->ro = 1; - break; - case 0: /* disassemble */ - case 2: /* stop */ - if (mddev->ro) - set_disk_ro(disk, 0); + if (mddev->ro) + set_disk_ro(disk, 0); - md_stop(mddev); - mddev->queue->merge_bvec_fn = NULL; - mddev->queue->unplug_fn = NULL; - mddev->queue->backing_dev_info.congested_fn = NULL; + md_stop(mddev); + mddev->queue->merge_bvec_fn = NULL; + mddev->queue->unplug_fn = NULL; + mddev->queue->backing_dev_info.congested_fn = NULL; - /* tell userspace to handle 'inactive' */ - sysfs_notify_dirent(mddev->sysfs_state); + /* tell userspace to handle 'inactive' */ + sysfs_notify_dirent(mddev->sysfs_state); - list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk >= 0) { - char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); - } + list_for_each_entry(rdev, &mddev->disks, same_set) + if (rdev->raid_disk >= 0) { + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + } - set_capacity(disk, 0); - revalidate_disk(disk); + set_capacity(disk, 0); + revalidate_disk(disk); - if (mddev->ro) - mddev->ro = 0; - } - if (mode == 1) - set_disk_ro(disk, 1); + if (mddev->ro) + mddev->ro = 0; + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); err = 0; } -out: mutex_unlock(&mddev->open_mutex); if (err) return err; @@ -4724,9 +4738,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) if (mddev->hold_active == UNTIL_STOP) mddev->hold_active = 0; - } else if (mddev->pers) - printk(KERN_INFO "md: %s switched to read-only mode.\n", - mdname(mddev)); + } err = 0; blk_integrity_unregister(disk); md_new_event(mddev); @@ -5724,7 +5736,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, goto done_unlock; case STOP_ARRAY_RO: - err = do_md_stop(mddev, 1, 1); + err = md_set_readonly(mddev, 1); goto done_unlock; case BLKROSET: @@ -7140,7 +7152,7 @@ static int md_notify_reboot(struct notifier_block *this, * appears to still be in use. Hence * the '100'. */ - do_md_stop(mddev, 1, 100); + md_set_readonly(mddev, 100); mddev_unlock(mddev); } /* From cca9cf90c504d98644ace52c474770970729f0eb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 1 Apr 2010 12:08:16 +1100 Subject: [PATCH 26/45] md: call md_stop_writes from md_stop This moves the call to the other side of set_readonly, but that should not be an issue. This encapsulates in 'md_stop' all of the functionality for internally stopping the array, leaving all the interactions with externalities (sysfs, request_queue, gendisk) in do_md_stop. Signed-off-by: NeilBrown --- drivers/md/md.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 3a2710a2e104f..f48ba419cd7b5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4634,12 +4634,14 @@ static void md_stop_writes(mddev_t *mddev) static void md_stop(mddev_t *mddev) { + md_stop_writes(mddev); + mddev->pers->stop(mddev); if (mddev->pers->sync_request && mddev->to_remove == NULL) mddev->to_remove = &md_redundancy_group; module_put(mddev->pers->owner); mddev->pers = NULL; - + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } static int md_set_readonly(mddev_t *mddev, int is_open) @@ -4684,8 +4686,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) err = -EBUSY; } else if (mddev->pers) { - md_stop_writes(mddev); - if (mddev->ro) set_disk_ro(disk, 0); @@ -4710,7 +4710,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) if (mddev->ro) mddev->ro = 0; - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); err = 0; } mutex_unlock(&mddev->open_mutex); From 21a52c6d05c15f862797736393915bfa8cd40ee9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 1 Apr 2010 15:02:13 +1100 Subject: [PATCH 27/45] md: pass mddev to make_request functions rather than request_queue We used to pass the personality make_request function direct to the block layer so the first argument had to be a queue. But now we have the intermediary md_make_request so it makes at lot more sense to pass a struct mddev_s. It makes it possible to have an mddev without its own queue too. Signed-off-by: NeilBrown --- drivers/md/faulty.c | 3 +-- drivers/md/linear.c | 7 +++---- drivers/md/md.c | 4 ++-- drivers/md/md.h | 2 +- drivers/md/multipath.c | 3 +-- drivers/md/raid0.c | 7 +++---- drivers/md/raid1.c | 3 +-- drivers/md/raid10.c | 7 +++---- drivers/md/raid5.c | 8 +++----- 9 files changed, 18 insertions(+), 26 deletions(-) diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 608a8d3736e21..bd4348f6be0ba 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -168,9 +168,8 @@ static void add_sector(conf_t *conf, sector_t start, int mode) conf->nfaults = n+1; } -static int make_request(struct request_queue *q, struct bio *bio) +static int make_request(mddev_t *mddev, struct bio *bio) { - mddev_t *mddev = q->queuedata; conf_t *conf = mddev->private; int failit = 0; diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 3048c1704f405..3204a2263f216 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -286,9 +286,8 @@ static int linear_stop (mddev_t *mddev) return 0; } -static int linear_make_request (struct request_queue *q, struct bio *bio) +static int linear_make_request (mddev_t *mddev, struct bio *bio) { - mddev_t *mddev = q->queuedata; dev_info_t *tmp_dev; sector_t start_sector; @@ -328,9 +327,9 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) bp = bio_split(bio, end_sector - bio->bi_sector); - if (linear_make_request(q, &bp->bio1)) + if (linear_make_request(mddev, &bp->bio1)) generic_make_request(&bp->bio1); - if (linear_make_request(q, &bp->bio2)) + if (linear_make_request(mddev, &bp->bio2)) generic_make_request(&bp->bio2); bio_pair_release(bp); return 0; diff --git a/drivers/md/md.c b/drivers/md/md.c index f48ba419cd7b5..2e05b0c2515d2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -240,7 +240,7 @@ static int md_make_request(struct request_queue *q, struct bio *bio) atomic_inc(&mddev->active_io); rcu_read_unlock(); - rv = mddev->pers->make_request(q, bio); + rv = mddev->pers->make_request(mddev, bio); cpu = part_stat_lock(); part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); @@ -354,7 +354,7 @@ static void md_submit_barrier(struct work_struct *ws) bio_endio(bio, 0); else { bio->bi_rw &= ~(1<pers->make_request(mddev->queue, bio)) + if (mddev->pers->make_request(mddev, bio)) generic_make_request(bio); mddev->barrier = POST_REQUEST_BARRIER; submit_barriers(mddev); diff --git a/drivers/md/md.h b/drivers/md/md.h index 3225e25f3c2ad..a536f54580970 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -330,7 +330,7 @@ struct mdk_personality int level; struct list_head list; struct module *owner; - int (*make_request)(struct request_queue *q, struct bio *bio); + int (*make_request)(mddev_t *mddev, struct bio *bio); int (*run)(mddev_t *mddev); int (*stop)(mddev_t *mddev); void (*status)(struct seq_file *seq, mddev_t *mddev); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 5b4e2918663a4..50bf8e6f8c7b4 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -135,9 +135,8 @@ static void multipath_unplug(struct request_queue *q) } -static int multipath_make_request (struct request_queue *q, struct bio * bio) +static int multipath_make_request(mddev_t *mddev, struct bio * bio) { - mddev_t *mddev = q->queuedata; multipath_conf_t *conf = mddev->private; struct multipath_bh * mp_bh; struct multipath_info *multipath; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index d535f9be39f4d..9f9c6b76ca7c3 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -465,9 +465,8 @@ static inline int is_io_in_chunk_boundary(mddev_t *mddev, } } -static int raid0_make_request(struct request_queue *q, struct bio *bio) +static int raid0_make_request(mddev_t *mddev, struct bio *bio) { - mddev_t *mddev = q->queuedata; unsigned int chunk_sects; sector_t sector_offset; struct strip_zone *zone; @@ -495,9 +494,9 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio) else bp = bio_split(bio, chunk_sects - sector_div(sector, chunk_sects)); - if (raid0_make_request(q, &bp->bio1)) + if (raid0_make_request(mddev, &bp->bio1)) generic_make_request(&bp->bio1); - if (raid0_make_request(q, &bp->bio2)) + if (raid0_make_request(mddev, &bp->bio2)) generic_make_request(&bp->bio2); bio_pair_release(bp); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index eebce166dafef..5ff75c4d3af6f 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -773,9 +773,8 @@ static struct page **alloc_behind_pages(struct bio *bio) return NULL; } -static int make_request(struct request_queue *q, struct bio * bio) +static int make_request(mddev_t *mddev, struct bio * bio) { - mddev_t *mddev = q->queuedata; conf_t *conf = mddev->private; mirror_info_t *mirror; r1bio_t *r1_bio; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ca313d646fd14..a1d727610a491 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -788,9 +788,8 @@ static void unfreeze_array(conf_t *conf) spin_unlock_irq(&conf->resync_lock); } -static int make_request(struct request_queue *q, struct bio * bio) +static int make_request(mddev_t *mddev, struct bio * bio) { - mddev_t *mddev = q->queuedata; conf_t *conf = mddev->private; mirror_info_t *mirror; r10bio_t *r10_bio; @@ -824,9 +823,9 @@ static int make_request(struct request_queue *q, struct bio * bio) */ bp = bio_split(bio, chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); - if (make_request(q, &bp->bio1)) + if (make_request(mddev, &bp->bio1)) generic_make_request(&bp->bio1); - if (make_request(q, &bp->bio2)) + if (make_request(mddev, &bp->bio2)) generic_make_request(&bp->bio2); bio_pair_release(bp); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 231afda1697f1..2882a26646fdc 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3753,9 +3753,8 @@ static int bio_fits_rdev(struct bio *bi) } -static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) +static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio) { - mddev_t *mddev = q->queuedata; raid5_conf_t *conf = mddev->private; int dd_idx; struct bio* align_bi; @@ -3870,9 +3869,8 @@ static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf) return sh; } -static int make_request(struct request_queue *q, struct bio * bi) +static int make_request(mddev_t *mddev, struct bio * bi) { - mddev_t *mddev = q->queuedata; raid5_conf_t *conf = mddev->private; int dd_idx; sector_t new_sector; @@ -3896,7 +3894,7 @@ static int make_request(struct request_queue *q, struct bio * bi) if (rw == READ && mddev->reshape_position == MaxSector && - chunk_aligned_read(q,bi)) + chunk_aligned_read(mddev,bi)) return 0; logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); From fafd7fb052182e087b5a3c6c408e4ac8c2b5fa14 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 1 Apr 2010 15:55:30 +1100 Subject: [PATCH 28/45] md: factor out init code for an mddev This is a simple factorisation that makes mddev_find easier to read. Signed-off-by: NeilBrown --- drivers/md/md.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 2e05b0c2515d2..d3579fc9efedd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -416,6 +416,27 @@ static void mddev_put(mddev_t *mddev) spin_unlock(&all_mddevs_lock); } +static void mddev_init(mddev_t *mddev) +{ + mutex_init(&mddev->open_mutex); + mutex_init(&mddev->reconfig_mutex); + mutex_init(&mddev->bitmap_info.mutex); + INIT_LIST_HEAD(&mddev->disks); + INIT_LIST_HEAD(&mddev->all_mddevs); + init_timer(&mddev->safemode_timer); + atomic_set(&mddev->active, 1); + atomic_set(&mddev->openers, 0); + atomic_set(&mddev->active_io, 0); + spin_lock_init(&mddev->write_lock); + atomic_set(&mddev->flush_pending, 0); + init_waitqueue_head(&mddev->sb_wait); + init_waitqueue_head(&mddev->recovery_wait); + mddev->reshape_position = MaxSector; + mddev->resync_min = 0; + mddev->resync_max = MaxSector; + mddev->level = LEVEL_NONE; +} + static mddev_t * mddev_find(dev_t unit) { mddev_t *mddev, *new = NULL; @@ -482,23 +503,7 @@ static mddev_t * mddev_find(dev_t unit) else new->md_minor = MINOR(unit) >> MdpMinorShift; - mutex_init(&new->open_mutex); - mutex_init(&new->reconfig_mutex); - mutex_init(&new->bitmap_info.mutex); - INIT_LIST_HEAD(&new->disks); - INIT_LIST_HEAD(&new->all_mddevs); - init_timer(&new->safemode_timer); - atomic_set(&new->active, 1); - atomic_set(&new->openers, 0); - atomic_set(&new->active_io, 0); - spin_lock_init(&new->write_lock); - atomic_set(&new->flush_pending, 0); - init_waitqueue_head(&new->sb_wait); - init_waitqueue_head(&new->recovery_wait); - new->reshape_position = MaxSector; - new->resync_min = 0; - new->resync_max = MaxSector; - new->level = LEVEL_NONE; + mddev_init(new); goto retry; } From 9e35b99c7efacfddc748c89a0c53b1122b0ee72c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 6 Apr 2010 14:23:02 +1000 Subject: [PATCH 29/45] md: don't unregister the thread in mddev_suspend This is - unnecessary because mddev_suspend is always followed by a call to ->stop, and each ->stop unregisters the thread, and - a problem as it makes it awkwards to suspend and then resume a device as we will want later. Signed-off-by: NeilBrown --- drivers/md/md.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index d3579fc9efedd..af0780ae56b57 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -254,6 +254,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio) return rv; } +/* mddev_suspend makes sure no new requests are submitted + * to the device, and that any requests that have been submitted + * are completely handled. + * Once ->stop is called and completes, the module will be completely + * unused. + */ static void mddev_suspend(mddev_t *mddev) { BUG_ON(mddev->suspended); @@ -261,13 +267,6 @@ static void mddev_suspend(mddev_t *mddev) synchronize_rcu(); wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); mddev->pers->quiesce(mddev, 1); - md_unregister_thread(mddev->thread); - mddev->thread = NULL; - /* we now know that no code is executing in the personality module, - * except possibly the tail end of a ->bi_end_io function, but that - * is certain to complete before the module has a chance to get - * unloaded - */ } static void mddev_resume(mddev_t *mddev) From d754c5ae1ff76b20d3ecde8ad666d7865eada8ae Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 7 Apr 2010 12:14:43 +1000 Subject: [PATCH 30/45] md/raid1: fix confusing 'redirect sector' message. This message seems to suggest the named device is the one on which a read failed, however it is actually the device that the read will be redirected to. So make the message a little clearer. Reported-by: Tim Burgess Signed-off-by: NeilBrown --- drivers/md/raid1.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 5ff75c4d3af6f..2e08e48b02d9a 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1689,10 +1689,10 @@ static void raid1d(mddev_t *mddev) r1_bio->bios[r1_bio->read_disk] = bio; rdev = conf->mirrors[disk].rdev; if (printk_ratelimit()) - printk(KERN_ERR "raid1: %s: redirecting sector %llu to" - " another mirror\n", - bdevname(rdev->bdev,b), - (unsigned long long)r1_bio->sector); + printk(KERN_ERR "raid1: redirecting sector %llu to" + " other mirror: %s\n", + (unsigned long long)r1_bio->sector, + bdevname(rdev->bdev,b)); bio->bi_sector = r1_bio->sector + rdev->data_offset; bio->bi_bdev = rdev->bdev; bio->bi_end_io = raid1_end_read_request; From e555190d82c0f58e825e3cbd9e6ebe2e7ac713bd Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 31 Mar 2010 11:21:44 +1100 Subject: [PATCH 31/45] md/raid1: delay reads that could overtake behind-writes. When a raid1 array is configured to support write-behind on some devices, it normally only reads from other devices. If all devices are write-behind (because the rest have failed) it is possible for a read request to be serviced before a behind-write request, which would appear as data corruption. So when forced to read from a WriteMostly device, wait for any write-behind to complete, and don't start any more behind-writes. Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 4 +++- drivers/md/bitmap.h | 1 + drivers/md/raid1.c | 25 ++++++++++++++++++------- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 49d6080387c88..c9c6a345e17bb 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1356,7 +1356,8 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto { if (!bitmap) return; if (behind) { - atomic_dec(&bitmap->behind_writes); + if (atomic_dec_and_test(&bitmap->behind_writes)) + wake_up(&bitmap->behind_wait); PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n", atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); } @@ -1680,6 +1681,7 @@ int bitmap_create(mddev_t *mddev) atomic_set(&bitmap->pending_writes, 0); init_waitqueue_head(&bitmap->write_wait); init_waitqueue_head(&bitmap->overflow_wait); + init_waitqueue_head(&bitmap->behind_wait); bitmap->mddev = mddev; diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index aa82b7caa85f0..3797dea4723a3 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -240,6 +240,7 @@ struct bitmap { atomic_t pending_writes; /* pending writes to the bitmap file */ wait_queue_head_t write_wait; wait_queue_head_t overflow_wait; + wait_queue_head_t behind_wait; struct sysfs_dirent *sysfs_can_clear; }; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 2e08e48b02d9a..cb2da87ad5932 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -857,6 +857,15 @@ static int make_request(mddev_t *mddev, struct bio * bio) } mirror = conf->mirrors + rdisk; + if (test_bit(WriteMostly, &mirror->rdev->flags) && + bitmap) { + /* Reading from a write-mostly device must + * take care not to over-take any writes + * that are 'behind' + */ + wait_event(bitmap->behind_wait, + atomic_read(&bitmap->behind_writes) == 0); + } r1_bio->read_disk = rdisk; read_bio = bio_clone(bio, GFP_NOIO); @@ -934,10 +943,14 @@ static int make_request(mddev_t *mddev, struct bio * bio) set_bit(R1BIO_Degraded, &r1_bio->state); } - /* do behind I/O ? */ + /* do behind I/O ? + * Not if there are too many, or cannot allocate memory, + * or a reader on WriteMostly is waiting for behind writes + * to flush */ if (bitmap && (atomic_read(&bitmap->behind_writes) < mddev->bitmap_info.max_write_behind) && + !waitqueue_active(&bitmap->behind_wait) && (behind_pages = alloc_behind_pages(bio)) != NULL) set_bit(R1BIO_BehindIO, &r1_bio->state); @@ -2144,15 +2157,13 @@ static int stop(mddev_t *mddev) { conf_t *conf = mddev->private; struct bitmap *bitmap = mddev->bitmap; - int behind_wait = 0; /* wait for behind writes to complete */ - while (bitmap && atomic_read(&bitmap->behind_writes) > 0) { - behind_wait++; - printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); /* wait a second */ + if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { + printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop.\n", mdname(mddev)); /* need to kick something here to make sure I/O goes? */ + wait_event(bitmap->behind_wait, + atomic_read(&bitmap->behind_writes) == 0); } raise_barrier(conf); From f1b29bcae116409db5e543622aadab43041c9ae9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 1 May 2010 18:09:05 -0700 Subject: [PATCH 32/45] md/raid4: permit raid0 takeover For consistency allow raid4 to takeover raid0 in addition to raid5 (with a raid4 layout). Signed-off-by: Dan Williams --- drivers/md/raid5.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2882a26646fdc..81563b7c03571 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5610,10 +5610,17 @@ static void raid5_quiesce(mddev_t *mddev, int state) } -static void *raid5_takeover_raid0(mddev_t *mddev) +static void *raid45_takeover_raid0(mddev_t *mddev, int level) { + struct raid0_private_data *raid0_priv = mddev->private; - mddev->new_level = 5; + /* for raid0 takeover only one zone is supported */ + if (raid0_priv->nr_strip_zones > 1) { + printk(KERN_ERR "md: cannot takeover raid0 with more than one zone.\n"); + return ERR_PTR(-EINVAL); + } + + mddev->new_level = level; mddev->new_layout = ALGORITHM_PARITY_N; mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->raid_disks += 1; @@ -5749,22 +5756,13 @@ static int raid6_check_reshape(mddev_t *mddev) static void *raid5_takeover(mddev_t *mddev) { /* raid5 can take over: - * raid0 - if all devices are the same - make it a raid4 layout + * raid0 - if there is only one strip zone - make it a raid4 layout * raid1 - if there are two drives. We need to know the chunk size * raid4 - trivial - just use a raid4 layout. * raid6 - Providing it is a *_6 layout */ - if (mddev->level == 0) { - /* for raid0 takeover only one zone is supported */ - struct raid0_private_data *raid0_priv - = mddev->private; - if (raid0_priv->nr_strip_zones > 1) { - printk(KERN_ERR "md: cannot takeover raid 0 with more than one zone.\n"); - return ERR_PTR(-EINVAL); - } - return raid5_takeover_raid0(mddev); - } - + if (mddev->level == 0) + return raid45_takeover_raid0(mddev, 5); if (mddev->level == 1) return raid5_takeover_raid1(mddev); if (mddev->level == 4) { @@ -5780,8 +5778,12 @@ static void *raid5_takeover(mddev_t *mddev) static void *raid4_takeover(mddev_t *mddev) { - /* raid4 can take over raid5 if layout is right. + /* raid4 can take over: + * raid0 - if there is only one strip zone + * raid5 - if layout is right */ + if (mddev->level == 0) + return raid45_takeover_raid0(mddev, 4); if (mddev->level == 5 && mddev->layout == ALGORITHM_PARITY_N) { mddev->new_layout = 0; From bb7f8d2217d8753ab5008c78f16697d9e697d570 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 1 May 2010 18:14:57 -0700 Subject: [PATCH 33/45] md: notify mdstat waiters of level change Level modifications change the output of mdstat. The mdmon manager thread is interested in these events for external metadata management. Signed-off-by: Dan Williams --- drivers/md/md.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index af0780ae56b57..69f659e46aa62 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3075,6 +3075,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); sysfs_notify(&mddev->kobj, NULL, "level"); + md_new_event(mddev); return rv; } From f2859af6716ce99cac7f35c5a0c6b7fed346312f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 2 May 2010 10:04:16 -0700 Subject: [PATCH 34/45] md: allow integers to be passed to md/level e.g. allow md to interpret 'echo 4 > md/level' as a request for raid4. Signed-off-by: Dan Williams --- drivers/md/md.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 69f659e46aa62..b8a0fcfb1de13 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2934,9 +2934,10 @@ level_show(mddev_t *mddev, char *page) static ssize_t level_store(mddev_t *mddev, const char *buf, size_t len) { - char level[16]; + char clevel[16]; ssize_t rv = len; struct mdk_personality *pers; + long level; void *priv; mdk_rdev_t *rdev; @@ -2969,19 +2970,22 @@ level_store(mddev_t *mddev, const char *buf, size_t len) } /* Now find the new personality */ - if (len == 0 || len >= sizeof(level)) + if (len == 0 || len >= sizeof(clevel)) return -EINVAL; - strncpy(level, buf, len); - if (level[len-1] == '\n') + strncpy(clevel, buf, len); + if (clevel[len-1] == '\n') len--; - level[len] = 0; + clevel[len] = 0; + if (strict_strtol(clevel, 10, &level)) + level = LEVEL_NONE; - request_module("md-%s", level); + if (request_module("md-%s", clevel) != 0) + request_module("md-level-%s", clevel); spin_lock(&pers_lock); - pers = find_pers(LEVEL_NONE, level); + pers = find_pers(level, clevel); if (!pers || !try_module_get(pers->owner)) { spin_unlock(&pers_lock); - printk(KERN_WARNING "md: personality %s not loaded\n", level); + printk(KERN_WARNING "md: personality %s not loaded\n", clevel); return -EINVAL; } spin_unlock(&pers_lock); @@ -2994,7 +2998,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) if (!pers->takeover) { module_put(pers->owner); printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", - mdname(mddev), level); + mdname(mddev), clevel); return -EINVAL; } @@ -3010,7 +3014,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) mddev->delta_disks = 0; module_put(pers->owner); printk(KERN_WARNING "md: %s: %s would not accept array\n", - mdname(mddev), level); + mdname(mddev), clevel); return PTR_ERR(priv); } From 08fb730ca346ff16598ef31911c88fbca6133bf5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 3 May 2010 13:16:56 +1000 Subject: [PATCH 35/45] md: remove EXPERIMENTAL designation from RAID10 RAID10 has been available for quite a while now and is quite well tested, so we can remove the EXPERIMENTAL designation. Reported-by: Eric MSP Veith Signed-off-by: NeilBrown --- drivers/md/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index acb3a4e404ff2..4a6feac8c94a1 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -100,8 +100,8 @@ config MD_RAID1 If unsure, say Y. config MD_RAID10 - tristate "RAID-10 (mirrored striping) mode (EXPERIMENTAL)" - depends on BLK_DEV_MD && EXPERIMENTAL + tristate "RAID-10 (mirrored striping) mode" + depends on BLK_DEV_MD ---help--- RAID-10 provides a combination of striping (RAID-0) and mirroring (RAID-1) with easier configuration and more flexible From 0c55e02259115c151e4835dd417cf41467bb02e2 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 3 May 2010 14:09:02 +1000 Subject: [PATCH 36/45] md/raid5: improve consistency of error messages. Many 'printk' messages from the raid456 module mention 'raid5' even though it may be a 'raid6' or even 'raid4' array. This can cause confusion. Also the actual array name is not always reported and when it is it is not reported consistently. So change all the messages to start: md/raid:%s: where '%s' becomes e.g. md3 to identify the particular array. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 149 +++++++++++++++++++++------------------------ 1 file changed, 69 insertions(+), 80 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 81563b7c03571..cee9f93b35c4c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1509,7 +1509,7 @@ static void raid5_end_read_request(struct bio * bi, int error) set_bit(R5_UPTODATE, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) { rdev = conf->disks[i].rdev; - printk_rl(KERN_INFO "raid5:%s: read error corrected" + printk_rl(KERN_INFO "md/raid:%s: read error corrected" " (%lu sectors at %llu on %s)\n", mdname(conf->mddev), STRIPE_SECTORS, (unsigned long long)(sh->sector @@ -1529,7 +1529,7 @@ static void raid5_end_read_request(struct bio * bi, int error) atomic_inc(&rdev->read_errors); if (conf->mddev->degraded) printk_rl(KERN_WARNING - "raid5:%s: read error not correctable " + "md/raid:%s: read error not correctable " "(sector %llu on %s).\n", mdname(conf->mddev), (unsigned long long)(sh->sector @@ -1538,7 +1538,7 @@ static void raid5_end_read_request(struct bio * bi, int error) else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) /* Oh, no!!! */ printk_rl(KERN_WARNING - "raid5:%s: read error NOT corrected!! " + "md/raid:%s: read error NOT corrected!! " "(sector %llu on %s).\n", mdname(conf->mddev), (unsigned long long)(sh->sector @@ -1547,7 +1547,7 @@ static void raid5_end_read_request(struct bio * bi, int error) else if (atomic_read(&rdev->read_errors) > conf->max_nr_stripes) printk(KERN_WARNING - "raid5:%s: Too many read errors, failing device %s.\n", + "md/raid:%s: Too many read errors, failing device %s.\n", mdname(conf->mddev), bdn); else retry = 1; @@ -1620,7 +1620,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) { char b[BDEVNAME_SIZE]; raid5_conf_t *conf = mddev->private; - pr_debug("raid5: error called\n"); + pr_debug("raid456: error called\n"); if (!test_bit(Faulty, &rdev->flags)) { set_bit(MD_CHANGE_DEVS, &mddev->flags); @@ -1636,9 +1636,13 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) } set_bit(Faulty, &rdev->flags); printk(KERN_ALERT - "raid5: Disk failure on %s, disabling device.\n" - "raid5: Operation continuing on %d devices.\n", - bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); + "md/raid:%s: Disk failure on %s, disabling device.\n" + KERN_ALERT + "md/raid:%s: Operation continuing on %d devices.\n", + mdname(mddev), + bdevname(rdev->bdev, b), + mdname(mddev), + conf->raid_disks - mddev->degraded); } } @@ -1719,8 +1723,6 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, pd_idx = data_disks; break; default: - printk(KERN_ERR "raid5: unsupported algorithm %d\n", - algorithm); BUG(); } break; @@ -1836,10 +1838,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, qd_idx = raid_disks - 1; break; - default: - printk(KERN_CRIT "raid6: unsupported algorithm %d\n", - algorithm); BUG(); } break; @@ -1902,8 +1901,6 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) case ALGORITHM_PARITY_N: break; default: - printk(KERN_ERR "raid5: unsupported algorithm %d\n", - algorithm); BUG(); } break; @@ -1962,8 +1959,6 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) i -= 1; break; default: - printk(KERN_CRIT "raid6: unsupported algorithm %d\n", - algorithm); BUG(); } break; @@ -1976,7 +1971,8 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) previous, &dummy1, &sh2); if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx || sh2.qd_idx != sh->qd_idx) { - printk(KERN_ERR "compute_blocknr: map not correct\n"); + printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", + mdname(conf->mddev)); return 0; } return r_sector; @@ -3942,7 +3938,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) new_sector = raid5_compute_sector(conf, logical_sector, previous, &dd_idx, NULL); - pr_debug("raid5: make_request, sector %llu logical %llu\n", + pr_debug("raid456: make_request, sector %llu logical %llu\n", (unsigned long long)new_sector, (unsigned long long)logical_sector); @@ -4721,7 +4717,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) if (mddev->new_level != 5 && mddev->new_level != 4 && mddev->new_level != 6) { - printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", + printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", mdname(mddev), mddev->new_level); return ERR_PTR(-EIO); } @@ -4729,12 +4725,12 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) && !algorithm_valid_raid5(mddev->new_layout)) || (mddev->new_level == 6 && !algorithm_valid_raid6(mddev->new_layout))) { - printk(KERN_ERR "raid5: %s: layout %d not supported\n", + printk(KERN_ERR "md/raid:%s: layout %d not supported\n", mdname(mddev), mddev->new_layout); return ERR_PTR(-EIO); } if (mddev->new_level == 6 && mddev->raid_disks < 4) { - printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", + printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", mdname(mddev), mddev->raid_disks); return ERR_PTR(-EINVAL); } @@ -4742,8 +4738,8 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) if (!mddev->new_chunk_sectors || (mddev->new_chunk_sectors << 9) % PAGE_SIZE || !is_power_of_2(mddev->new_chunk_sectors)) { - printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", - mddev->new_chunk_sectors << 9, mdname(mddev)); + printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", + mdname(mddev), mddev->new_chunk_sectors << 9); return ERR_PTR(-EINVAL); } @@ -4785,7 +4781,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) if (raid5_alloc_percpu(conf) != 0) goto abort; - pr_debug("raid5: run(%s) called.\n", mdname(mddev)); + pr_debug("raid456: run(%s) called.\n", mdname(mddev)); list_for_each_entry(rdev, &mddev->disks, same_set) { raid_disk = rdev->raid_disk; @@ -4798,9 +4794,9 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) if (test_bit(In_sync, &rdev->flags)) { char b[BDEVNAME_SIZE]; - printk(KERN_INFO "raid5: device %s operational as raid" - " disk %d\n", bdevname(rdev->bdev,b), - raid_disk); + printk(KERN_INFO "md/raid:%s: device %s operational as raid" + " disk %d\n", + mdname(mddev), bdevname(rdev->bdev, b), raid_disk); } else /* Cannot rely on bitmap to complete recovery */ conf->fullsync = 1; @@ -4824,16 +4820,17 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; if (grow_stripes(conf, conf->max_nr_stripes)) { printk(KERN_ERR - "raid5: couldn't allocate %dkB for buffers\n", memory); + "md/raid:%s: couldn't allocate %dkB for buffers\n", + mdname(mddev), memory); goto abort; } else - printk(KERN_INFO "raid5: allocated %dkB for %s\n", - memory, mdname(mddev)); + printk(KERN_INFO "md/raid:%s: allocated %dkB\n", + mdname(mddev), memory); conf->thread = md_register_thread(raid5d, mddev, NULL); if (!conf->thread) { printk(KERN_ERR - "raid5: couldn't allocate thread for %s\n", + "md/raid:%s: couldn't allocate thread.\n", mdname(mddev)); goto abort; } @@ -4884,7 +4881,7 @@ static int run(mddev_t *mddev) sector_t reshape_offset = 0; if (mddev->recovery_cp != MaxSector) - printk(KERN_NOTICE "raid5: %s is not clean" + printk(KERN_NOTICE "md/raid:%s: not clean" " -- starting background reconstruction\n", mdname(mddev)); if (mddev->reshape_position != MaxSector) { @@ -4898,7 +4895,7 @@ static int run(mddev_t *mddev) int max_degraded = (mddev->level == 6 ? 2 : 1); if (mddev->new_level != mddev->level) { - printk(KERN_ERR "raid5: %s: unsupported reshape " + printk(KERN_ERR "md/raid:%s: unsupported reshape " "required - aborting.\n", mdname(mddev)); return -EINVAL; @@ -4911,8 +4908,8 @@ static int run(mddev_t *mddev) here_new = mddev->reshape_position; if (sector_div(here_new, mddev->new_chunk_sectors * (mddev->raid_disks - max_degraded))) { - printk(KERN_ERR "raid5: reshape_position not " - "on a stripe boundary\n"); + printk(KERN_ERR "md/raid:%s: reshape_position not " + "on a stripe boundary\n", mdname(mddev)); return -EINVAL; } reshape_offset = here_new * mddev->new_chunk_sectors; @@ -4933,8 +4930,9 @@ static int run(mddev_t *mddev) if ((here_new * mddev->new_chunk_sectors != here_old * mddev->chunk_sectors) || mddev->ro == 0) { - printk(KERN_ERR "raid5: in-place reshape must be started" - " in read-only mode - aborting\n"); + printk(KERN_ERR "md/raid:%s: in-place reshape must be started" + " in read-only mode - aborting\n", + mdname(mddev)); return -EINVAL; } } else if (mddev->delta_disks < 0 @@ -4943,11 +4941,13 @@ static int run(mddev_t *mddev) : (here_new * mddev->new_chunk_sectors >= here_old * mddev->chunk_sectors)) { /* Reading from the same stripe as writing to - bad */ - printk(KERN_ERR "raid5: reshape_position too early for " - "auto-recovery - aborting.\n"); + printk(KERN_ERR "md/raid:%s: reshape_position too early for " + "auto-recovery - aborting.\n", + mdname(mddev)); return -EINVAL; } - printk(KERN_INFO "raid5: reshape will continue\n"); + printk(KERN_INFO "md/raid:%s: reshape will continue\n", + mdname(mddev)); /* OK, we should be able to continue; */ } else { BUG_ON(mddev->level != mddev->new_level); @@ -4989,18 +4989,6 @@ static int run(mddev_t *mddev) mddev->minor_version > 90) rdev->recovery_offset = reshape_offset; - printk("%d: w=%d pa=%d pr=%d m=%d a=%d r=%d op1=%d op2=%d\n", - rdev->raid_disk, working_disks, conf->prev_algo, - conf->previous_raid_disks, conf->max_degraded, - conf->algorithm, conf->raid_disks, - only_parity(rdev->raid_disk, - conf->prev_algo, - conf->previous_raid_disks, - conf->max_degraded), - only_parity(rdev->raid_disk, - conf->algorithm, - conf->raid_disks, - conf->max_degraded)); if (rdev->recovery_offset < reshape_offset) { /* We need to check old and new layout */ if (!only_parity(rdev->raid_disk, @@ -5021,7 +5009,7 @@ static int run(mddev_t *mddev) - working_disks); if (mddev->degraded > conf->max_degraded) { - printk(KERN_ERR "raid5: not enough operational devices for %s" + printk(KERN_ERR "md/raid:%s: not enough operational devices" " (%d/%d failed)\n", mdname(mddev), mddev->degraded, conf->raid_disks); goto abort; @@ -5035,32 +5023,32 @@ static int run(mddev_t *mddev) mddev->recovery_cp != MaxSector) { if (mddev->ok_start_degraded) printk(KERN_WARNING - "raid5: starting dirty degraded array: %s" - "- data corruption possible.\n", + "md/raid:%s: starting dirty degraded array" + " - data corruption possible.\n", mdname(mddev)); else { printk(KERN_ERR - "raid5: cannot start dirty degraded array for %s\n", + "md/raid:%s: cannot start dirty degraded array.\n", mdname(mddev)); goto abort; } } if (mddev->degraded == 0) - printk("raid5: raid level %d set %s active with %d out of %d" - " devices, algorithm %d\n", conf->level, mdname(mddev), + printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" + " devices, algorithm %d\n", mdname(mddev), conf->level, mddev->raid_disks-mddev->degraded, mddev->raid_disks, mddev->new_layout); else - printk(KERN_ALERT "raid5: raid level %d set %s active with %d" - " out of %d devices, algorithm %d\n", conf->level, - mdname(mddev), mddev->raid_disks - mddev->degraded, - mddev->raid_disks, mddev->new_layout); + printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" + " out of %d devices, algorithm %d\n", + mdname(mddev), conf->level, + mddev->raid_disks - mddev->degraded, + mddev->raid_disks, mddev->new_layout); print_raid5_conf(conf); if (conf->reshape_progress != MaxSector) { - printk("...ok start reshape thread\n"); conf->reshape_safe = conf->reshape_progress; atomic_set(&conf->reshape_stripes, 0); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); @@ -5087,7 +5075,7 @@ static int run(mddev_t *mddev) mddev->to_remove = NULL; else if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) printk(KERN_WARNING - "raid5: failed to create sysfs attributes for %s\n", + "md/raid:%s: failed to create sysfs attributes.\n", mdname(mddev)); mddev->queue->queue_lock = &conf->device_lock; @@ -5117,12 +5105,10 @@ static int run(mddev_t *mddev) free_conf(conf); } mddev->private = NULL; - printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); + printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); return -EIO; } - - static int stop(mddev_t *mddev) { raid5_conf_t *conf = mddev->private; @@ -5196,21 +5182,22 @@ static void print_raid5_conf (raid5_conf_t *conf) int i; struct disk_info *tmp; - printk("RAID5 conf printout:\n"); + printk(KERN_DEBUG "RAID conf printout:\n"); if (!conf) { printk("(conf==NULL)\n"); return; } - printk(" --- rd:%d wd:%d\n", conf->raid_disks, - conf->raid_disks - conf->mddev->degraded); + printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, + conf->raid_disks, + conf->raid_disks - conf->mddev->degraded); for (i = 0; i < conf->raid_disks; i++) { char b[BDEVNAME_SIZE]; tmp = conf->disks + i; if (tmp->rdev) - printk(" disk %d, o:%d, dev:%s\n", - i, !test_bit(Faulty, &tmp->rdev->flags), - bdevname(tmp->rdev->bdev,b)); + printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", + i, !test_bit(Faulty, &tmp->rdev->flags), + bdevname(tmp->rdev->bdev, b)); } } @@ -5358,7 +5345,8 @@ static int check_stripe_cache(mddev_t *mddev) > conf->max_nr_stripes || ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { - printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", + printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", + mdname(mddev), ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) / STRIPE_SIZE)*4); return 0; @@ -5429,7 +5417,7 @@ static int raid5_start_reshape(mddev_t *mddev) */ if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) < mddev->array_sectors) { - printk(KERN_ERR "md: %s: array size must be reduced " + printk(KERN_ERR "md/raid:%s: array size must be reduced " "before number of disks\n", mdname(mddev)); return -EINVAL; } @@ -5467,9 +5455,9 @@ static int raid5_start_reshape(mddev_t *mddev) if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) printk(KERN_WARNING - "raid5: failed to create " - " link %s for %s\n", - nm, mdname(mddev)); + "md/raid:%s: failed to create " + " link %s\n", + mdname(mddev), nm); } else break; } @@ -5616,7 +5604,8 @@ static void *raid45_takeover_raid0(mddev_t *mddev, int level) /* for raid0 takeover only one zone is supported */ if (raid0_priv->nr_strip_zones > 1) { - printk(KERN_ERR "md: cannot takeover raid0 with more than one zone.\n"); + printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n", + mdname(mddev)); return ERR_PTR(-EINVAL); } From 9dd1e2faf72f79a2af9dcbd059473c06648726c2 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 3 May 2010 14:30:35 +1000 Subject: [PATCH 37/45] md/raid1: improve printk messages Make sure the array name is included in a uniform way in all printk messages. Signed-off-by: NeilBrown --- drivers/md/raid1.c | 57 +++++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index cb2da87ad5932..1db02c4955a9c 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -296,7 +296,8 @@ static void raid1_end_read_request(struct bio *bio, int error) */ char b[BDEVNAME_SIZE]; if (printk_ratelimit()) - printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n", + printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n", + mdname(conf->mddev), bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); reschedule_retry(r1_bio); } @@ -1075,21 +1076,22 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) } else set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT "raid1: Disk failure on %s, disabling device.\n" - "raid1: Operation continuing on %d devices.\n", - bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); + printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" + KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n", + mdname(mddev), bdevname(rdev->bdev, b), + mdname(mddev), conf->raid_disks - mddev->degraded); } static void print_conf(conf_t *conf) { int i; - printk("RAID1 conf printout:\n"); + printk(KERN_DEBUG "RAID1 conf printout:\n"); if (!conf) { - printk("(!conf)\n"); + printk(KERN_DEBUG "(!conf)\n"); return; } - printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, + printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, conf->raid_disks); rcu_read_lock(); @@ -1097,7 +1099,7 @@ static void print_conf(conf_t *conf) char b[BDEVNAME_SIZE]; mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev) - printk(" disk %d, wo:%d, o:%d, dev:%s\n", + printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", i, !test_bit(In_sync, &rdev->flags), !test_bit(Faulty, &rdev->flags), bdevname(rdev->bdev,b)); @@ -1458,9 +1460,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) char b[BDEVNAME_SIZE]; /* Cannot read from anywhere, array is toast */ md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); - printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error" + printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" " for block %llu\n", - bdevname(bio->bi_bdev,b), + mdname(mddev), + bdevname(bio->bi_bdev, b), (unsigned long long)r1_bio->sector); md_done_sync(mddev, r1_bio->sectors, 0); put_buf(r1_bio); @@ -1582,7 +1585,7 @@ static void fix_read_error(conf_t *conf, int read_disk, else { atomic_add(s, &rdev->corrected_errors); printk(KERN_INFO - "raid1:%s: read error corrected " + "md/raid1:%s: read error corrected " "(%d sectors at %llu on %s)\n", mdname(mddev), s, (unsigned long long)(sect + @@ -1687,8 +1690,9 @@ static void raid1d(mddev_t *mddev) bio = r1_bio->bios[r1_bio->read_disk]; if ((disk=read_balance(conf, r1_bio)) == -1) { - printk(KERN_ALERT "raid1: %s: unrecoverable I/O" + printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O" " read error for block %llu\n", + mdname(mddev), bdevname(bio->bi_bdev,b), (unsigned long long)r1_bio->sector); raid_end_bio_io(r1_bio); @@ -1702,8 +1706,9 @@ static void raid1d(mddev_t *mddev) r1_bio->bios[r1_bio->read_disk] = bio; rdev = conf->mirrors[disk].rdev; if (printk_ratelimit()) - printk(KERN_ERR "raid1: redirecting sector %llu to" + printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to" " other mirror: %s\n", + mdname(mddev), (unsigned long long)r1_bio->sector, bdevname(rdev->bdev,b)); bio->bi_sector = r1_bio->sector + rdev->data_offset; @@ -1760,13 +1765,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i int still_degraded = 0; if (!conf->r1buf_pool) - { -/* - printk("sync start - bitmap %p\n", mddev->bitmap); -*/ if (init_resync(conf)) return 0; - } max_sector = mddev->dev_sectors; if (sector_nr >= max_sector) { @@ -2047,7 +2047,7 @@ static conf_t *setup_conf(mddev_t *mddev) err = -EIO; if (conf->last_used < 0) { - printk(KERN_ERR "raid1: no operational mirrors for %s\n", + printk(KERN_ERR "md/raid1:%s: no operational mirrors\n", mdname(mddev)); goto abort; } @@ -2055,7 +2055,7 @@ static conf_t *setup_conf(mddev_t *mddev) conf->thread = md_register_thread(raid1d, mddev, NULL); if (!conf->thread) { printk(KERN_ERR - "raid1: couldn't allocate thread for %s\n", + "md/raid1:%s: couldn't allocate thread\n", mdname(mddev)); goto abort; } @@ -2081,12 +2081,12 @@ static int run(mddev_t *mddev) mdk_rdev_t *rdev; if (mddev->level != 1) { - printk("raid1: %s: raid level not set to mirroring (%d)\n", + printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n", mdname(mddev), mddev->level); return -EIO; } if (mddev->reshape_position != MaxSector) { - printk("raid1: %s: reshape_position set but not supported\n", + printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n", mdname(mddev)); return -EIO; } @@ -2129,11 +2129,11 @@ static int run(mddev_t *mddev) mddev->recovery_cp = MaxSector; if (mddev->recovery_cp != MaxSector) - printk(KERN_NOTICE "raid1: %s is not clean" + printk(KERN_NOTICE "md/raid1:%s: not clean" " -- starting background reconstruction\n", mdname(mddev)); printk(KERN_INFO - "raid1: raid set %s active with %d out of %d mirrors\n", + "md/raid1:%s: active with %d out of %d mirrors\n", mdname(mddev), mddev->raid_disks - mddev->degraded, mddev->raid_disks); @@ -2160,7 +2160,8 @@ static int stop(mddev_t *mddev) /* wait for behind writes to complete */ if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { - printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop.\n", mdname(mddev)); + printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n", + mdname(mddev)); /* need to kick something here to make sure I/O goes? */ wait_event(bitmap->behind_wait, atomic_read(&bitmap->behind_writes) == 0); @@ -2288,9 +2289,9 @@ static int raid1_reshape(mddev_t *mddev) if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) printk(KERN_WARNING - "md/raid1: cannot register " - "%s for %s\n", - nm, mdname(mddev)); + "md/raid1:%s: cannot register " + "%s\n", + mdname(mddev), nm); } if (rdev) newmirrors[d2++].rdev = rdev; From 128595ed6ff2c7358ae253a560d47a0af463bc99 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 3 May 2010 14:47:14 +1000 Subject: [PATCH 38/45] md/raid10: tidy up printk messages. All raid10 printk messages now start md/raid10:md-device-name: Signed-off-by: NeilBrown --- drivers/md/raid10.c | 72 ++++++++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 30 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index a1d727610a491..e0742c4394843 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -285,7 +285,8 @@ static void raid10_end_read_request(struct bio *bio, int error) */ char b[BDEVNAME_SIZE]; if (printk_ratelimit()) - printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n", + printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n", + mdname(conf->mddev), bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); reschedule_retry(r10_bio); } @@ -831,8 +832,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) bio_pair_release(bp); return 0; bad_map: - printk("raid10_make_request bug: can't convert block across chunks" - " or bigger than %dk %llu %d\n", chunk_sects/2, + printk("md/raid10:%s: make_request bug: can't convert block across chunks" + " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2, (unsigned long long)bio->bi_sector, bio->bi_size >> 10); bio_io_error(bio); @@ -1031,9 +1032,10 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) } set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT "raid10: Disk failure on %s, disabling device.\n" - "raid10: Operation continuing on %d devices.\n", - bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); + printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" + KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n", + mdname(mddev), bdevname(rdev->bdev, b), + mdname(mddev), conf->raid_disks - mddev->degraded); } static void print_conf(conf_t *conf) @@ -1041,19 +1043,19 @@ static void print_conf(conf_t *conf) int i; mirror_info_t *tmp; - printk("RAID10 conf printout:\n"); + printk(KERN_DEBUG "RAID10 conf printout:\n"); if (!conf) { - printk("(!conf)\n"); + printk(KERN_DEBUG "(!conf)\n"); return; } - printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, + printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, conf->raid_disks); for (i = 0; i < conf->raid_disks; i++) { char b[BDEVNAME_SIZE]; tmp = conf->mirrors + i; if (tmp->rdev) - printk(" disk %d, wo:%d, o:%d, dev:%s\n", + printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", i, !test_bit(In_sync, &tmp->rdev->flags), !test_bit(Faulty, &tmp->rdev->flags), bdevname(tmp->rdev->bdev,b)); @@ -1502,13 +1504,14 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) if (cur_read_error_count > max_read_errors) { rcu_read_unlock(); printk(KERN_NOTICE - "raid10: %s: Raid device exceeded " + "md/raid10:%s: %s: Raid device exceeded " "read_error threshold " "[cur %d:max %d]\n", + mdname(mddev), b, cur_read_error_count, max_read_errors); printk(KERN_NOTICE - "raid10: %s: Failing raid " - "device\n", b); + "md/raid10:%s: %s: Failing raid " + "device\n", mdname(mddev), b); md_error(mddev, conf->mirrors[d].rdev); return; } @@ -1578,15 +1581,16 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) == 0) { /* Well, this device is dead */ printk(KERN_NOTICE - "raid10:%s: read correction " + "md/raid10:%s: read correction " "write failed" " (%d sectors at %llu on %s)\n", mdname(mddev), s, (unsigned long long)(sect+ rdev->data_offset), bdevname(rdev->bdev, b)); - printk(KERN_NOTICE "raid10:%s: failing " + printk(KERN_NOTICE "md/raid10:%s: %s: failing " "drive\n", + mdname(mddev), bdevname(rdev->bdev, b)); md_error(mddev, rdev); } @@ -1614,20 +1618,21 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) READ) == 0) { /* Well, this device is dead */ printk(KERN_NOTICE - "raid10:%s: unable to read back " + "md/raid10:%s: unable to read back " "corrected sectors" " (%d sectors at %llu on %s)\n", mdname(mddev), s, (unsigned long long)(sect+ rdev->data_offset), bdevname(rdev->bdev, b)); - printk(KERN_NOTICE "raid10:%s: failing drive\n", + printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n", + mdname(mddev), bdevname(rdev->bdev, b)); md_error(mddev, rdev); } else { printk(KERN_INFO - "raid10:%s: read error corrected" + "md/raid10:%s: read error corrected" " (%d sectors at %llu on %s)\n", mdname(mddev), s, (unsigned long long)(sect+ @@ -1702,8 +1707,9 @@ static void raid10d(mddev_t *mddev) mddev->ro ? IO_BLOCKED : NULL; mirror = read_balance(conf, r10_bio); if (mirror == -1) { - printk(KERN_ALERT "raid10: %s: unrecoverable I/O" + printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" " read error for block %llu\n", + mdname(mddev), bdevname(bio->bi_bdev,b), (unsigned long long)r10_bio->sector); raid_end_bio_io(r10_bio); @@ -1713,8 +1719,9 @@ static void raid10d(mddev_t *mddev) bio_put(bio); rdev = conf->mirrors[mirror].rdev; if (printk_ratelimit()) - printk(KERN_ERR "raid10: %s: redirecting sector %llu to" + printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to" " another mirror\n", + mdname(mddev), bdevname(rdev->bdev,b), (unsigned long long)r10_bio->sector); bio = bio_clone(r10_bio->master_bio, GFP_NOIO); @@ -1972,7 +1979,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i r10_bio = rb2; if (!test_and_set_bit(MD_RECOVERY_INTR, &mddev->recovery)) - printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n", + printk(KERN_INFO "md/raid10:%s: insufficient " + "working devices for recovery.\n", mdname(mddev)); break; } @@ -2154,8 +2162,9 @@ static conf_t *setup_conf(mddev_t *mddev) if (mddev->chunk_sectors < (PAGE_SIZE >> 9) || !is_power_of_2(mddev->chunk_sectors)) { - printk(KERN_ERR "md/raid10: chunk size must be " - "at least PAGE_SIZE(%ld) and be a power of 2.\n", PAGE_SIZE); + printk(KERN_ERR "md/raid10:%s: chunk size must be " + "at least PAGE_SIZE(%ld) and be a power of 2.\n", + mdname(mddev), PAGE_SIZE); goto out; } @@ -2165,7 +2174,7 @@ static conf_t *setup_conf(mddev_t *mddev) if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || (mddev->layout >> 17)) { - printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n", + printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", mdname(mddev), mddev->layout); goto out; } @@ -2236,7 +2245,7 @@ static conf_t *setup_conf(mddev_t *mddev) return conf; out: - printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", + printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", mdname(mddev)); if (conf) { if (conf->r10bio_pool) @@ -2314,7 +2323,7 @@ static int run(mddev_t *mddev) } /* need to check that every block has at least one working mirror */ if (!enough(conf)) { - printk(KERN_ERR "raid10: not enough operational mirrors for %s\n", + printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", mdname(mddev)); goto out_free_conf; } @@ -2334,11 +2343,11 @@ static int run(mddev_t *mddev) } if (mddev->recovery_cp != MaxSector) - printk(KERN_NOTICE "raid10: %s is not clean" + printk(KERN_NOTICE "md/raid10:%s: not clean" " -- starting background reconstruction\n", mdname(mddev)); printk(KERN_INFO - "raid10: raid set %s active with %d out of %d devices\n", + "md/raid10:%s: active with %d out of %d devices\n", mdname(mddev), conf->raid_disks - mddev->degraded, conf->raid_disks); /* @@ -2420,7 +2429,8 @@ static void *raid10_takeover_raid0(mddev_t *mddev) conf_t *conf; if (mddev->degraded > 0) { - printk(KERN_ERR "error: degraded raid0!\n"); + printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n", + mdname(mddev)); return ERR_PTR(-EINVAL); } @@ -2458,7 +2468,9 @@ static void *raid10_takeover(mddev_t *mddev) /* for raid0 takeover only one zone is supported */ raid0_priv = mddev->private; if (raid0_priv->nr_strip_zones > 1) { - printk(KERN_ERR "md: cannot takeover raid 0 with more than one zone.\n"); + printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0" + " with more than one zone.\n", + mdname(mddev)); return ERR_PTR(-EINVAL); } return raid10_takeover_raid0(mddev); From b5a20961f3479dda48bdc340354ee5469997839d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 3 May 2010 15:06:27 +1000 Subject: [PATCH 39/45] md/raid0: tidy up printk messages. All messages now start md/raid0:md-device-name: Signed-off-by: NeilBrown --- drivers/md/raid0.c | 110 ++++++++++++++++++++++++++------------------- 1 file changed, 65 insertions(+), 45 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 9f9c6b76ca7c3..dc38c1a451665 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -75,10 +75,10 @@ static void dump_zones(mddev_t *mddev) for (j = 0; j < conf->nr_strip_zones; j++) { printk(KERN_INFO "zone%d=[", j); for (k = 0; k < conf->strip_zone[j].nb_dev; k++) - printk("%s/", + printk(KERN_CONT "%s/", bdevname(conf->devlist[j*raid_disks + k]->bdev, b)); - printk("]\n"); + printk(KERN_CONT "]\n"); zone_size = conf->strip_zone[j].zone_end - zone_start; printk(KERN_INFO " zone offset=%llukb " @@ -104,8 +104,9 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf) if (!conf) return -ENOMEM; list_for_each_entry(rdev1, &mddev->disks, same_set) { - printk(KERN_INFO "raid0: looking at %s\n", - bdevname(rdev1->bdev,b)); + printk(KERN_INFO "md/raid0:%s: looking at %s\n", + mdname(mddev), + bdevname(rdev1->bdev, b)); c = 0; /* round size to chunk_size */ @@ -114,14 +115,16 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf) rdev1->sectors = sectors * mddev->chunk_sectors; list_for_each_entry(rdev2, &mddev->disks, same_set) { - printk(KERN_INFO "raid0: comparing %s(%llu)", + printk(KERN_INFO "md/raid0:%s: comparing %s(%llu)", + mdname(mddev), bdevname(rdev1->bdev,b), (unsigned long long)rdev1->sectors); - printk(KERN_INFO " with %s(%llu)\n", + printk(KERN_CONT " with %s(%llu)\n", bdevname(rdev2->bdev,b), (unsigned long long)rdev2->sectors); if (rdev2 == rdev1) { - printk(KERN_INFO "raid0: END\n"); + printk(KERN_INFO "md/raid0:%s: END\n", + mdname(mddev)); break; } if (rdev2->sectors == rdev1->sectors) { @@ -129,20 +132,24 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf) * Not unique, don't count it as a new * group */ - printk(KERN_INFO "raid0: EQUAL\n"); + printk(KERN_INFO "md/raid0:%s: EQUAL\n", + mdname(mddev)); c = 1; break; } - printk(KERN_INFO "raid0: NOT EQUAL\n"); + printk(KERN_INFO "md/raid0:%s: NOT EQUAL\n", + mdname(mddev)); } if (!c) { - printk(KERN_INFO "raid0: ==> UNIQUE\n"); + printk(KERN_INFO "md/raid0:%s: ==> UNIQUE\n", + mdname(mddev)); conf->nr_strip_zones++; - printk(KERN_INFO "raid0: %d zones\n", - conf->nr_strip_zones); + printk(KERN_INFO "md/raid0:%s: %d zones\n", + mdname(mddev), conf->nr_strip_zones); } } - printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones); + printk(KERN_INFO "md/raid0:%s: FINAL %d zones\n", + mdname(mddev), conf->nr_strip_zones); err = -ENOMEM; conf->strip_zone = kzalloc(sizeof(struct strip_zone)* conf->nr_strip_zones, GFP_KERNEL); @@ -170,13 +177,13 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf) j /= 2; if (j < 0 || j >= mddev->raid_disks) { - printk(KERN_ERR "raid0: bad disk number %d - " - "aborting!\n", j); + printk(KERN_ERR "md/raid0:%s: bad disk number %d - " + "aborting!\n", mdname(mddev), j); goto abort; } if (dev[j]) { - printk(KERN_ERR "raid0: multiple devices for %d - " - "aborting!\n", j); + printk(KERN_ERR "md/raid0:%s: multiple devices for %d - " + "aborting!\n", mdname(mddev), j); goto abort; } dev[j] = rdev1; @@ -198,8 +205,8 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf) cnt++; } if (cnt != mddev->raid_disks) { - printk(KERN_ERR "raid0: too few disks (%d of %d) - " - "aborting!\n", cnt, mddev->raid_disks); + printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - " + "aborting!\n", mdname(mddev), cnt, mddev->raid_disks); goto abort; } zone->nb_dev = cnt; @@ -215,39 +222,44 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf) zone = conf->strip_zone + i; dev = conf->devlist + i * mddev->raid_disks; - printk(KERN_INFO "raid0: zone %d\n", i); + printk(KERN_INFO "md/raid0:%s: zone %d\n", + mdname(mddev), i); zone->dev_start = smallest->sectors; smallest = NULL; c = 0; for (j=0; jdevlist[j]; - printk(KERN_INFO "raid0: checking %s ...", - bdevname(rdev->bdev, b)); + printk(KERN_INFO "md/raid0:%s: checking %s ...", + mdname(mddev), + bdevname(rdev->bdev, b)); if (rdev->sectors <= zone->dev_start) { - printk(KERN_INFO " nope.\n"); + printk(KERN_CONT " nope.\n"); continue; } - printk(KERN_INFO " contained as device %d\n", c); + printk(KERN_CONT " contained as device %d\n", c); dev[c] = rdev; c++; if (!smallest || rdev->sectors < smallest->sectors) { smallest = rdev; - printk(KERN_INFO " (%llu) is smallest!.\n", - (unsigned long long)rdev->sectors); + printk(KERN_INFO "md/raid0:%s: (%llu) is smallest!.\n", + mdname(mddev), + (unsigned long long)rdev->sectors); } } zone->nb_dev = c; sectors = (smallest->sectors - zone->dev_start) * c; - printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", - zone->nb_dev, (unsigned long long)sectors); + printk(KERN_INFO "md/raid0:%s: zone->nb_dev: %d, sectors: %llu\n", + mdname(mddev), + zone->nb_dev, (unsigned long long)sectors); curr_zone_end += sectors; zone->zone_end = curr_zone_end; - printk(KERN_INFO "raid0: current zone start: %llu\n", - (unsigned long long)smallest->sectors); + printk(KERN_INFO "md/raid0:%s: current zone start: %llu\n", + mdname(mddev), + (unsigned long long)smallest->sectors); } mddev->queue->unplug_fn = raid0_unplug; mddev->queue->backing_dev_info.congested_fn = raid0_congested; @@ -258,7 +270,7 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf) * chunk size is a multiple of that sector size */ if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) { - printk(KERN_ERR "%s chunk_size of %d not valid\n", + printk(KERN_ERR "md/raid0:%s: chunk_size of %d not valid\n", mdname(mddev), mddev->chunk_sectors << 9); goto abort; @@ -268,7 +280,7 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf) blk_queue_io_opt(mddev->queue, (mddev->chunk_sectors << 9) * mddev->raid_disks); - printk(KERN_INFO "raid0: done.\n"); + printk(KERN_INFO "md/raid0:%s: done.\n", mdname(mddev)); *private_conf = conf; return 0; @@ -331,7 +343,8 @@ static int raid0_run(mddev_t *mddev) int ret; if (mddev->chunk_sectors == 0) { - printk(KERN_ERR "md/raid0: chunk size must be set.\n"); + printk(KERN_ERR "md/raid0:%s: chunk size must be set.\n", + mdname(mddev)); return -EINVAL; } if (md_check_no_bitmap(mddev)) @@ -357,8 +370,9 @@ static int raid0_run(mddev_t *mddev) /* calculate array device size */ md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); - printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", - (unsigned long long)mddev->array_sectors); + printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n", + mdname(mddev), + (unsigned long long)mddev->array_sectors); /* calculate the max read-ahead size. * For read-ahead of large files to be effective, we need to * readahead at least twice a whole stripe. i.e. number of devices @@ -516,9 +530,10 @@ static int raid0_make_request(mddev_t *mddev, struct bio *bio) return 1; bad_map: - printk("raid0_make_request bug: can't convert block across chunks" - " or bigger than %dk %llu %d\n", chunk_sects / 2, - (unsigned long long)bio->bi_sector, bio->bi_size >> 10); + printk("md/raid0:%s: make_request bug: can't convert block across chunks" + " or bigger than %dk %llu %d\n", + mdname(mddev), chunk_sects / 2, + (unsigned long long)bio->bi_sector, bio->bi_size >> 10); bio_io_error(bio); return 0; @@ -563,7 +578,8 @@ static void *raid0_takeover_raid5(mddev_t *mddev) raid0_conf_t *priv_conf; if (mddev->degraded != 1) { - printk(KERN_ERR "md: raid5 must be degraded! Degraded disks: %d\n", + printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n", + mdname(mddev), mddev->degraded); return ERR_PTR(-EINVAL); } @@ -571,7 +587,8 @@ static void *raid0_takeover_raid5(mddev_t *mddev) list_for_each_entry(rdev, &mddev->disks, same_set) { /* check slot number for a disk */ if (rdev->raid_disk == mddev->raid_disks-1) { - printk(KERN_ERR "md: raid5 must have missing parity disk!\n"); + printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n", + mdname(mddev)); return ERR_PTR(-EINVAL); } } @@ -599,16 +616,19 @@ static void *raid0_takeover_raid10(mddev_t *mddev) * - all mirrors must be already degraded */ if (mddev->layout != ((1 << 8) + 2)) { - printk(KERN_ERR "md: Raid0 cannot takover layout: %x\n", + printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takover layout: 0x%x\n", + mdname(mddev), mddev->layout); return ERR_PTR(-EINVAL); } if (mddev->raid_disks & 1) { - printk(KERN_ERR "md: Raid0 cannot takover Raid10 with odd disk number.\n"); + printk(KERN_ERR "md/raid0:%s: Raid0 cannot takover Raid10 with odd disk number.\n", + mdname(mddev)); return ERR_PTR(-EINVAL); } if (mddev->degraded != (mddev->raid_disks>>1)) { - printk(KERN_ERR "md: All mirrors must be already degraded!\n"); + printk(KERN_ERR "md/raid0:%s: All mirrors must be already degraded!\n", + mdname(mddev)); return ERR_PTR(-EINVAL); } @@ -636,8 +656,8 @@ static void *raid0_takeover(mddev_t *mddev) if (mddev->layout == ALGORITHM_PARITY_N) return raid0_takeover_raid5(mddev); - printk(KERN_ERR "md: Raid can only takeover Raid5 with layout: %d\n", - ALGORITHM_PARITY_N); + printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n", + mdname(mddev), ALGORITHM_PARITY_N); } if (mddev->level == 10) From 2dc40f80945ac3e5ec05c3a6c75baf09b13cee51 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 3 May 2010 15:12:04 +1000 Subject: [PATCH 40/45] md/linear: standardise all printk messages md/linear:mdname: Signed-off-by: NeilBrown --- drivers/md/linear.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 3204a2263f216..d5d5064c4a668 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -158,7 +158,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) sector_t sectors; if (j < 0 || j >= raid_disks || disk->rdev) { - printk("linear: disk numbering problem. Aborting!\n"); + printk(KERN_ERR "md/linear:%s: disk numbering problem. Aborting!\n", + mdname(mddev)); goto out; } @@ -186,7 +187,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) } if (cnt != raid_disks) { - printk("linear: not enough drives present. Aborting!\n"); + printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n", + mdname(mddev)); goto out; } @@ -305,12 +307,14 @@ static int linear_make_request (mddev_t *mddev, struct bio *bio) || (bio->bi_sector < start_sector))) { char b[BDEVNAME_SIZE]; - printk("linear_make_request: Sector %llu out of bounds on " - "dev %s: %llu sectors, offset %llu\n", - (unsigned long long)bio->bi_sector, - bdevname(tmp_dev->rdev->bdev, b), - (unsigned long long)tmp_dev->rdev->sectors, - (unsigned long long)start_sector); + printk(KERN_ERR + "md/linear:%s: make_request: Sector %llu out of bounds on " + "dev %s: %llu sectors, offset %llu\n", + mdname(mddev), + (unsigned long long)bio->bi_sector, + bdevname(tmp_dev->rdev->bdev, b), + (unsigned long long)tmp_dev->rdev->sectors, + (unsigned long long)start_sector); rcu_read_unlock(); bio_io_error(bio); return 0; From af3a2cd6b8a479345786e7fe5e199ad2f6240e56 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sat, 8 May 2010 08:20:17 +1000 Subject: [PATCH 41/45] md: Fix read balancing in RAID1 and RAID10 on drives > 2TB read_balance uses a "unsigned long" for a sector number which will get truncated beyond 2TB. This will cause read-balancing to be non-optimal, and can cause data to be read from the 'wrong' branch during a resync. This has a very small chance of returning wrong data. Reported-by: Jordan Russell Cc: stable@kernel.org Signed-off-by: NeilBrown --- drivers/md/raid1.c | 4 ++-- drivers/md/raid10.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 1db02c4955a9c..a9b9972ff703c 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -418,7 +418,7 @@ static void raid1_end_write_request(struct bio *bio, int error) */ static int read_balance(conf_t *conf, r1bio_t *r1_bio) { - const unsigned long this_sector = r1_bio->sector; + const sector_t this_sector = r1_bio->sector; int new_disk = conf->last_used, disk = new_disk; int wonly_disk = -1; const int sectors = r1_bio->sectors; @@ -434,7 +434,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) retry: if (conf->mddev->recovery_cp < MaxSector && (this_sector + sectors >= conf->next_resync)) { - /* Choose the first operation device, for consistancy */ + /* Choose the first operational device, for consistancy */ new_disk = 0; for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index e0742c4394843..a1f5fd2d69ce3 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -495,7 +495,7 @@ static int raid10_mergeable_bvec(struct request_queue *q, */ static int read_balance(conf_t *conf, r10bio_t *r10_bio) { - const unsigned long this_sector = r10_bio->sector; + const sector_t this_sector = r10_bio->sector; int disk, slot, nslot; const int sectors = r10_bio->sectors; sector_t new_distance, current_distance; From 75a73a29e520a6ce982b0da6dd8b7560ae3faa90 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 7 May 2010 19:44:26 +1000 Subject: [PATCH 42/45] md: restore ability of spare drives to spin down. Some time ago we stopped the clean/active metadata updates from being written to a 'spare' device in most cases so that it could spin down and say spun down. Device failure/removal etc are still recorded on spares. However commit 51d5668cb2e3fd1827a55 broke this 50% of the time, depending on whether the event count is even or odd. The change log entry said: This means that the alignment between 'odd/even' and 'clean/dirty' might take a little longer to attain, how ever the code makes no attempt to create that alignment, so it could take arbitrarily long. So when we find that clean/dirty is not aligned with odd/even, force a second metadata-update immediately. There are already cases where a second metadata-update is needed immediately (e.g. when a device fails during the metadata update). We just piggy-back on that. Reported-by: Joe Bryant Signed-off-by: NeilBrown Cc: stable@kernel.org --- drivers/md/md.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index b8a0fcfb1de13..fec4abcb9bb4d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2150,12 +2150,18 @@ static void md_update_sb(mddev_t * mddev, int force_change) if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ /* .. if the array isn't clean, an 'even' event must also go * to spares. */ - if ((mddev->events&1)==0) + if ((mddev->events&1)==0) { nospares = 0; + sync_req = 2; /* force a second update to get the + * even/odd in sync */ + } } else { /* otherwise an 'odd' event must go to spares */ - if ((mddev->events&1)) + if ((mddev->events&1)) { nospares = 0; + sync_req = 2; /* force a second update to get the + * even/odd in sync */ + } } } From 7b0bb5368a7195606eca475d9f4e291ab7227052 Mon Sep 17 00:00:00 2001 From: "Gabriele A. Trombetti" Date: Wed, 28 Apr 2010 11:51:17 +1000 Subject: [PATCH 43/45] md/raid6: Fix raid-6 read-error correction in degraded state Fix: Raid-6 was not trying to correct a read-error when in singly-degraded state and was instead dropping one more device, going to doubly-degraded state. This patch fixes this behaviour. Tested-by: Janos Haar Signed-off-by: Gabriele A. Trombetti Reported-by: Janos Haar Signed-off-by: NeilBrown Cc: stable@kernel.org --- drivers/md/raid5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index cee9f93b35c4c..eacf02a6ec5fa 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1527,7 +1527,7 @@ static void raid5_end_read_request(struct bio * bi, int error) clear_bit(R5_UPTODATE, &sh->dev[i].flags); atomic_inc(&rdev->read_errors); - if (conf->mddev->degraded) + if (conf->mddev->degraded >= conf->max_degraded) printk_rl(KERN_WARNING "md/raid:%s: read error not correctable " "(sector %llu on %s).\n", From a8707c08f4f718bb0ed65499d3f43201f6e41455 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 18 May 2010 09:28:43 +1000 Subject: [PATCH 44/45] md: simplify updating of event count to sometimes avoid updating spares. When updating the event count for a simple clean <-> dirty transition, we try to avoid updating the spares so they can safely spin-down. As the event_counts across an array must be +/- 1, this means decrementing the event_count on a dirty->clean transition. This is not always safe and we have to avoid the unsafe time. We current do this with a misguided idea about it being safe or not depending on whether the event_count is odd or even. This approach only works reliably in a few common instances, but easily falls down. So instead, simply keep internal state concerning whether it is safe or not, and always assume it is not safe when an array is first assembled. Signed-off-by: NeilBrown --- drivers/md/md.c | 26 ++++++-------------------- drivers/md/md.h | 6 ++++++ 2 files changed, 12 insertions(+), 20 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index fec4abcb9bb4d..9ef21d9b8e273 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2088,7 +2088,6 @@ static void sync_sbs(mddev_t * mddev, int nospares) if (rdev->sb_events == mddev->events || (nospares && rdev->raid_disk < 0 && - (rdev->sb_events&1)==0 && rdev->sb_events+1 == mddev->events)) { /* Don't update this superblock */ rdev->sb_loaded = 2; @@ -2141,28 +2140,14 @@ static void md_update_sb(mddev_t * mddev, int force_change) * and 'events' is odd, we can roll back to the previous clean state */ if (nospares && (mddev->in_sync && mddev->recovery_cp == MaxSector) - && (mddev->events & 1) - && mddev->events != 1) + && mddev->can_decrease_events + && mddev->events != 1) { mddev->events--; - else { + mddev->can_decrease_events = 0; + } else { /* otherwise we have to go forward and ... */ mddev->events ++; - if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ - /* .. if the array isn't clean, an 'even' event must also go - * to spares. */ - if ((mddev->events&1)==0) { - nospares = 0; - sync_req = 2; /* force a second update to get the - * even/odd in sync */ - } - } else { - /* otherwise an 'odd' event must go to spares */ - if ((mddev->events&1)) { - nospares = 0; - sync_req = 2; /* force a second update to get the - * even/odd in sync */ - } - } + mddev->can_decrease_events = nospares; } if (!mddev->events) { @@ -4606,6 +4591,7 @@ static void md_clean(mddev_t *mddev) mddev->layout = 0; mddev->max_disks = 0; mddev->events = 0; + mddev->can_decrease_events = 0; mddev->delta_disks = 0; mddev->new_level = LEVEL_NONE; mddev->new_layout = 0; diff --git a/drivers/md/md.h b/drivers/md/md.h index a536f54580970..7ab5ea155452c 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -150,6 +150,12 @@ struct mddev_s int external_size; /* size managed * externally */ __u64 events; + /* If the last 'event' was simply a clean->dirty transition, and + * we didn't write it to the spares, then it is safe and simple + * to just decrement the event count on a dirty->clean transition. + * So we record that possibility here. + */ + int can_decrease_events; char uuid[16]; From be6800a73aa2f3dc14744c3b80e676d189789f04 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 18 May 2010 10:17:09 +1000 Subject: [PATCH 45/45] md: don't insist on valid event count for spare devices. Devices which know that they are spares do not really need to have an event count that matches the rest of the array, so there are no data-in-sync issues. It is enough that the uuid matches. So remove the requirement that the event count is up-to-date. We currently still write out and event count on spares, but this allows us in a year or 3 to stop doing that completely. Signed-off-by: NeilBrown --- drivers/md/md.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 9ef21d9b8e273..26b3d2879d93d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1070,10 +1070,13 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->bitmap_info.default_offset; } else if (mddev->pers == NULL) { - /* Insist on good event counter while assembling */ + /* Insist on good event counter while assembling, except + * for spares (which don't need an event count) */ ++ev1; - if (ev1 < mddev->events) - return -EINVAL; + if (sb->disks[rdev->desc_nr].state & ( + (1<events) + return -EINVAL; } else if (mddev->bitmap) { /* if adding to array with a bitmap, then we can accept an * older device ... but not too old. @@ -1469,10 +1472,14 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) } } else if (mddev->pers == NULL) { - /* Insist of good event counter while assembling */ + /* Insist of good event counter while assembling, except for + * spares (which don't need an event count) */ ++ev1; - if (ev1 < mddev->events) - return -EINVAL; + if (rdev->desc_nr >= 0 && + rdev->desc_nr < le32_to_cpu(sb->max_dev) && + le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe) + if (ev1 < mddev->events) + return -EINVAL; } else if (mddev->bitmap) { /* If adding to array with a bitmap, then we can accept an * older device, but not too old.