Skip to content

Commit

Permalink
Merge tag 'md-3.3' of git://neil.brown.name/md
Browse files Browse the repository at this point in the history
md update for 3.3

Big change is new hot-replacement.
A slot in an array can hold 2 devices - one that
wants-replacement and one that is the replacement.
Once the replacement is built - either from the
original or (in the case of errors) from elsewhere,
the wants-replacement device will be removed.

* tag 'md-3.3' of git://neil.brown.name/md: (36 commits)
  md/raid1: Mark device want_replacement when we see a write error.
  md/raid1: If there is a spare and a want_replacement device, start replacement.
  md/raid1: recognise replacements when assembling arrays.
  md/raid1: handle activation of replacement device when recovery completes.
  md/raid1: Allow a failed replacement device to be removed.
  md/raid1: Allocate spare to store replacement devices and their bios.
  md/raid1:  Replace use of mddev->raid_disks with conf->raid_disks.
  md/raid10: If there is a spare and a want_replacement device, start replacement.
  md/raid10: recognise replacements when assembling array.
  md/raid10: Allow replacement device to be replace old drive.
  md/raid10: handle recovery of replacement devices.
  md/raid10:  Handle replacement devices during resync.
  md/raid10: writes should get directed to replacement as well as original.
  md/raid10: allow removal of failed replacement devices.
  md/raid10: preferentially read from replacement device if possible.
  md/raid10:  change read_balance to return an rdev
  md/raid10: prepare data structures for handling replacement.
  md/raid5: Mark device want_replacement when we see a write error.
  md/raid5: If there is a spare and a want_replacement device, start replacement.
  md/raid5: recognise replacements when assembling array.
  ...
  • Loading branch information
Linus Torvalds committed Jan 8, 2012
2 parents 9879326 + 19d6716 commit 2943c83
Show file tree
Hide file tree
Showing 13 changed files with 1,280 additions and 438 deletions.
22 changes: 18 additions & 4 deletions Documentation/md.txt
Original file line number Diff line number Diff line change
Expand Up @@ -357,14 +357,14 @@ Each directory contains:
written to, that device.

state
A file recording the current state of the device in the array
A file recording the current state of the device in the array
which can be a comma separated list of
faulty - device has been kicked from active use due to
a detected fault or it has unacknowledged bad
blocks
a detected fault, or it has unacknowledged bad
blocks
in_sync - device is a fully in-sync member of the array
writemostly - device will only be subject to read
requests if there are no other options.
requests if there are no other options.
This applies only to raid1 arrays.
blocked - device has failed, and the failure hasn't been
acknowledged yet by the metadata handler.
Expand All @@ -374,6 +374,13 @@ Each directory contains:
This includes spares that are in the process
of being recovered to
write_error - device has ever seen a write error.
want_replacement - device is (mostly) working but probably
should be replaced, either due to errors or
due to user request.
replacement - device is a replacement for another active
device with same raid_disk.


This list may grow in future.
This can be written to.
Writing "faulty" simulates a failure on the device.
Expand All @@ -386,6 +393,13 @@ Each directory contains:
Writing "in_sync" sets the in_sync flag.
Writing "write_error" sets writeerrorseen flag.
Writing "-write_error" clears writeerrorseen flag.
Writing "want_replacement" is allowed at any time except to a
replacement device or a spare. It sets the flag.
Writing "-want_replacement" is allowed at any time. It clears
the flag.
Writing "replacement" or "-replacement" is only allowed before
starting the array. It sets or clears the flag.


This file responds to select/poll. Any change to 'faulty'
or 'blocked' causes an event.
Expand Down
12 changes: 6 additions & 6 deletions drivers/md/bitmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -1149,12 +1149,12 @@ void bitmap_daemon_work(struct mddev *mddev)
return;
}
if (time_before(jiffies, bitmap->daemon_lastrun
+ bitmap->mddev->bitmap_info.daemon_sleep))
+ mddev->bitmap_info.daemon_sleep))
goto done;

bitmap->daemon_lastrun = jiffies;
if (bitmap->allclean) {
bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
goto done;
}
bitmap->allclean = 1;
Expand Down Expand Up @@ -1206,7 +1206,7 @@ void bitmap_daemon_work(struct mddev *mddev)
* sure that events_cleared is up-to-date.
*/
if (bitmap->need_sync &&
bitmap->mddev->bitmap_info.external == 0) {
mddev->bitmap_info.external == 0) {
bitmap_super_t *sb;
bitmap->need_sync = 0;
sb = kmap_atomic(bitmap->sb_page, KM_USER0);
Expand Down Expand Up @@ -1270,8 +1270,8 @@ void bitmap_daemon_work(struct mddev *mddev)

done:
if (bitmap->allclean == 0)
bitmap->mddev->thread->timeout =
bitmap->mddev->bitmap_info.daemon_sleep;
mddev->thread->timeout =
mddev->bitmap_info.daemon_sleep;
mutex_unlock(&mddev->bitmap_info.mutex);
}

Expand Down Expand Up @@ -1587,7 +1587,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
}
if (!*bmc) {
struct page *page;
*bmc = 1 | (needed ? NEEDED_MASK : 0);
*bmc = 2 | (needed ? NEEDED_MASK : 0);
bitmap_count_page(bitmap, offset, 1);
page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap));
set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
Expand Down
107 changes: 80 additions & 27 deletions drivers/md/md.c
Original file line number Diff line number Diff line change
Expand Up @@ -1713,6 +1713,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
}
if (sb->devflags & WriteMostly1)
set_bit(WriteMostly, &rdev->flags);
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
set_bit(Replacement, &rdev->flags);
} else /* MULTIPATH are always insync */
set_bit(In_sync, &rdev->flags);

Expand Down Expand Up @@ -1766,6 +1768,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->recovery_offset =
cpu_to_le64(rdev->recovery_offset);
}
if (test_bit(Replacement, &rdev->flags))
sb->feature_map |=
cpu_to_le32(MD_FEATURE_REPLACEMENT);

if (mddev->reshape_position != MaxSector) {
sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
Expand Down Expand Up @@ -2559,6 +2564,15 @@ state_show(struct md_rdev *rdev, char *page)
len += sprintf(page+len, "%swrite_error", sep);
sep = ",";
}
if (test_bit(WantReplacement, &rdev->flags)) {
len += sprintf(page+len, "%swant_replacement", sep);
sep = ",";
}
if (test_bit(Replacement, &rdev->flags)) {
len += sprintf(page+len, "%sreplacement", sep);
sep = ",";
}

return len+sprintf(page+len, "\n");
}

Expand Down Expand Up @@ -2627,6 +2641,42 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
} else if (cmd_match(buf, "-write_error")) {
clear_bit(WriteErrorSeen, &rdev->flags);
err = 0;
} else if (cmd_match(buf, "want_replacement")) {
/* Any non-spare device that is not a replacement can
* become want_replacement at any time, but we then need to
* check if recovery is needed.
*/
if (rdev->raid_disk >= 0 &&
!test_bit(Replacement, &rdev->flags))
set_bit(WantReplacement, &rdev->flags);
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
md_wakeup_thread(rdev->mddev->thread);
err = 0;
} else if (cmd_match(buf, "-want_replacement")) {
/* Clearing 'want_replacement' is always allowed.
* Once replacements starts it is too late though.
*/
err = 0;
clear_bit(WantReplacement, &rdev->flags);
} else if (cmd_match(buf, "replacement")) {
/* Can only set a device as a replacement when array has not
* yet been started. Once running, replacement is automatic
* from spares, or by assigning 'slot'.
*/
if (rdev->mddev->pers)
err = -EBUSY;
else {
set_bit(Replacement, &rdev->flags);
err = 0;
}
} else if (cmd_match(buf, "-replacement")) {
/* Similarly, can only clear Replacement before start */
if (rdev->mddev->pers)
err = -EBUSY;
else {
clear_bit(Replacement, &rdev->flags);
err = 0;
}
}
if (!err)
sysfs_notify_dirent_safe(rdev->sysfs_state);
Expand Down Expand Up @@ -2688,15 +2738,14 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
if (rdev->mddev->pers->hot_remove_disk == NULL)
return -EINVAL;
err = rdev->mddev->pers->
hot_remove_disk(rdev->mddev, rdev->raid_disk);
hot_remove_disk(rdev->mddev, rdev);
if (err)
return err;
sysfs_unlink_rdev(rdev->mddev, rdev);
rdev->raid_disk = -1;
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
md_wakeup_thread(rdev->mddev->thread);
} else if (rdev->mddev->pers) {
struct md_rdev *rdev2;
/* Activating a spare .. or possibly reactivating
* if we ever get bitmaps working here.
*/
Expand All @@ -2710,10 +2759,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
if (rdev->mddev->pers->hot_add_disk == NULL)
return -EINVAL;

list_for_each_entry(rdev2, &rdev->mddev->disks, same_set)
if (rdev2->raid_disk == slot)
return -EEXIST;

if (slot >= rdev->mddev->raid_disks &&
slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
return -ENOSPC;
Expand Down Expand Up @@ -6053,8 +6098,15 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
struct mddev *mddev = NULL;
int ro;

if (!capable(CAP_SYS_ADMIN))
return -EACCES;
switch (cmd) {
case RAID_VERSION:
case GET_ARRAY_INFO:
case GET_DISK_INFO:
break;
default:
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
}

/*
* Commands dealing with the RAID driver but not any
Expand Down Expand Up @@ -6714,8 +6766,11 @@ static int md_seq_show(struct seq_file *seq, void *v)
if (test_bit(Faulty, &rdev->flags)) {
seq_printf(seq, "(F)");
continue;
} else if (rdev->raid_disk < 0)
}
if (rdev->raid_disk < 0)
seq_printf(seq, "(S)"); /* spare */
if (test_bit(Replacement, &rdev->flags))
seq_printf(seq, "(R)");
sectors += rdev->sectors;
}

Expand Down Expand Up @@ -7337,29 +7392,27 @@ static int remove_and_add_spares(struct mddev *mddev)
! test_bit(In_sync, &rdev->flags)) &&
atomic_read(&rdev->nr_pending)==0) {
if (mddev->pers->hot_remove_disk(
mddev, rdev->raid_disk)==0) {
mddev, rdev) == 0) {
sysfs_unlink_rdev(mddev, rdev);
rdev->raid_disk = -1;
}
}

if (mddev->degraded) {
list_for_each_entry(rdev, &mddev->disks, same_set) {
if (rdev->raid_disk >= 0 &&
!test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags))
list_for_each_entry(rdev, &mddev->disks, same_set) {
if (rdev->raid_disk >= 0 &&
!test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags))
spares++;
if (rdev->raid_disk < 0
&& !test_bit(Faulty, &rdev->flags)) {
rdev->recovery_offset = 0;
if (mddev->pers->
hot_add_disk(mddev, rdev) == 0) {
if (sysfs_link_rdev(mddev, rdev))
/* failure here is OK */;
spares++;
if (rdev->raid_disk < 0
&& !test_bit(Faulty, &rdev->flags)) {
rdev->recovery_offset = 0;
if (mddev->pers->
hot_add_disk(mddev, rdev) == 0) {
if (sysfs_link_rdev(mddev, rdev))
/* failure here is OK */;
spares++;
md_new_event(mddev);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
}
md_new_event(mddev);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
}
}
}
Expand Down Expand Up @@ -7474,7 +7527,7 @@ void md_check_recovery(struct mddev *mddev)
test_bit(Faulty, &rdev->flags) &&
atomic_read(&rdev->nr_pending)==0) {
if (mddev->pers->hot_remove_disk(
mddev, rdev->raid_disk)==0) {
mddev, rdev) == 0) {
sysfs_unlink_rdev(mddev, rdev);
rdev->raid_disk = -1;
}
Expand Down
82 changes: 49 additions & 33 deletions drivers/md/md.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,34 +72,7 @@ struct md_rdev {
* This reduces the burden of testing multiple flags in many cases
*/

unsigned long flags;
#define Faulty 1 /* device is known to have a fault */
#define In_sync 2 /* device is in_sync with rest of array */
#define WriteMostly 4 /* Avoid reading if at all possible */
#define AutoDetected 7 /* added by auto-detect */
#define Blocked 8 /* An error occurred but has not yet
* been acknowledged by the metadata
* handler, so don't allow writes
* until it is cleared */
#define WriteErrorSeen 9 /* A write error has been seen on this
* device
*/
#define FaultRecorded 10 /* Intermediate state for clearing
* Blocked. The Fault is/will-be
* recorded in the metadata, but that
* metadata hasn't been stored safely
* on disk yet.
*/
#define BlockedBadBlocks 11 /* A writer is blocked because they
* found an unacknowledged bad-block.
* This can safely be cleared at any
* time, and the writer will re-check.
* It may be set at any time, and at
* worst the writer will timeout and
* re-check. So setting it as
* accurately as possible is good, but
* not absolutely critical.
*/
unsigned long flags; /* bit set of 'enum flag_bits' bits. */
wait_queue_head_t blocked_wait;

int desc_nr; /* descriptor index in the superblock */
Expand Down Expand Up @@ -152,6 +125,44 @@ struct md_rdev {
sector_t size; /* in sectors */
} badblocks;
};
enum flag_bits {
Faulty, /* device is known to have a fault */
In_sync, /* device is in_sync with rest of array */
WriteMostly, /* Avoid reading if at all possible */
AutoDetected, /* added by auto-detect */
Blocked, /* An error occurred but has not yet
* been acknowledged by the metadata
* handler, so don't allow writes
* until it is cleared */
WriteErrorSeen, /* A write error has been seen on this
* device
*/
FaultRecorded, /* Intermediate state for clearing
* Blocked. The Fault is/will-be
* recorded in the metadata, but that
* metadata hasn't been stored safely
* on disk yet.
*/
BlockedBadBlocks, /* A writer is blocked because they
* found an unacknowledged bad-block.
* This can safely be cleared at any
* time, and the writer will re-check.
* It may be set at any time, and at
* worst the writer will timeout and
* re-check. So setting it as
* accurately as possible is good, but
* not absolutely critical.
*/
WantReplacement, /* This device is a candidate to be
* hot-replaced, either because it has
* reported some faults, or because
* of explicit request.
*/
Replacement, /* This device is a replacement for
* a want_replacement device with same
* raid_disk number.
*/
};

#define BB_LEN_MASK (0x00000000000001FFULL)
#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
Expand Down Expand Up @@ -428,7 +439,7 @@ struct md_personality
*/
void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev);
int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev);
int (*hot_remove_disk) (struct mddev *mddev, int number);
int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev);
int (*spare_active) (struct mddev *mddev);
sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster);
int (*resize) (struct mddev *mddev, sector_t sectors);
Expand Down Expand Up @@ -482,15 +493,20 @@ static inline char * mdname (struct mddev * mddev)
static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
char nm[20];
sprintf(nm, "rd%d", rdev->raid_disk);
return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
if (!test_bit(Replacement, &rdev->flags)) {
sprintf(nm, "rd%d", rdev->raid_disk);
return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
} else
return 0;
}

static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
char nm[20];
sprintf(nm, "rd%d", rdev->raid_disk);
sysfs_remove_link(&mddev->kobj, nm);
if (!test_bit(Replacement, &rdev->flags)) {
sprintf(nm, "rd%d", rdev->raid_disk);
sysfs_remove_link(&mddev->kobj, nm);
}
}

/*
Expand Down
Loading

0 comments on commit 2943c83

Please sign in to comment.