diff --git a/Documentation/md.txt b/Documentation/md.txt
index fc94770f44aba..993fba37b7d18 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -357,14 +357,14 @@ Each directory contains:
         written to, that device.
 
       state
-        A file recording the current state of the device in the array
+	A file recording the current state of the device in the array
 	which can be a comma separated list of
 	      faulty   - device has been kicked from active use due to
-                         a detected fault or it has unacknowledged bad
-                         blocks
+			 a detected fault, or it has unacknowledged bad
+			 blocks
 	      in_sync  - device is a fully in-sync member of the array
 	      writemostly - device will only be subject to read
-		         requests if there are no other options.
+			 requests if there are no other options.
 			 This applies only to raid1 arrays.
 	      blocked  - device has failed, and the failure hasn't been
 			 acknowledged yet by the metadata handler.
@@ -374,6 +374,13 @@ Each directory contains:
 			 This includes spares that are in the process
 			 of being recovered to
 	      write_error - device has ever seen a write error.
+	      want_replacement - device is (mostly) working but probably
+			 should be replaced, either due to errors or
+			 due to user request.
+	      replacement - device is a replacement for another active
+			 device with same raid_disk.
+
+
 	This list may grow in future.
 	This can be written to.
 	Writing "faulty"  simulates a failure on the device.
@@ -386,6 +393,13 @@ Each directory contains:
 	Writing "in_sync" sets the in_sync flag.
 	Writing "write_error" sets writeerrorseen flag.
 	Writing "-write_error" clears writeerrorseen flag.
+	Writing "want_replacement" is allowed at any time except to a
+		replacement device or a spare.  It sets the flag.
+	Writing "-want_replacement" is allowed at any time.  It clears
+		the flag.
+	Writing "replacement" or "-replacement" is only allowed before
+		starting the array.  It sets or clears the flag.
+
 
 	This file responds to select/poll. Any change to 'faulty'
 	or 'blocked' causes an event.
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0e2288824938c..be569eb41a938 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1714,6 +1714,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 		}
 		if (sb->devflags & WriteMostly1)
 			set_bit(WriteMostly, &rdev->flags);
+		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
+			set_bit(Replacement, &rdev->flags);
 	} else /* MULTIPATH are always insync */
 		set_bit(In_sync, &rdev->flags);
 
@@ -1767,6 +1769,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
 		sb->recovery_offset =
 			cpu_to_le64(rdev->recovery_offset);
 	}
+	if (test_bit(Replacement, &rdev->flags))
+		sb->feature_map |=
+			cpu_to_le32(MD_FEATURE_REPLACEMENT);
 
 	if (mddev->reshape_position != MaxSector) {
 		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
@@ -2560,6 +2565,15 @@ state_show(struct md_rdev *rdev, char *page)
 		len += sprintf(page+len, "%swrite_error", sep);
 		sep = ",";
 	}
+	if (test_bit(WantReplacement, &rdev->flags)) {
+		len += sprintf(page+len, "%swant_replacement", sep);
+		sep = ",";
+	}
+	if (test_bit(Replacement, &rdev->flags)) {
+		len += sprintf(page+len, "%sreplacement", sep);
+		sep = ",";
+	}
+
 	return len+sprintf(page+len, "\n");
 }
 
@@ -2628,6 +2642,42 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
 	} else if (cmd_match(buf, "-write_error")) {
 		clear_bit(WriteErrorSeen, &rdev->flags);
 		err = 0;
+	} else if (cmd_match(buf, "want_replacement")) {
+		/* Any non-spare device that is not a replacement can
+		 * become want_replacement at any time, but we then need to
+		 * check if recovery is needed.
+		 */
+		if (rdev->raid_disk >= 0 &&
+		    !test_bit(Replacement, &rdev->flags))
+			set_bit(WantReplacement, &rdev->flags);
+		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
+		md_wakeup_thread(rdev->mddev->thread);
+		err = 0;
+	} else if (cmd_match(buf, "-want_replacement")) {
+		/* Clearing 'want_replacement' is always allowed.
+		 * Once replacements starts it is too late though.
+		 */
+		err = 0;
+		clear_bit(WantReplacement, &rdev->flags);
+	} else if (cmd_match(buf, "replacement")) {
+		/* Can only set a device as a replacement when array has not
+		 * yet been started.  Once running, replacement is automatic
+		 * from spares, or by assigning 'slot'.
+		 */
+		if (rdev->mddev->pers)
+			err = -EBUSY;
+		else {
+			set_bit(Replacement, &rdev->flags);
+			err = 0;
+		}
+	} else if (cmd_match(buf, "-replacement")) {
+		/* Similarly, can only clear Replacement before start */
+		if (rdev->mddev->pers)
+			err = -EBUSY;
+		else {
+			clear_bit(Replacement, &rdev->flags);
+			err = 0;
+		}
 	}
 	if (!err)
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -6717,8 +6767,11 @@ static int md_seq_show(struct seq_file *seq, void *v)
 			if (test_bit(Faulty, &rdev->flags)) {
 				seq_printf(seq, "(F)");
 				continue;
-			} else if (rdev->raid_disk < 0)
+			}
+			if (rdev->raid_disk < 0)
 				seq_printf(seq, "(S)"); /* spare */
+			if (test_bit(Replacement, &rdev->flags))
+				seq_printf(seq, "(R)");
 			sectors += rdev->sectors;
 		}
 
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 670c10e6b4847..44c63dfeeb2b9 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -72,34 +72,7 @@ struct md_rdev {
 	 * This reduces the burden of testing multiple flags in many cases
 	 */
 
-	unsigned long	flags;
-#define	Faulty		1		/* device is known to have a fault */
-#define	In_sync		2		/* device is in_sync with rest of array */
-#define	WriteMostly	4		/* Avoid reading if at all possible */
-#define	AutoDetected	7		/* added by auto-detect */
-#define Blocked		8		/* An error occurred but has not yet
-					 * been acknowledged by the metadata
-					 * handler, so don't allow writes
-					 * until it is cleared */
-#define WriteErrorSeen	9		/* A write error has been seen on this
-					 * device
-					 */
-#define FaultRecorded	10		/* Intermediate state for clearing
-					 * Blocked.  The Fault is/will-be
-					 * recorded in the metadata, but that
-					 * metadata hasn't been stored safely
-					 * on disk yet.
-					 */
-#define BlockedBadBlocks 11		/* A writer is blocked because they
-					 * found an unacknowledged bad-block.
-					 * This can safely be cleared at any
-					 * time, and the writer will re-check.
-					 * It may be set at any time, and at
-					 * worst the writer will timeout and
-					 * re-check.  So setting it as
-					 * accurately as possible is good, but
-					 * not absolutely critical.
-					 */
+	unsigned long	flags;	/* bit set of 'enum flag_bits' bits. */
 	wait_queue_head_t blocked_wait;
 
 	int desc_nr;			/* descriptor index in the superblock */
@@ -152,6 +125,44 @@ struct md_rdev {
 		sector_t size;		/* in sectors */
 	} badblocks;
 };
+enum flag_bits {
+	Faulty,			/* device is known to have a fault */
+	In_sync,		/* device is in_sync with rest of array */
+	WriteMostly,		/* Avoid reading if at all possible */
+	AutoDetected,		/* added by auto-detect */
+	Blocked,		/* An error occurred but has not yet
+				 * been acknowledged by the metadata
+				 * handler, so don't allow writes
+				 * until it is cleared */
+	WriteErrorSeen,		/* A write error has been seen on this
+				 * device
+				 */
+	FaultRecorded,		/* Intermediate state for clearing
+				 * Blocked.  The Fault is/will-be
+				 * recorded in the metadata, but that
+				 * metadata hasn't been stored safely
+				 * on disk yet.
+				 */
+	BlockedBadBlocks,	/* A writer is blocked because they
+				 * found an unacknowledged bad-block.
+				 * This can safely be cleared at any
+				 * time, and the writer will re-check.
+				 * It may be set at any time, and at
+				 * worst the writer will timeout and
+				 * re-check.  So setting it as
+				 * accurately as possible is good, but
+				 * not absolutely critical.
+				 */
+	WantReplacement,	/* This device is a candidate to be
+				 * hot-replaced, either because it has
+				 * reported some faults, or because
+				 * of explicit request.
+				 */
+	Replacement,		/* This device is a replacement for
+				 * a want_replacement device with same
+				 * raid_disk number.
+				 */
+};
 
 #define BB_LEN_MASK	(0x00000000000001FFULL)
 #define BB_OFFSET_MASK	(0x7FFFFFFFFFFFFE00ULL)
@@ -482,15 +493,20 @@ static inline char * mdname (struct mddev * mddev)
 static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
 {
 	char nm[20];
-	sprintf(nm, "rd%d", rdev->raid_disk);
-	return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+	if (!test_bit(Replacement, &rdev->flags)) {
+		sprintf(nm, "rd%d", rdev->raid_disk);
+		return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+	} else
+		return 0;
 }
 
 static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
 {
 	char nm[20];
-	sprintf(nm, "rd%d", rdev->raid_disk);
-	sysfs_remove_link(&mddev->kobj, nm);
+	if (!test_bit(Replacement, &rdev->flags)) {
+		sprintf(nm, "rd%d", rdev->raid_disk);
+		sysfs_remove_link(&mddev->kobj, nm);
+	}
 }
 
 /*
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h
index 9e65d9e20662d..6f6df86f1ae51 100644
--- a/include/linux/raid/md_p.h
+++ b/include/linux/raid/md_p.h
@@ -277,7 +277,10 @@ struct mdp_superblock_1 {
 					   */
 #define	MD_FEATURE_RESHAPE_ACTIVE	4
 #define	MD_FEATURE_BAD_BLOCKS		8 /* badblock list is not empty */
-
-#define	MD_FEATURE_ALL			(1|2|4|8)
+#define	MD_FEATURE_REPLACEMENT		16 /* This device is replacing an
+					    * active device with same 'role'.
+					    * 'recovery_offset' is also set.
+					    */
+#define	MD_FEATURE_ALL			(1|2|4|8|16)
 
 #endif