Skip to content

Commit

Permalink
dm-raid456, md/raid456: fix a deadlock for dm-raid456 while io concur…
Browse files Browse the repository at this point in the history
…rent with reshape

For raid456, if reshape is still in progress, then IO across reshape
position will wait for reshape to make progress. However, for dm-raid,
in following cases reshape will never make progress hence IO will hang:

1) the array is read-only;
2) MD_RECOVERY_WAIT is set;
3) MD_RECOVERY_FROZEN is set;

After commit c467e97 ("md/raid6: use valid sector values to determine
if an I/O should wait on the reshape") fix the problem that IO across
reshape position doesn't wait for reshape, the dm-raid test
shell/lvconvert-raid-reshape.sh start to hang:

[root@fedora ~]# cat /proc/979/stack
[<0>] wait_woken+0x7d/0x90
[<0>] raid5_make_request+0x929/0x1d70 [raid456]
[<0>] md_handle_request+0xc2/0x3b0 [md_mod]
[<0>] raid_map+0x2c/0x50 [dm_raid]
[<0>] __map_bio+0x251/0x380 [dm_mod]
[<0>] dm_submit_bio+0x1f0/0x760 [dm_mod]
[<0>] __submit_bio+0xc2/0x1c0
[<0>] submit_bio_noacct_nocheck+0x17f/0x450
[<0>] submit_bio_noacct+0x2bc/0x780
[<0>] submit_bio+0x70/0xc0
[<0>] mpage_readahead+0x169/0x1f0
[<0>] blkdev_readahead+0x18/0x30
[<0>] read_pages+0x7c/0x3b0
[<0>] page_cache_ra_unbounded+0x1ab/0x280
[<0>] force_page_cache_ra+0x9e/0x130
[<0>] page_cache_sync_ra+0x3b/0x110
[<0>] filemap_get_pages+0x143/0xa30
[<0>] filemap_read+0xdc/0x4b0
[<0>] blkdev_read_iter+0x75/0x200
[<0>] vfs_read+0x272/0x460
[<0>] ksys_read+0x7a/0x170
[<0>] __x64_sys_read+0x1c/0x30
[<0>] do_syscall_64+0xc6/0x230
[<0>] entry_SYSCALL_64_after_hwframe+0x6c/0x74

This is because reshape can't make progress.

For md/raid, the problem doesn't exist because register new sync_thread
doesn't rely on the IO to be done any more:

1) If array is read-only, it can switch to read-write by ioctl/sysfs;
2) md/raid never set MD_RECOVERY_WAIT;
3) If MD_RECOVERY_FROZEN is set, mddev_suspend() doesn't hold
   'reconfig_mutex', hence it can be cleared and reshape can continue by
   sysfs api 'sync_action'.

However, I'm not sure yet how to avoid the problem in dm-raid yet. This
patch on the one hand make sure raid_message() can't change
sync_thread() through raid_message() after presuspend(), on the other
hand detect the above 3 cases before wait for IO do be done in
dm_suspend(), and let dm-raid requeue those IO.

Cc: stable@vger.kernel.org # v6.7+
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Xiao Ni <xni@redhat.com>
Acked-by: Mike Snitzer <snitzer@kernel.org>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20240305072306.2562024-9-yukuai1@huaweicloud.com
  • Loading branch information
Yu Kuai authored and Song Liu committed Mar 5, 2024
1 parent 5625ff8 commit 41425f9
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 7 deletions.
22 changes: 20 additions & 2 deletions drivers/md/dm-raid.c
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ struct raid_dev {
#define RT_FLAG_RS_IN_SYNC 6
#define RT_FLAG_RS_RESYNCING 7
#define RT_FLAG_RS_GROW 8
#define RT_FLAG_RS_FROZEN 9

/* Array elements of 64 bit needed for rebuild/failed disk bits */
#define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
Expand Down Expand Up @@ -3340,7 +3341,8 @@ static int raid_map(struct dm_target *ti, struct bio *bio)
if (unlikely(bio_end_sector(bio) > mddev->array_sectors))
return DM_MAPIO_REQUEUE;

md_handle_request(mddev, bio);
if (unlikely(!md_handle_request(mddev, bio)))
return DM_MAPIO_REQUEUE;

return DM_MAPIO_SUBMITTED;
}
Expand Down Expand Up @@ -3724,7 +3726,8 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv,
if (!mddev->pers || !mddev->pers->sync_request)
return -EINVAL;

if (test_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags))
if (test_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags) ||
test_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags))
return -EBUSY;

if (!strcasecmp(argv[0], "frozen")) {
Expand Down Expand Up @@ -3808,6 +3811,12 @@ static void raid_presuspend(struct dm_target *ti)
struct raid_set *rs = ti->private;
struct mddev *mddev = &rs->md;

/*
* From now on, disallow raid_message() to change sync_thread until
* resume, raid_postsuspend() is too late.
*/
set_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags);

if (!reshape_interrupted(mddev))
return;

Expand All @@ -3820,6 +3829,13 @@ static void raid_presuspend(struct dm_target *ti)
mddev->pers->prepare_suspend(mddev);
}

static void raid_presuspend_undo(struct dm_target *ti)
{
struct raid_set *rs = ti->private;

clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags);
}

static void raid_postsuspend(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
Expand Down Expand Up @@ -4085,6 +4101,7 @@ static void raid_resume(struct dm_target *ti)

WARN_ON_ONCE(!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery));
WARN_ON_ONCE(test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));
clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags);
mddev_lock_nointr(mddev);
mddev->ro = 0;
mddev->in_sync = 0;
Expand All @@ -4105,6 +4122,7 @@ static struct target_type raid_target = {
.iterate_devices = raid_iterate_devices,
.io_hints = raid_io_hints,
.presuspend = raid_presuspend,
.presuspend_undo = raid_presuspend_undo,
.postsuspend = raid_postsuspend,
.preresume = raid_preresume,
.resume = raid_resume,
Expand Down
24 changes: 22 additions & 2 deletions drivers/md/md.c
Original file line number Diff line number Diff line change
Expand Up @@ -366,15 +366,15 @@ static bool is_suspended(struct mddev *mddev, struct bio *bio)
return true;
}

void md_handle_request(struct mddev *mddev, struct bio *bio)
bool md_handle_request(struct mddev *mddev, struct bio *bio)
{
check_suspended:
if (is_suspended(mddev, bio)) {
DEFINE_WAIT(__wait);
/* Bail out if REQ_NOWAIT is set for the bio */
if (bio->bi_opf & REQ_NOWAIT) {
bio_wouldblock_error(bio);
return;
return true;
}
for (;;) {
prepare_to_wait(&mddev->sb_wait, &__wait,
Expand All @@ -390,10 +390,13 @@ void md_handle_request(struct mddev *mddev, struct bio *bio)

if (!mddev->pers->make_request(mddev, bio)) {
percpu_ref_put(&mddev->active_io);
if (!mddev->gendisk && mddev->pers->prepare_suspend)
return false;
goto check_suspended;
}

percpu_ref_put(&mddev->active_io);
return true;
}
EXPORT_SYMBOL(md_handle_request);

Expand Down Expand Up @@ -8733,6 +8736,23 @@ void md_account_bio(struct mddev *mddev, struct bio **bio)
}
EXPORT_SYMBOL_GPL(md_account_bio);

void md_free_cloned_bio(struct bio *bio)
{
struct md_io_clone *md_io_clone = bio->bi_private;
struct bio *orig_bio = md_io_clone->orig_bio;
struct mddev *mddev = md_io_clone->mddev;

if (bio->bi_status && !orig_bio->bi_status)
orig_bio->bi_status = bio->bi_status;

if (md_io_clone->start_time)
bio_end_io_acct(orig_bio, md_io_clone->start_time);

bio_put(bio);
percpu_ref_put(&mddev->active_io);
}
EXPORT_SYMBOL_GPL(md_free_cloned_bio);

/* md_allow_write(mddev)
* Calling this ensures that the array is marked 'active' so that writes
* may proceed without blocking. It is important to call this before
Expand Down
3 changes: 2 additions & 1 deletion drivers/md/md.h
Original file line number Diff line number Diff line change
Expand Up @@ -793,6 +793,7 @@ extern void md_finish_reshape(struct mddev *mddev);
void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
struct bio *bio, sector_t start, sector_t size);
void md_account_bio(struct mddev *mddev, struct bio **bio);
void md_free_cloned_bio(struct bio *bio);

extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
Expand Down Expand Up @@ -821,7 +822,7 @@ extern void md_stop_writes(struct mddev *mddev);
extern int md_rdev_init(struct md_rdev *rdev);
extern void md_rdev_clear(struct md_rdev *rdev);

extern void md_handle_request(struct mddev *mddev, struct bio *bio);
extern bool md_handle_request(struct mddev *mddev, struct bio *bio);
extern int mddev_suspend(struct mddev *mddev, bool interruptible);
extern void mddev_resume(struct mddev *mddev);
extern void md_idle_sync_thread(struct mddev *mddev);
Expand Down
32 changes: 30 additions & 2 deletions drivers/md/raid5.c
Original file line number Diff line number Diff line change
Expand Up @@ -760,6 +760,7 @@ enum stripe_result {
STRIPE_RETRY,
STRIPE_SCHEDULE_AND_RETRY,
STRIPE_FAIL,
STRIPE_WAIT_RESHAPE,
};

struct stripe_request_ctx {
Expand Down Expand Up @@ -5937,7 +5938,8 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
if (ahead_of_reshape(mddev, logical_sector,
conf->reshape_safe)) {
spin_unlock_irq(&conf->device_lock);
return STRIPE_SCHEDULE_AND_RETRY;
ret = STRIPE_SCHEDULE_AND_RETRY;
goto out;
}
}
spin_unlock_irq(&conf->device_lock);
Expand Down Expand Up @@ -6016,6 +6018,12 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,

out_release:
raid5_release_stripe(sh);
out:
if (ret == STRIPE_SCHEDULE_AND_RETRY && reshape_interrupted(mddev)) {
bi->bi_status = BLK_STS_RESOURCE;
ret = STRIPE_WAIT_RESHAPE;
pr_err_ratelimited("dm-raid456: io across reshape position while reshape can't make progress");
}
return ret;
}

Expand Down Expand Up @@ -6137,7 +6145,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
while (1) {
res = make_stripe_request(mddev, conf, &ctx, logical_sector,
bi);
if (res == STRIPE_FAIL)
if (res == STRIPE_FAIL || res == STRIPE_WAIT_RESHAPE)
break;

if (res == STRIPE_RETRY)
Expand Down Expand Up @@ -6175,6 +6183,11 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)

if (rw == WRITE)
md_write_end(mddev);
if (res == STRIPE_WAIT_RESHAPE) {
md_free_cloned_bio(bi);
return false;
}

bio_endio(bi);
return true;
}
Expand Down Expand Up @@ -8925,6 +8938,18 @@ static int raid5_start(struct mddev *mddev)
return r5l_start(conf->log);
}

/*
* This is only used for dm-raid456, caller already frozen sync_thread, hence
* if rehsape is still in progress, io that is waiting for reshape can never be
* done now, hence wake up and handle those IO.
*/
static void raid5_prepare_suspend(struct mddev *mddev)
{
struct r5conf *conf = mddev->private;

wake_up(&conf->wait_for_overlap);
}

static struct md_personality raid6_personality =
{
.name = "raid6",
Expand All @@ -8948,6 +8973,7 @@ static struct md_personality raid6_personality =
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
.prepare_suspend = raid5_prepare_suspend,
};
static struct md_personality raid5_personality =
{
Expand All @@ -8972,6 +8998,7 @@ static struct md_personality raid5_personality =
.quiesce = raid5_quiesce,
.takeover = raid5_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
.prepare_suspend = raid5_prepare_suspend,
};

static struct md_personality raid4_personality =
Expand All @@ -8997,6 +9024,7 @@ static struct md_personality raid4_personality =
.quiesce = raid5_quiesce,
.takeover = raid4_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
.prepare_suspend = raid5_prepare_suspend,
};

static int __init raid5_init(void)
Expand Down

0 comments on commit 41425f9

Please sign in to comment.