Skip to content

Commit

Permalink
md/md-bitmap: move bitmap_{start, end}write to md upper layer
Browse files Browse the repository at this point in the history
There are two BUG reports that raid5 will hang at
bitmap_startwrite([1],[2]), root cause is that bitmap start write and end
write is unbalanced, it's not quite clear where, and while reviewing raid5
code, it's found that bitmap operations can be optimized. For example,
for a 4 disks raid5, with chunksize=8k, if user issue a IO (0 + 48k) to
the array:

┌────────────────────────────────────────────────────────────┐
│chunk 0                                                     │
│      ┌────────────┬─────────────┬─────────────┬────────────┼
│  sh0 │A0: 0 + 4k  │A1: 8k + 4k  │A2: 16k + 4k │A3: P       │
│      ┼────────────┼─────────────┼─────────────┼────────────┼
│  sh1 │B0: 4k + 4k │B1: 12k + 4k │B2: 20k + 4k │B3: P       │
┼──────┴────────────┴─────────────┴─────────────┴────────────┼
│chunk 1                                                     │
│      ┌────────────┬─────────────┬─────────────┬────────────┤
│  sh2 │C0: 24k + 4k│C1: 32k + 4k │C2: P        │C3: 40k + 4k│
│      ┼────────────┼─────────────┼─────────────┼────────────┼
│  sh3 │D0: 28k + 4k│D1: 36k + 4k │D2: P        │D3: 44k + 4k│
└──────┴────────────┴─────────────┴─────────────┴────────────┘

Before this patch, 4 stripe head will be used, and each sh will attach
bio for 3 disks, and each attached bio will trigger
bitmap_startwrite() once, which means total 12 times.
 - 3 times (0 + 4k), for (A0, A1 and A2)
 - 3 times (4 + 4k), for (B0, B1 and B2)
 - 3 times (8 + 4k), for (C0, C1 and C3)
 - 3 times (12 + 4k), for (D0, D1 and D3)

After this patch, md upper layer will calculate that IO range (0 + 48k)
is corresponding to the bitmap (0 + 16k), and call bitmap_startwrite()
just once.

Noted that this patch will align bitmap ranges to the chunks, for example,
if user issue a IO (0 + 4k) to array:

- Before this patch, 1 time (0 + 4k), for A0;
- After this patch, 1 time (0 + 8k) for chunk 0;

Usually, one bitmap bit will represent more than one disk chunk, and this
doesn't have any difference. And even if user really created a array
that one chunk contain multiple bits, the overhead is that more data
will be recovered after power failure.

Also remove STRIPE_BITMAP_PENDING since it's not used anymore.

[1] https://lore.kernel.org/all/CAJpMwyjmHQLvm6zg1cmQErttNNQPDAAXPKM3xgTjMhbfts986Q@mail.gmail.com/
[2] https://lore.kernel.org/all/ADF7D720-5764-4AF3-B68E-1845988737AA@flyingcircus.io/

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Link: https://lore.kernel.org/r/20250109015145.158868-6-yukuai1@huaweicloud.com
Signed-off-by: Song Liu <song@kernel.org>
  • Loading branch information
Yu Kuai authored and Song Liu committed Jan 13, 2025
1 parent 9c89f60 commit cd5fc65
Show file tree
Hide file tree
Showing 7 changed files with 37 additions and 56 deletions.
29 changes: 29 additions & 0 deletions drivers/md/md.c
Original file line number Diff line number Diff line change
Expand Up @@ -8745,12 +8745,32 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
}
EXPORT_SYMBOL_GPL(md_submit_discard_bio);

static void md_bitmap_start(struct mddev *mddev,
struct md_io_clone *md_io_clone)
{
if (mddev->pers->bitmap_sector)
mddev->pers->bitmap_sector(mddev, &md_io_clone->offset,
&md_io_clone->sectors);

mddev->bitmap_ops->startwrite(mddev, md_io_clone->offset,
md_io_clone->sectors);
}

static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone)
{
mddev->bitmap_ops->endwrite(mddev, md_io_clone->offset,
md_io_clone->sectors);
}

static void md_end_clone_io(struct bio *bio)
{
struct md_io_clone *md_io_clone = bio->bi_private;
struct bio *orig_bio = md_io_clone->orig_bio;
struct mddev *mddev = md_io_clone->mddev;

if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
md_bitmap_end(mddev, md_io_clone);

if (bio->bi_status && !orig_bio->bi_status)
orig_bio->bi_status = bio->bi_status;

Expand All @@ -8775,6 +8795,12 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio)
if (blk_queue_io_stat(bdev->bd_disk->queue))
md_io_clone->start_time = bio_start_io_acct(*bio);

if (bio_data_dir(*bio) == WRITE && mddev->bitmap) {
md_io_clone->offset = (*bio)->bi_iter.bi_sector;
md_io_clone->sectors = bio_sectors(*bio);
md_bitmap_start(mddev, md_io_clone);
}

clone->bi_end_io = md_end_clone_io;
clone->bi_private = md_io_clone;
*bio = clone;
Expand All @@ -8793,6 +8819,9 @@ void md_free_cloned_bio(struct bio *bio)
struct bio *orig_bio = md_io_clone->orig_bio;
struct mddev *mddev = md_io_clone->mddev;

if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
md_bitmap_end(mddev, md_io_clone);

if (bio->bi_status && !orig_bio->bi_status)
orig_bio->bi_status = bio->bi_status;

Expand Down
2 changes: 2 additions & 0 deletions drivers/md/md.h
Original file line number Diff line number Diff line change
Expand Up @@ -831,6 +831,8 @@ struct md_io_clone {
struct mddev *mddev;
struct bio *orig_bio;
unsigned long start_time;
sector_t offset;
unsigned long sectors;
struct bio bio_clone;
};

Expand Down
4 changes: 0 additions & 4 deletions drivers/md/raid1.c
Original file line number Diff line number Diff line change
Expand Up @@ -422,8 +422,6 @@ static void close_write(struct r1bio *r1_bio)

if (test_bit(R1BIO_BehindIO, &r1_bio->state))
mddev->bitmap_ops->end_behind_write(mddev);
/* clear the bitmap if all writes complete successfully */
mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors);
md_write_end(mddev);
}

Expand Down Expand Up @@ -1632,8 +1630,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,

if (test_bit(R1BIO_BehindIO, &r1_bio->state))
mddev->bitmap_ops->start_behind_write(mddev);
mddev->bitmap_ops->startwrite(mddev, r1_bio->sector,
r1_bio->sectors);
first_clone = 0;
}

Expand Down
3 changes: 0 additions & 3 deletions drivers/md/raid10.c
Original file line number Diff line number Diff line change
Expand Up @@ -428,8 +428,6 @@ static void close_write(struct r10bio *r10_bio)
{
struct mddev *mddev = r10_bio->mddev;

/* clear the bitmap if all writes complete successfully */
mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors);
md_write_end(mddev);
}

Expand Down Expand Up @@ -1506,7 +1504,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
md_account_bio(mddev, &bio);
r10_bio->master_bio = bio;
atomic_set(&r10_bio->remaining, 1);
mddev->bitmap_ops->startwrite(mddev, r10_bio->sector, r10_bio->sectors);

for (i = 0; i < conf->copies; i++) {
if (r10_bio->devs[i].bio)
Expand Down
2 changes: 0 additions & 2 deletions drivers/md/raid5-cache.c
Original file line number Diff line number Diff line change
Expand Up @@ -313,8 +313,6 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
if (sh->dev[i].written) {
set_bit(R5_UPTODATE, &sh->dev[i].flags);
r5c_return_dev_pending_writes(conf, &sh->dev[i]);
conf->mddev->bitmap_ops->endwrite(conf->mddev,
sh->sector, RAID5_STRIPE_SECTORS(conf));
}
}
}
Expand Down
50 changes: 6 additions & 44 deletions drivers/md/raid5.c
Original file line number Diff line number Diff line change
Expand Up @@ -906,8 +906,7 @@ static bool stripe_can_batch(struct stripe_head *sh)
if (raid5_has_log(conf) || raid5_has_ppl(conf))
return false;
return test_bit(STRIPE_BATCH_READY, &sh->state) &&
!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
is_full_stripe_write(sh);
is_full_stripe_write(sh);
}

/* we only do back search */
Expand Down Expand Up @@ -3545,29 +3544,9 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
(*bip)->bi_iter.bi_sector, sh->sector, dd_idx,
sh->dev[dd_idx].sector);

if (conf->mddev->bitmap && firstwrite) {
/* Cannot hold spinlock over bitmap_startwrite,
* but must ensure this isn't added to a batch until
* we have added to the bitmap and set bm_seq.
* So set STRIPE_BITMAP_PENDING to prevent
* batching.
* If multiple __add_stripe_bio() calls race here they
* much all set STRIPE_BITMAP_PENDING. So only the first one
* to complete "bitmap_startwrite" gets to set
* STRIPE_BIT_DELAY. This is important as once a stripe
* is added to a batch, STRIPE_BIT_DELAY cannot be changed
* any more.
*/
set_bit(STRIPE_BITMAP_PENDING, &sh->state);
spin_unlock_irq(&sh->stripe_lock);
conf->mddev->bitmap_ops->startwrite(conf->mddev, sh->sector,
RAID5_STRIPE_SECTORS(conf));
spin_lock_irq(&sh->stripe_lock);
clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
if (!sh->batch_head) {
sh->bm_seq = conf->seq_flush+1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
}
if (conf->mddev->bitmap && firstwrite && !sh->batch_head) {
sh->bm_seq = conf->seq_flush+1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
}
}

Expand Down Expand Up @@ -3618,7 +3597,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
BUG_ON(sh->batch_head);
for (i = disks; i--; ) {
struct bio *bi;
int bitmap_end = 0;

if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
struct md_rdev *rdev = conf->disks[i].rdev;
Expand All @@ -3643,8 +3621,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
sh->dev[i].towrite = NULL;
sh->overwrite_disks = 0;
spin_unlock_irq(&sh->stripe_lock);
if (bi)
bitmap_end = 1;

log_stripe_write_finished(sh);

Expand All @@ -3659,10 +3635,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
bio_io_error(bi);
bi = nextbi;
}
if (bitmap_end)
conf->mddev->bitmap_ops->endwrite(conf->mddev,
sh->sector, RAID5_STRIPE_SECTORS(conf));
bitmap_end = 0;
/* and fail all 'written' */
bi = sh->dev[i].written;
sh->dev[i].written = NULL;
Expand All @@ -3671,7 +3643,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
sh->dev[i].page = sh->dev[i].orig_page;
}

if (bi) bitmap_end = 1;
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
Expand Down Expand Up @@ -3705,9 +3676,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
bi = nextbi;
}
}
if (bitmap_end)
conf->mddev->bitmap_ops->endwrite(conf->mddev,
sh->sector, RAID5_STRIPE_SECTORS(conf));
/* If we were in the middle of a write the parity block might
* still be locked - so just clear all R5_LOCKED flags
*/
Expand Down Expand Up @@ -4056,8 +4024,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
bio_endio(wbi);
wbi = wbi2;
}
conf->mddev->bitmap_ops->endwrite(conf->mddev,
sh->sector, RAID5_STRIPE_SECTORS(conf));

if (head_sh->batch_head) {
sh = list_first_entry(&sh->batch_list,
struct stripe_head,
Expand Down Expand Up @@ -4882,8 +4849,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
(1 << STRIPE_COMPUTE_RUN) |
(1 << STRIPE_DISCARD) |
(1 << STRIPE_BATCH_READY) |
(1 << STRIPE_BATCH_ERR) |
(1 << STRIPE_BITMAP_PENDING)),
(1 << STRIPE_BATCH_ERR)),
"stripe state: %lx\n", sh->state);
WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
(1 << STRIPE_REPLACED)),
Expand Down Expand Up @@ -5774,10 +5740,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
}
spin_unlock_irq(&sh->stripe_lock);
if (conf->mddev->bitmap) {
for (d = 0; d < conf->raid_disks - conf->max_degraded;
d++)
mddev->bitmap_ops->startwrite(mddev, sh->sector,
RAID5_STRIPE_SECTORS(conf));
sh->bm_seq = conf->seq_flush + 1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
}
Expand Down
3 changes: 0 additions & 3 deletions drivers/md/raid5.h
Original file line number Diff line number Diff line change
Expand Up @@ -371,9 +371,6 @@ enum {
STRIPE_ON_RELEASE_LIST,
STRIPE_BATCH_READY,
STRIPE_BATCH_ERR,
STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add
* to batch yet.
*/
STRIPE_LOG_TRAPPED, /* trapped into log (see raid5-cache.c)
* this bit is used in two scenarios:
*
Expand Down

0 comments on commit cd5fc65

Please sign in to comment.