Skip to content

Commit

Permalink
Revert "MD: fix lock contention for flush bios"
Browse files Browse the repository at this point in the history
This reverts commit 5a409b4.

This patch has two problems.

1/ it make multiple calls to submit_bio() from inside a make_request_fn.
 The bios thus submitted will be queued on current->bio_list and not
 submitted immediately.  As the bios are allocated from a mempool,
 this can theoretically result in a deadlock - all the pool of requests
 could be in various ->bio_list queues and a subsequent mempool_alloc
 could block waiting for one of them to be released.

2/ It aims to handle a case when there are many concurrent flush requests.
  It handles this by submitting many requests in parallel - all of which
  are identical and so most of which do nothing useful.
  It would be more efficient to just send one lower-level request, but
  allow that to satisfy multiple upper-level requests.

Fixes: 5a409b4 ("MD: fix lock contention for flush bios")
Cc: <stable@vger.kernel.org> # v4.19+
Tested-by: Xiao Ni <xni@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
  • Loading branch information
NeilBrown authored and Jens Axboe committed Apr 1, 2019
1 parent 4f4fd7c commit 4bc034d
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 119 deletions.
159 changes: 55 additions & 104 deletions drivers/md/md.c
Original file line number Diff line number Diff line change
Expand Up @@ -132,24 +132,6 @@ static inline int speed_max(struct mddev *mddev)
mddev->sync_speed_max : sysctl_speed_limit_max;
}

static void * flush_info_alloc(gfp_t gfp_flags, void *data)
{
return kzalloc(sizeof(struct flush_info), gfp_flags);
}
static void flush_info_free(void *flush_info, void *data)
{
kfree(flush_info);
}

static void * flush_bio_alloc(gfp_t gfp_flags, void *data)
{
return kzalloc(sizeof(struct flush_bio), gfp_flags);
}
static void flush_bio_free(void *flush_bio, void *data)
{
kfree(flush_bio);
}

static struct ctl_table_header *raid_table_header;

static struct ctl_table raid_table[] = {
Expand Down Expand Up @@ -423,54 +405,30 @@ static int md_congested(void *data, int bits)
/*
* Generic flush handling for md
*/
static void submit_flushes(struct work_struct *ws)
{
struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
struct mddev *mddev = fi->mddev;
struct bio *bio = fi->bio;

bio->bi_opf &= ~REQ_PREFLUSH;
md_handle_request(mddev, bio);

mempool_free(fi, mddev->flush_pool);
}

static void md_end_flush(struct bio *fbio)
static void md_end_flush(struct bio *bio)
{
struct flush_bio *fb = fbio->bi_private;
struct md_rdev *rdev = fb->rdev;
struct flush_info *fi = fb->fi;
struct bio *bio = fi->bio;
struct mddev *mddev = fi->mddev;
struct md_rdev *rdev = bio->bi_private;
struct mddev *mddev = rdev->mddev;

rdev_dec_pending(rdev, mddev);

if (atomic_dec_and_test(&fi->flush_pending)) {
if (bio->bi_iter.bi_size == 0) {
/* an empty barrier - all done */
bio_endio(bio);
mempool_free(fi, mddev->flush_pool);
} else {
INIT_WORK(&fi->flush_work, submit_flushes);
queue_work(md_wq, &fi->flush_work);
}
if (atomic_dec_and_test(&mddev->flush_pending)) {
/* The pre-request flush has finished */
queue_work(md_wq, &mddev->flush_work);
}

mempool_free(fb, mddev->flush_bio_pool);
bio_put(fbio);
bio_put(bio);
}

void md_flush_request(struct mddev *mddev, struct bio *bio)
static void md_submit_flush_data(struct work_struct *ws);

static void submit_flushes(struct work_struct *ws)
{
struct mddev *mddev = container_of(ws, struct mddev, flush_work);
struct md_rdev *rdev;
struct flush_info *fi;

fi = mempool_alloc(mddev->flush_pool, GFP_NOIO);

fi->bio = bio;
fi->mddev = mddev;
atomic_set(&fi->flush_pending, 1);

INIT_WORK(&mddev->flush_work, md_submit_flush_data);
atomic_set(&mddev->flush_pending, 1);
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
Expand All @@ -480,40 +438,59 @@ void md_flush_request(struct mddev *mddev, struct bio *bio)
* we reclaim rcu_read_lock
*/
struct bio *bi;
struct flush_bio *fb;
atomic_inc(&rdev->nr_pending);
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();

fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO);
fb->fi = fi;
fb->rdev = rdev;

bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
bio_set_dev(bi, rdev->bdev);
bi->bi_end_io = md_end_flush;
bi->bi_private = fb;
bi->bi_private = rdev;
bio_set_dev(bi, rdev->bdev);
bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;

atomic_inc(&fi->flush_pending);
atomic_inc(&mddev->flush_pending);
submit_bio(bi);

rcu_read_lock();
rdev_dec_pending(rdev, mddev);
}
rcu_read_unlock();
if (atomic_dec_and_test(&mddev->flush_pending))
queue_work(md_wq, &mddev->flush_work);
}

if (atomic_dec_and_test(&fi->flush_pending)) {
if (bio->bi_iter.bi_size == 0) {
/* an empty barrier - all done */
bio_endio(bio);
mempool_free(fi, mddev->flush_pool);
} else {
INIT_WORK(&fi->flush_work, submit_flushes);
queue_work(md_wq, &fi->flush_work);
}
static void md_submit_flush_data(struct work_struct *ws)
{
struct mddev *mddev = container_of(ws, struct mddev, flush_work);
struct bio *bio = mddev->flush_bio;

/*
* must reset flush_bio before calling into md_handle_request to avoid a
* deadlock, because other bios passed md_handle_request suspend check
* could wait for this and below md_handle_request could wait for those
* bios because of suspend check
*/
mddev->flush_bio = NULL;
wake_up(&mddev->sb_wait);

if (bio->bi_iter.bi_size == 0) {
/* an empty barrier - all done */
bio_endio(bio);
} else {
bio->bi_opf &= ~REQ_PREFLUSH;
md_handle_request(mddev, bio);
}
}

void md_flush_request(struct mddev *mddev, struct bio *bio)
{
spin_lock_irq(&mddev->lock);
wait_event_lock_irq(mddev->sb_wait,
!mddev->flush_bio,
mddev->lock);
mddev->flush_bio = bio;
spin_unlock_irq(&mddev->lock);

INIT_WORK(&mddev->flush_work, submit_flushes);
queue_work(md_wq, &mddev->flush_work);
}
EXPORT_SYMBOL(md_flush_request);

static inline struct mddev *mddev_get(struct mddev *mddev)
Expand Down Expand Up @@ -560,6 +537,7 @@ void mddev_init(struct mddev *mddev)
atomic_set(&mddev->openers, 0);
atomic_set(&mddev->active_io, 0);
spin_lock_init(&mddev->lock);
atomic_set(&mddev->flush_pending, 0);
init_waitqueue_head(&mddev->sb_wait);
init_waitqueue_head(&mddev->recovery_wait);
mddev->reshape_position = MaxSector;
Expand Down Expand Up @@ -5511,22 +5489,6 @@ int md_run(struct mddev *mddev)
if (err)
return err;
}
if (mddev->flush_pool == NULL) {
mddev->flush_pool = mempool_create(NR_FLUSH_INFOS, flush_info_alloc,
flush_info_free, mddev);
if (!mddev->flush_pool) {
err = -ENOMEM;
goto abort;
}
}
if (mddev->flush_bio_pool == NULL) {
mddev->flush_bio_pool = mempool_create(NR_FLUSH_BIOS, flush_bio_alloc,
flush_bio_free, mddev);
if (!mddev->flush_bio_pool) {
err = -ENOMEM;
goto abort;
}
}

spin_lock(&pers_lock);
pers = find_pers(mddev->level, mddev->clevel);
Expand Down Expand Up @@ -5686,11 +5648,8 @@ int md_run(struct mddev *mddev)
return 0;

abort:
mempool_destroy(mddev->flush_bio_pool);
mddev->flush_bio_pool = NULL;
mempool_destroy(mddev->flush_pool);
mddev->flush_pool = NULL;

bioset_exit(&mddev->bio_set);
bioset_exit(&mddev->sync_set);
return err;
}
EXPORT_SYMBOL_GPL(md_run);
Expand Down Expand Up @@ -5894,14 +5853,6 @@ static void __md_stop(struct mddev *mddev)
mddev->to_remove = &md_redundancy_group;
module_put(pers->owner);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (mddev->flush_bio_pool) {
mempool_destroy(mddev->flush_bio_pool);
mddev->flush_bio_pool = NULL;
}
if (mddev->flush_pool) {
mempool_destroy(mddev->flush_pool);
mddev->flush_pool = NULL;
}
}

void md_stop(struct mddev *mddev)
Expand Down
22 changes: 7 additions & 15 deletions drivers/md/md.h
Original file line number Diff line number Diff line change
Expand Up @@ -252,19 +252,6 @@ enum mddev_sb_flags {
MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
};

#define NR_FLUSH_INFOS 8
#define NR_FLUSH_BIOS 64
struct flush_info {
struct bio *bio;
struct mddev *mddev;
struct work_struct flush_work;
atomic_t flush_pending;
};
struct flush_bio {
struct flush_info *fi;
struct md_rdev *rdev;
};

struct mddev {
void *private;
struct md_personality *pers;
Expand Down Expand Up @@ -470,8 +457,13 @@ struct mddev {
* metadata and bitmap writes
*/

mempool_t *flush_pool;
mempool_t *flush_bio_pool;
/* Generic flush handling.
* The last to finish preflush schedules a worker to submit
* the rest of the request (without the REQ_PREFLUSH flag).
*/
struct bio *flush_bio;
atomic_t flush_pending;
struct work_struct flush_work;
struct work_struct event_work; /* used by dm to report failure event */
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
struct md_cluster_info *cluster_info;
Expand Down

0 comments on commit 4bc034d

Please sign in to comment.