From 326e1dbb57368087a36607aaebe9795b8d5453e5 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 22 May 2015 09:14:03 -0400
Subject: [PATCH 1/6] block: remove management of bi_remaining when restoring
 original bi_end_io

Commit c4cf5261 ("bio: skip atomic inc/dec of ->bi_remaining for
non-chains") regressed all existing callers that followed this pattern:
 1) saving a bio's original bi_end_io
 2) wiring up an intermediate bi_end_io
 3) restoring the original bi_end_io from intermediate bi_end_io
 4) calling bio_endio() to execute the restored original bi_end_io

The regression was due to BIO_CHAIN only ever getting set if
bio_inc_remaining() is called.  For the above pattern it isn't set until
step 3 above (step 2 would've needed to establish BIO_CHAIN).  As such
the first bio_endio(), in step 2 above, never decremented __bi_remaining
before calling the intermediate bi_end_io -- leaving __bi_remaining with
the value 1 instead of 0.  When bio_inc_remaining() occurred during step
3 it brought it to a value of 2.  When the second bio_endio() was
called, in step 4 above, it should've called the original bi_end_io but
it didn't because there was an extra reference that wasn't dropped (due
to atomic operations being optimized away since BIO_CHAIN wasn't set
upfront).

Fix this issue by removing the __bi_remaining management complexity for
all callers that use the above pattern -- bio_chain() is the only
interface that _needs_ to be concerned with __bi_remaining.  For the
above pattern callers just expect the bi_end_io they set to get called!
Remove bio_endio_nodec() and also remove all bio_inc_remaining() calls
that aren't associated with the bio_chain() interface.

Also, the bio_inc_remaining() interface has been moved local to bio.c.

Fixes: c4cf5261 ("bio: skip atomic inc/dec of ->bi_remaining for non-chains")
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio-integrity.c        |  4 ++--
 block/bio.c                  | 35 ++++++++++++++---------------------
 drivers/md/bcache/io.c       |  2 +-
 drivers/md/dm-cache-target.c |  6 ------
 drivers/md/dm-raid1.c        |  2 --
 drivers/md/dm-snap.c         |  1 -
 drivers/md/dm-thin.c         |  9 +++------
 drivers/md/dm-verity.c       |  2 +-
 fs/btrfs/disk-io.c           |  2 +-
 fs/btrfs/volumes.c           | 16 +++++-----------
 fs/btrfs/volumes.h           |  2 --
 include/linux/bio.h          | 12 ------------
 12 files changed, 27 insertions(+), 66 deletions(-)

diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 5cbd5d9ea61dd..0436c21db7f23 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -361,7 +361,7 @@ static void bio_integrity_verify_fn(struct work_struct *work)
 
 	/* Restore original bio completion handler */
 	bio->bi_end_io = bip->bip_end_io;
-	bio_endio_nodec(bio, error);
+	bio_endio(bio, error);
 }
 
 /**
@@ -388,7 +388,7 @@ void bio_integrity_endio(struct bio *bio, int error)
 	 */
 	if (error) {
 		bio->bi_end_io = bip->bip_end_io;
-		bio_endio_nodec(bio, error);
+		bio_endio(bio, error);
 
 		return;
 	}
diff --git a/block/bio.c b/block/bio.c
index c2ff8a88aef15..259197d97de1a 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -303,6 +303,17 @@ static void bio_chain_endio(struct bio *bio, int error)
 	bio_put(bio);
 }
 
+/*
+ * Increment chain count for the bio. Make sure the CHAIN flag update
+ * is visible before the raised count.
+ */
+static inline void bio_inc_remaining(struct bio *bio)
+{
+	bio->bi_flags |= (1 << BIO_CHAIN);
+	smp_mb__before_atomic();
+	atomic_inc(&bio->__bi_remaining);
+}
+
 /**
  * bio_chain - chain bio completions
  * @bio: the target bio
@@ -1756,8 +1767,10 @@ static inline bool bio_remaining_done(struct bio *bio)
 
 	BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);
 
-	if (atomic_dec_and_test(&bio->__bi_remaining))
+	if (atomic_dec_and_test(&bio->__bi_remaining)) {
+		clear_bit(BIO_CHAIN, &bio->bi_flags);
 		return true;
+	}
 
 	return false;
 }
@@ -1808,26 +1821,6 @@ void bio_endio(struct bio *bio, int error)
 }
 EXPORT_SYMBOL(bio_endio);
 
-/**
- * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining
- * @bio:	bio
- * @error:	error, if any
- *
- * For code that has saved and restored bi_end_io; thing hard before using this
- * function, probably you should've cloned the entire bio.
- **/
-void bio_endio_nodec(struct bio *bio, int error)
-{
-	/*
-	 * If it's not flagged as a chain, we are not going to dec the count
-	 */
-	if (bio_flagged(bio, BIO_CHAIN))
-		bio_inc_remaining(bio);
-
-	bio_endio(bio, error);
-}
-EXPORT_SYMBOL(bio_endio_nodec);
-
 /**
  * bio_split - split a bio
  * @bio:	bio to split
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index fa028fa82df41..cb64e64a47895 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -55,7 +55,7 @@ static void bch_bio_submit_split_done(struct closure *cl)
 
 	s->bio->bi_end_io = s->bi_end_io;
 	s->bio->bi_private = s->bi_private;
-	bio_endio_nodec(s->bio, 0);
+	bio_endio(s->bio, 0);
 
 	closure_debug_destroy(&s->cl);
 	mempool_free(s, s->p->bio_split_hook);
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 705eb7b99d691..41b2594a80c63 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -86,12 +86,6 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
 {
 	bio->bi_end_io = h->bi_end_io;
 	bio->bi_private = h->bi_private;
-
-	/*
-	 * Must bump bi_remaining to allow bio to complete with
-	 * restored bi_end_io.
-	 */
-	bio_inc_remaining(bio);
 }
 
 /*----------------------------------------------------------------*/
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index d6a1c096b7771..743fa9bbae9ea 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1254,8 +1254,6 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
 			dm_bio_restore(bd, bio);
 			bio_record->details.bi_bdev = NULL;
 
-			bio_inc_remaining(bio);
-
 			queue_bio(ms, bio, rw);
 			return DM_ENDIO_INCOMPLETE;
 		}
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 8bfeae2185316..7c82d3ccce871 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1478,7 +1478,6 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
 	if (full_bio) {
 		full_bio->bi_end_io = pe->full_bio_end_io;
 		full_bio->bi_private = pe->full_bio_private;
-		bio_inc_remaining(full_bio);
 	}
 	increment_pending_exceptions_done_count();
 
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 342dbdad61318..e852602c00919 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -793,10 +793,9 @@ static void inc_remap_and_issue_cell(struct thin_c *tc,
 
 static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
 {
-	if (m->bio) {
+	if (m->bio)
 		m->bio->bi_end_io = m->saved_bi_end_io;
-		bio_inc_remaining(m->bio);
-	}
+
 	cell_error(m->tc->pool, m->cell);
 	list_del(&m->list);
 	mempool_free(m, m->tc->pool->mapping_pool);
@@ -810,10 +809,8 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 	int r;
 
 	bio = m->bio;
-	if (bio) {
+	if (bio)
 		bio->bi_end_io = m->saved_bi_end_io;
-		bio_inc_remaining(bio);
-	}
 
 	if (m->err) {
 		cell_error(pool, m->cell);
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index 66616db33e6fd..bb9c6a00e4b05 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -459,7 +459,7 @@ static void verity_finish_io(struct dm_verity_io *io, int error)
 	bio->bi_end_io = io->orig_bi_end_io;
 	bio->bi_private = io->orig_bi_private;
 
-	bio_endio_nodec(bio, error);
+	bio_endio(bio, error);
 }
 
 static void verity_work(struct work_struct *w)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e08a926fe12c0..0bccf18dc1dca 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1745,7 +1745,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
 	bio->bi_private = end_io_wq->private;
 	bio->bi_end_io = end_io_wq->end_io;
 	kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
-	bio_endio_nodec(bio, error);
+	bio_endio(bio, error);
 }
 
 static int cleaner_kthread(void *arg)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8e8d1d1e28a52..dac77d42a9ab4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5585,10 +5585,10 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 
 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err)
 {
-	if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED))
-		bio_endio_nodec(bio, err);
-	else
-		bio_endio(bio, err);
+	bio->bi_private = bbio->private;
+	bio->bi_end_io = bbio->end_io;
+	bio_endio(bio, err);
+
 	btrfs_put_bbio(bbio);
 }
 
@@ -5632,8 +5632,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
 			bio = bbio->orig_bio;
 		}
 
-		bio->bi_private = bbio->private;
-		bio->bi_end_io = bbio->end_io;
 		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
 		/* only send an error to the higher layers if it is
 		 * beyond the tolerance of the btrfs bio
@@ -5815,8 +5813,6 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
 		/* Shoud be the original bio. */
 		WARN_ON(bio != bbio->orig_bio);
 
-		bio->bi_private = bbio->private;
-		bio->bi_end_io = bbio->end_io;
 		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
 		bio->bi_iter.bi_sector = logical >> 9;
 
@@ -5897,10 +5893,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		if (dev_nr < total_devs - 1) {
 			bio = btrfs_bio_clone(first_bio, GFP_NOFS);
 			BUG_ON(!bio); /* -ENOMEM */
-		} else {
+		} else
 			bio = first_bio;
-			bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED;
-		}
 
 		submit_stripe_bio(root, bbio, bio,
 				  bbio->stripes[dev_nr].physical, dev_nr, rw,
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ebc31331a8374..cedae0356558d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -292,8 +292,6 @@ struct btrfs_bio_stripe {
 struct btrfs_bio;
 typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
 
-#define BTRFS_BIO_ORIG_BIO_SUBMITTED	(1 << 0)
-
 struct btrfs_bio {
 	atomic_t refs;
 	atomic_t stripes_pending;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7486ea103f6e3..f0291cf64cc5f 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -427,7 +427,6 @@ static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask)
 }
 
 extern void bio_endio(struct bio *, int);
-extern void bio_endio_nodec(struct bio *, int);
 struct request_queue;
 extern int bio_phys_segments(struct request_queue *, struct bio *);
 
@@ -658,17 +657,6 @@ static inline struct bio *bio_list_get(struct bio_list *bl)
 	return bio;
 }
 
-/*
- * Increment chain count for the bio. Make sure the CHAIN flag update
- * is visible before the raised count.
- */
-static inline void bio_inc_remaining(struct bio *bio)
-{
-	bio->bi_flags |= (1 << BIO_CHAIN);
-	smp_mb__before_atomic();
-	atomic_inc(&bio->__bi_remaining);
-}
-
 /*
  * bio_set is used to allow other portions of the IO system to
  * allocate their own private memory pools for bio and iovec structures.

From 5f1b670d0bef508a5554d92525f5f6d00d640b38 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 22 May 2015 09:14:04 -0400
Subject: [PATCH 2/6] block, dm: don't copy bios for request clones

Currently dm-multipath has to clone the bios for every request sent
to the lower devices, which wastes cpu cycles and ties down memory.

This patch instead adds a new REQ_CLONE flag that instructs req_bio_endio
to not complete bios attached to a request, which we set on clone
requests similar to bios in a flush sequence.  With this change I/O
errors on a path failure only get propagated to dm-multipath, which
can then either resubmit the I/O or complete the bios on the original
request.

I've done some basic testing of this on a Linux target with ALUA support,
and it survives path failures during I/O nicely.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c          |  94 +++------------------
 drivers/md/dm-table.c     |  25 ++++--
 drivers/md/dm.c           | 171 +++++++++-----------------------------
 drivers/md/dm.h           |   5 +-
 include/linux/blk_types.h |   2 +
 include/linux/blkdev.h    |   6 +-
 6 files changed, 73 insertions(+), 230 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index de474b5dee2b9..aa819a58ea24a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -117,7 +117,7 @@ EXPORT_SYMBOL(blk_rq_init);
 static void req_bio_endio(struct request *rq, struct bio *bio,
 			  unsigned int nbytes, int error)
 {
-	if (error)
+	if (error && !(rq->cmd_flags & REQ_CLONE))
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
 	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 		error = -EIO;
@@ -128,7 +128,8 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 	bio_advance(bio, nbytes);
 
 	/* don't actually finish bio if it's part of flush sequence */
-	if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
+	if (bio->bi_iter.bi_size == 0 &&
+	    !(rq->cmd_flags & (REQ_FLUSH_SEQ|REQ_CLONE)))
 		bio_endio(bio, error);
 }
 
@@ -2909,95 +2910,22 @@ int blk_lld_busy(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_lld_busy);
 
-/**
- * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
- * @rq: the clone request to be cleaned up
- *
- * Description:
- *     Free all bios in @rq for a cloned request.
- */
-void blk_rq_unprep_clone(struct request *rq)
-{
-	struct bio *bio;
-
-	while ((bio = rq->bio) != NULL) {
-		rq->bio = bio->bi_next;
-
-		bio_put(bio);
-	}
-}
-EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
-
-/*
- * Copy attributes of the original request to the clone request.
- * The actual data parts (e.g. ->cmd, ->sense) are not copied.
- */
-static void __blk_rq_prep_clone(struct request *dst, struct request *src)
+void blk_rq_prep_clone(struct request *dst, struct request *src)
 {
 	dst->cpu = src->cpu;
-	dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE;
+	dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK);
+	dst->cmd_flags |= REQ_NOMERGE | REQ_CLONE;
 	dst->cmd_type = src->cmd_type;
 	dst->__sector = blk_rq_pos(src);
 	dst->__data_len = blk_rq_bytes(src);
 	dst->nr_phys_segments = src->nr_phys_segments;
 	dst->ioprio = src->ioprio;
 	dst->extra_len = src->extra_len;
-}
-
-/**
- * blk_rq_prep_clone - Helper function to setup clone request
- * @rq: the request to be setup
- * @rq_src: original request to be cloned
- * @bs: bio_set that bios for clone are allocated from
- * @gfp_mask: memory allocation mask for bio
- * @bio_ctr: setup function to be called for each clone bio.
- *           Returns %0 for success, non %0 for failure.
- * @data: private data to be passed to @bio_ctr
- *
- * Description:
- *     Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
- *     The actual data parts of @rq_src (e.g. ->cmd, ->sense)
- *     are not copied, and copying such parts is the caller's responsibility.
- *     Also, pages which the original bios are pointing to are not copied
- *     and the cloned bios just point same pages.
- *     So cloned bios must be completed before original bios, which means
- *     the caller must complete @rq before @rq_src.
- */
-int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
-		      struct bio_set *bs, gfp_t gfp_mask,
-		      int (*bio_ctr)(struct bio *, struct bio *, void *),
-		      void *data)
-{
-	struct bio *bio, *bio_src;
-
-	if (!bs)
-		bs = fs_bio_set;
-
-	__rq_for_each_bio(bio_src, rq_src) {
-		bio = bio_clone_fast(bio_src, gfp_mask, bs);
-		if (!bio)
-			goto free_and_out;
-
-		if (bio_ctr && bio_ctr(bio, bio_src, data))
-			goto free_and_out;
-
-		if (rq->bio) {
-			rq->biotail->bi_next = bio;
-			rq->biotail = bio;
-		} else
-			rq->bio = rq->biotail = bio;
-	}
-
-	__blk_rq_prep_clone(rq, rq_src);
-
-	return 0;
-
-free_and_out:
-	if (bio)
-		bio_put(bio);
-	blk_rq_unprep_clone(rq);
-
-	return -ENOMEM;
+	dst->bio = src->bio;
+	dst->biotail = src->biotail;
+	dst->cmd = src->cmd;
+	dst->cmd_len = src->cmd_len;
+	dst->sense = src->sense;
 }
 EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
 
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index d9b00b8565c6d..3662b2e49b8dd 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -940,21 +940,28 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
 {
 	unsigned type = dm_table_get_type(t);
 	unsigned per_bio_data_size = 0;
-	struct dm_target *tgt;
 	unsigned i;
 
-	if (unlikely(type == DM_TYPE_NONE)) {
+	switch (type) {
+	case DM_TYPE_BIO_BASED:
+		for (i = 0; i < t->num_targets; i++) {
+			struct dm_target *tgt = t->targets + i;
+
+			per_bio_data_size = max(per_bio_data_size,
+						tgt->per_bio_data_size);
+		}
+		t->mempools = dm_alloc_bio_mempools(t->integrity_supported,
+						    per_bio_data_size);
+		break;
+	case DM_TYPE_REQUEST_BASED:
+	case DM_TYPE_MQ_REQUEST_BASED:
+		t->mempools = dm_alloc_rq_mempools(md, type);
+		break;
+	default:
 		DMWARN("no table type is set, can't allocate mempools");
 		return -EINVAL;
 	}
 
-	if (type == DM_TYPE_BIO_BASED)
-		for (i = 0; i < t->num_targets; i++) {
-			tgt = t->targets + i;
-			per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size);
-		}
-
-	t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_bio_data_size);
 	if (!t->mempools)
 		return -ENOMEM;
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index a930b72314ac9..38837f8ea3279 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -990,57 +990,6 @@ static void clone_endio(struct bio *bio, int error)
 	dec_pending(io, error);
 }
 
-/*
- * Partial completion handling for request-based dm
- */
-static void end_clone_bio(struct bio *clone, int error)
-{
-	struct dm_rq_clone_bio_info *info =
-		container_of(clone, struct dm_rq_clone_bio_info, clone);
-	struct dm_rq_target_io *tio = info->tio;
-	struct bio *bio = info->orig;
-	unsigned int nr_bytes = info->orig->bi_iter.bi_size;
-
-	bio_put(clone);
-
-	if (tio->error)
-		/*
-		 * An error has already been detected on the request.
-		 * Once error occurred, just let clone->end_io() handle
-		 * the remainder.
-		 */
-		return;
-	else if (error) {
-		/*
-		 * Don't notice the error to the upper layer yet.
-		 * The error handling decision is made by the target driver,
-		 * when the request is completed.
-		 */
-		tio->error = error;
-		return;
-	}
-
-	/*
-	 * I/O for the bio successfully completed.
-	 * Notice the data completion to the upper layer.
-	 */
-
-	/*
-	 * bios are processed from the head of the list.
-	 * So the completing bio should always be rq->bio.
-	 * If it's not, something wrong is happening.
-	 */
-	if (tio->orig->bio != bio)
-		DMERR("bio completion is going in the middle of the request");
-
-	/*
-	 * Update the original request.
-	 * Do not use blk_end_request() here, because it may complete
-	 * the original request before the clone, and break the ordering.
-	 */
-	blk_update_request(tio->orig, 0, nr_bytes);
-}
-
 static struct dm_rq_target_io *tio_from_request(struct request *rq)
 {
 	return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
@@ -1089,8 +1038,6 @@ static void free_rq_clone(struct request *clone, bool must_be_mapped)
 
 	WARN_ON_ONCE(must_be_mapped && !clone->q);
 
-	blk_rq_unprep_clone(clone);
-
 	if (md->type == DM_TYPE_MQ_REQUEST_BASED)
 		/* stacked on blk-mq queue(s) */
 		tio->ti->type->release_clone_rq(clone);
@@ -1821,39 +1768,13 @@ static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
 		dm_complete_request(rq, r);
 }
 
-static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
-				 void *data)
+static void setup_clone(struct request *clone, struct request *rq,
+		        struct dm_rq_target_io *tio)
 {
-	struct dm_rq_target_io *tio = data;
-	struct dm_rq_clone_bio_info *info =
-		container_of(bio, struct dm_rq_clone_bio_info, clone);
-
-	info->orig = bio_orig;
-	info->tio = tio;
-	bio->bi_end_io = end_clone_bio;
-
-	return 0;
-}
-
-static int setup_clone(struct request *clone, struct request *rq,
-		       struct dm_rq_target_io *tio, gfp_t gfp_mask)
-{
-	int r;
-
-	r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
-			      dm_rq_bio_constructor, tio);
-	if (r)
-		return r;
-
-	clone->cmd = rq->cmd;
-	clone->cmd_len = rq->cmd_len;
-	clone->sense = rq->sense;
+	blk_rq_prep_clone(clone, rq);
 	clone->end_io = end_clone_request;
 	clone->end_io_data = tio;
-
 	tio->clone = clone;
-
-	return 0;
 }
 
 static struct request *clone_rq(struct request *rq, struct mapped_device *md,
@@ -1874,12 +1795,7 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md,
 		clone = tio->clone;
 
 	blk_rq_init(NULL, clone);
-	if (setup_clone(clone, rq, tio, gfp_mask)) {
-		/* -ENOMEM */
-		if (alloc_clone)
-			free_clone_request(md, clone);
-		return NULL;
-	}
+	setup_clone(clone, rq, tio);
 
 	return clone;
 }
@@ -1973,11 +1889,7 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,
 		}
 		if (IS_ERR(clone))
 			return DM_MAPIO_REQUEUE;
-		if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
-			/* -ENOMEM */
-			ti->type->release_clone_rq(clone);
-			return DM_MAPIO_REQUEUE;
-		}
+		setup_clone(clone, rq, tio);
 	}
 
 	switch (r) {
@@ -2431,8 +2343,6 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 		goto out;
 	}
 
-	BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
-
 	md->io_pool = p->io_pool;
 	p->io_pool = NULL;
 	md->rq_pool = p->rq_pool;
@@ -3536,48 +3446,23 @@ int dm_noflush_suspending(struct dm_target *ti)
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
-struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
-					    unsigned integrity, unsigned per_bio_data_size)
+struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
+					     unsigned per_bio_data_size)
 {
-	struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
-	struct kmem_cache *cachep = NULL;
-	unsigned int pool_size = 0;
+	struct dm_md_mempools *pools;
+	unsigned int pool_size = dm_get_reserved_bio_based_ios();
 	unsigned int front_pad;
 
+	pools = kzalloc(sizeof(*pools), GFP_KERNEL);
 	if (!pools)
 		return NULL;
 
-	type = filter_md_type(type, md);
+	front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) +
+		offsetof(struct dm_target_io, clone);
 
-	switch (type) {
-	case DM_TYPE_BIO_BASED:
-		cachep = _io_cache;
-		pool_size = dm_get_reserved_bio_based_ios();
-		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
-		break;
-	case DM_TYPE_REQUEST_BASED:
-		cachep = _rq_tio_cache;
-		pool_size = dm_get_reserved_rq_based_ios();
-		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
-		if (!pools->rq_pool)
-			goto out;
-		/* fall through to setup remaining rq-based pools */
-	case DM_TYPE_MQ_REQUEST_BASED:
-		if (!pool_size)
-			pool_size = dm_get_reserved_rq_based_ios();
-		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
-		/* per_bio_data_size is not used. See __bind_mempools(). */
-		WARN_ON(per_bio_data_size != 0);
-		break;
-	default:
-		BUG();
-	}
-
-	if (cachep) {
-		pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
-		if (!pools->io_pool)
-			goto out;
-	}
+	pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache);
+	if (!pools->io_pool)
+		goto out;
 
 	pools->bs = bioset_create_nobvec(pool_size, front_pad);
 	if (!pools->bs)
@@ -3587,10 +3472,34 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
 		goto out;
 
 	return pools;
-
 out:
 	dm_free_md_mempools(pools);
+	return NULL;
+}
+
+struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md,
+					    unsigned type)
+{
+	unsigned int pool_size = dm_get_reserved_rq_based_ios();
+	struct dm_md_mempools *pools;
 
+	pools = kzalloc(sizeof(*pools), GFP_KERNEL);
+	if (!pools)
+		return NULL;
+
+	if (filter_md_type(type, md) == DM_TYPE_REQUEST_BASED) {
+		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
+		if (!pools->rq_pool)
+			goto out;
+	}
+
+	pools->io_pool = mempool_create_slab_pool(pool_size, _rq_tio_cache);
+	if (!pools->io_pool)
+		goto out;
+
+	return pools;
+out:
+	dm_free_md_mempools(pools);
 	return NULL;
 }
 
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 6123c2bf9150c..e6e66d087b269 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -222,8 +222,9 @@ void dm_kcopyd_exit(void);
 /*
  * Mempool operations
  */
-struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
-					    unsigned integrity, unsigned per_bio_data_size);
+struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
+					     unsigned per_bio_data_size);
+struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md, unsigned type);
 void dm_free_md_mempools(struct dm_md_mempools *pools);
 
 /*
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 3f4ded0b1a342..45a6be89957c9 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -192,6 +192,7 @@ enum rq_flag_bits {
 	__REQ_HASHED,		/* on IO scheduler merge hash */
 	__REQ_MQ_INFLIGHT,	/* track inflight for MQ */
 	__REQ_NO_TIMEOUT,	/* requests may never expire */
+	__REQ_CLONE,		/* cloned bios */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -246,5 +247,6 @@ enum rq_flag_bits {
 #define REQ_HASHED		(1ULL << __REQ_HASHED)
 #define REQ_MQ_INFLIGHT		(1ULL << __REQ_MQ_INFLIGHT)
 #define REQ_NO_TIMEOUT		(1ULL << __REQ_NO_TIMEOUT)
+#define REQ_CLONE		(1ULL << __REQ_CLONE)
 
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bc917956a6d0b..9ded80da2c165 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -775,11 +775,7 @@ extern void blk_add_request_payload(struct request *rq, struct page *page,
 		unsigned int len);
 extern int blk_rq_check_limits(struct request_queue *q, struct request *rq);
 extern int blk_lld_busy(struct request_queue *q);
-extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
-			     struct bio_set *bs, gfp_t gfp_mask,
-			     int (*bio_ctr)(struct bio *, struct bio *, void *),
-			     void *data);
-extern void blk_rq_unprep_clone(struct request *rq);
+extern void blk_rq_prep_clone(struct request *rq, struct request *rq_src);
 extern int blk_insert_cloned_request(struct request_queue *q,
 				     struct request *rq);
 extern void blk_delay_queue(struct request_queue *, unsigned long);

From f6454b049d81bb3d732b6a8afde08420589c6af9 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia.lawall@lip6.fr>
Date: Tue, 26 May 2015 21:59:53 +0200
Subject: [PATCH 3/6] block: fix returnvar.cocci warnings

Remove unneeded variable used to store return value.

Generated by: scripts/coccinelle/misc/returnvar.cocci

Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Julia Lawall <julia.lawall@lip6.fr>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/buffer.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index efd85e0e86600..f96173ad62d90 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2996,7 +2996,6 @@ void guard_bio_eod(int rw, struct bio *bio)
 int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
 {
 	struct bio *bio;
-	int ret = 0;
 
 	BUG_ON(!buffer_locked(bh));
 	BUG_ON(!buffer_mapped(bh));
@@ -3038,7 +3037,7 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
 		rw |= REQ_PRIO;
 
 	submit_bio(rw, bio);
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(_submit_bh);
 

From beefa6ba7bf304d3de3a02cb7366fb0a7d6b27ab Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 29 May 2015 13:10:23 -0600
Subject: [PATCH 4/6] block: only honor SG gap prevention for merges that
 contain data

We can safely merge anything that wont generate an SG list entry,
so if the bio is data-less (discard), don't look at potential
SG gaps.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-merge.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index fd3fee81c23ce..30a0d9f890170 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -589,7 +589,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 	    !blk_write_same_mergeable(rq->bio, bio))
 		return false;
 
-	if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS)) {
+	/* Only check gaps if the bio carries data */
+	if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) && bio_has_data(bio)) {
 		struct bio_vec *bprev;
 
 		bprev = &rq->biotail->bi_io_vec[rq->biotail->bi_vcnt - 1];

From e548ca4ee4595f65b262661d166310ad8a149bec Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 29 May 2015 13:11:32 -0600
Subject: [PATCH 5/6] block: don't honor chunk sizes for data-less IO

We don't need to honor chunk sizes for IO that doesn't carry any
data.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9ded80da2c165..ccaa9aecd5930 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -903,7 +903,7 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq)
 	if (unlikely(rq->cmd_type == REQ_TYPE_BLOCK_PC))
 		return q->limits.max_hw_sectors;
 
-	if (!q->limits.chunk_sectors)
+	if (!q->limits.chunk_sectors || (rq->cmd_flags & REQ_DISCARD))
 		return blk_queue_get_max_sectors(q, rq->cmd_flags);
 
 	return min(blk_max_size_offset(q, blk_rq_pos(rq)),

From f26cdc8536ad50fb802a0445f836b4f94ca09ae7 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Mon, 1 Jun 2015 09:29:53 -0600
Subject: [PATCH 6/6] blk-mq: Shared tag enhancements

Storage controllers may expose multiple block devices that share hardware
resources managed by blk-mq. This patch enhances the shared tags so a
low-level driver can access the shared resources not tied to the unshared
h/w contexts. This way the LLD can dynamically add and delete disks and
request queues without having to track all the request_queue hctx's to
iterate outstanding tags.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-tag.c     | 38 ++++++++++++++++++++++++++++++++++++++
 block/blk-mq-tag.h     |  1 +
 block/blk-mq.c         | 12 ++++++++++--
 include/linux/blk-mq.h |  4 ++++
 4 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index be3290cc0644e..9b6e28830b823 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -438,6 +438,39 @@ static void bt_for_each(struct blk_mq_hw_ctx *hctx,
 	}
 }
 
+static void bt_tags_for_each(struct blk_mq_tags *tags,
+		struct blk_mq_bitmap_tags *bt, unsigned int off,
+		busy_tag_iter_fn *fn, void *data, bool reserved)
+{
+	struct request *rq;
+	int bit, i;
+
+	if (!tags->rqs)
+		return;
+	for (i = 0; i < bt->map_nr; i++) {
+		struct blk_align_bitmap *bm = &bt->map[i];
+
+		for (bit = find_first_bit(&bm->word, bm->depth);
+		     bit < bm->depth;
+		     bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
+			rq = blk_mq_tag_to_rq(tags, off + bit);
+			fn(rq, data, reserved);
+		}
+
+		off += (1 << bt->bits_per_word);
+	}
+}
+
+void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
+		void *priv)
+{
+	if (tags->nr_reserved_tags)
+		bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true);
+	bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
+			false);
+}
+EXPORT_SYMBOL(blk_mq_all_tag_busy_iter);
+
 void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
 		void *priv)
 {
@@ -580,6 +613,11 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 	if (!tags)
 		return NULL;
 
+	if (!zalloc_cpumask_var(&tags->cpumask, GFP_KERNEL)) {
+		kfree(tags);
+		return NULL;
+	}
+
 	tags->nr_tags = total_tags;
 	tags->nr_reserved_tags = reserved_tags;
 
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 90767b370308d..75893a34237d2 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -44,6 +44,7 @@ struct blk_mq_tags {
 	struct list_head page_list;
 
 	int alloc_policy;
+	cpumask_var_t cpumask;
 };
 
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c382a34fe5ac6..ef100fd2cb86a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1525,7 +1525,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 			i++;
 		}
 	}
-
 	return tags;
 
 fail:
@@ -1821,6 +1820,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 
 		hctx = q->mq_ops->map_queue(q, i);
 		cpumask_set_cpu(i, hctx->cpumask);
+		cpumask_set_cpu(i, hctx->tags->cpumask);
 		ctx->index_hw = hctx->nr_ctx;
 		hctx->ctxs[hctx->nr_ctx++] = ctx;
 	}
@@ -2187,6 +2187,12 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 	return 0;
 }
 
+struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags)
+{
+	return tags->cpumask;
+}
+EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask);
+
 /*
  * Alloc a tag set to be associated with one or more request queues.
  * May fail with EINVAL for various error conditions. May adjust the
@@ -2248,8 +2254,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 	int i;
 
 	for (i = 0; i < set->nr_hw_queues; i++) {
-		if (set->tags[i])
+		if (set->tags[i]) {
 			blk_mq_free_rq_map(set, set->tags[i], i);
+			free_cpumask_var(set->tags[i]->cpumask);
+		}
 	}
 
 	kfree(set->tags);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2056a99b92f8c..37d1602c4f7aa 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -96,6 +96,7 @@ typedef void (exit_request_fn)(void *, struct request *, unsigned int,
 
 typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
 		bool);
+typedef void (busy_tag_iter_fn)(struct request *, void *, bool);
 
 struct blk_mq_ops {
 	/*
@@ -182,6 +183,7 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
 		gfp_t gfp, bool reserved);
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
+struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags);
 
 enum {
 	BLK_MQ_UNIQUE_TAG_BITS = 16,
@@ -224,6 +226,8 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
 void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
 		void *priv);
+void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
+		void *priv);
 void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_unfreeze_queue(struct request_queue *q);
 void blk_mq_freeze_queue_start(struct request_queue *q);