From 46464411307746e6297a034a9983a22c9dfc5a0c Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Thu, 18 May 2017 17:28:47 +0200
Subject: [PATCH 1/5] xen/blkback: fix disconnect while I/Os in flight
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Today disconnecting xen-blkback is broken in case there are still
I/Os in flight: xen_blkif_disconnect() will bail out early without
releasing all resources in the hope it will be called again when
the last request has terminated. This, however, won't happen as
xen_blkif_free() won't be called on termination of the last running
request: xen_blkif_put() won't decrement the blkif refcnt to 0 as
xen_blkif_disconnect() didn't finish before thus some xen_blkif_put()
calls in xen_blkif_disconnect() didn't happen.

To solve this deadlock xen_blkif_disconnect() and
xen_blkif_alloc_rings() shouldn't use xen_blkif_put() and
xen_blkif_get() but use some other way to do their accounting of
resources.

This at once fixes another error in xen_blkif_disconnect(): when it
returned early with -EBUSY for another ring than 0 it would call
xen_blkif_put() again for already handled rings on a subsequent call.
This will lead to inconsistencies in the refcnt handling.

Cc: stable@vger.kernel.org
Signed-off-by: Juergen Gross <jgross@suse.com>
Tested-by: Steven Haigh <netwiz@crc.id.au>
Acked-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkback/common.h | 1 +
 drivers/block/xen-blkback/xenbus.c | 7 +++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index dea61f6ab8cbd..638597b17a38c 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -281,6 +281,7 @@ struct xen_blkif_ring {
 
 	wait_queue_head_t	wq;
 	atomic_t		inflight;
+	bool			active;
 	/* One thread per blkif ring. */
 	struct task_struct	*xenblkd;
 	unsigned int		waiting_reqs;
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 1f3dfaa54d871..998915174bb8c 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -159,7 +159,7 @@ static int xen_blkif_alloc_rings(struct xen_blkif *blkif)
 		init_waitqueue_head(&ring->shutdown_wq);
 		ring->blkif = blkif;
 		ring->st_print = jiffies;
-		xen_blkif_get(blkif);
+		ring->active = true;
 	}
 
 	return 0;
@@ -249,6 +249,9 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif)
 		struct xen_blkif_ring *ring = &blkif->rings[r];
 		unsigned int i = 0;
 
+		if (!ring->active)
+			continue;
+
 		if (ring->xenblkd) {
 			kthread_stop(ring->xenblkd);
 			wake_up(&ring->shutdown_wq);
@@ -296,7 +299,7 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif)
 		BUG_ON(ring->free_pages_num != 0);
 		BUG_ON(ring->persistent_gnt_c != 0);
 		WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
-		xen_blkif_put(blkif);
+		ring->active = false;
 	}
 	blkif->nr_ring_pages = 0;
 	/*

From 71df1d7ccad1c36f7321d6b3b48f2ea42681c363 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Thu, 18 May 2017 17:28:48 +0200
Subject: [PATCH 2/5] xen/blkback: don't free be structure too early
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The be structure must not be freed when freeing the blkif structure
isn't done. Otherwise a use-after-free of be when unmapping the ring
used for communicating with the frontend will occur in case of a
late call of xenblk_disconnect() (e.g. due to an I/O still active
when trying to disconnect).

Signed-off-by: Juergen Gross <jgross@suse.com>
Tested-by: Steven Haigh <netwiz@crc.id.au>
Acked-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkback/xenbus.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 998915174bb8c..4cdf0490983e4 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -315,9 +315,10 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif)
 
 static void xen_blkif_free(struct xen_blkif *blkif)
 {
-
-	xen_blkif_disconnect(blkif);
+	WARN_ON(xen_blkif_disconnect(blkif));
 	xen_vbd_free(&blkif->vbd);
+	kfree(blkif->be->mode);
+	kfree(blkif->be);
 
 	/* Make sure everything is drained before shutting down */
 	kmem_cache_free(xen_blkif_cachep, blkif);
@@ -514,8 +515,6 @@ static int xen_blkbk_remove(struct xenbus_device *dev)
 		xen_blkif_put(be->blkif);
 	}
 
-	kfree(be->mode);
-	kfree(be);
 	return 0;
 }
 

From a24fa22ce22ae302b3bf8f7008896d52d5d57b8d Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Thu, 18 May 2017 17:28:49 +0200
Subject: [PATCH 3/5] xen/blkback: don't use xen_blkif_get() in xen-blkback
 kthread
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is no need to use xen_blkif_get()/xen_blkif_put() in the kthread
of xen-blkback. Thread stopping is synchronous and using the blkif
reference counting in the kthread will avoid to ever let the reference
count drop to zero at the end of an I/O running concurrent to
disconnecting and multiple rings.

Setting ring->xenblkd to NULL after stopping the kthread isn't needed
as the kthread does this already.

Signed-off-by: Juergen Gross <jgross@suse.com>
Tested-by: Steven Haigh <netwiz@crc.id.au>
Acked-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkback/blkback.c | 3 ---
 drivers/block/xen-blkback/xenbus.c  | 1 -
 2 files changed, 4 deletions(-)

diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 726c32e35db9c..6b14c509f3c7b 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -609,8 +609,6 @@ int xen_blkif_schedule(void *arg)
 	unsigned long timeout;
 	int ret;
 
-	xen_blkif_get(blkif);
-
 	set_freezable();
 	while (!kthread_should_stop()) {
 		if (try_to_freeze())
@@ -665,7 +663,6 @@ int xen_blkif_schedule(void *arg)
 		print_stats(ring);
 
 	ring->xenblkd = NULL;
-	xen_blkif_put(blkif);
 
 	return 0;
 }
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 4cdf0490983e4..792da683e70da 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -255,7 +255,6 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif)
 		if (ring->xenblkd) {
 			kthread_stop(ring->xenblkd);
 			wake_up(&ring->shutdown_wq);
-			ring->xenblkd = NULL;
 		}
 
 		/* The above kthread_stop() guarantees that at this point we

From 089bc0143f489bd3a4578bdff5f4ca68fb26f341 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@suse.com>
Date: Tue, 13 Jun 2017 16:28:27 -0400
Subject: [PATCH 4/5] xen-blkback: don't leak stack data via response ring

Rather than constructing a local structure instance on the stack, fill
the fields directly on the shared ring, just like other backends do.
Build on the fact that all response structure flavors are actually
identical (the old code did make this assumption too).

This is XSA-216.

Cc: stable@vger.kernel.org

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkback/blkback.c | 23 ++++++++++++-----------
 drivers/block/xen-blkback/common.h  | 25 +++++--------------------
 2 files changed, 17 insertions(+), 31 deletions(-)

diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 6b14c509f3c7b..0e824091a12fa 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -1433,34 +1433,35 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
 static void make_response(struct xen_blkif_ring *ring, u64 id,
 			  unsigned short op, int st)
 {
-	struct blkif_response  resp;
+	struct blkif_response *resp;
 	unsigned long     flags;
 	union blkif_back_rings *blk_rings;
 	int notify;
 
-	resp.id        = id;
-	resp.operation = op;
-	resp.status    = st;
-
 	spin_lock_irqsave(&ring->blk_ring_lock, flags);
 	blk_rings = &ring->blk_rings;
 	/* Place on the response ring for the relevant domain. */
 	switch (ring->blkif->blk_protocol) {
 	case BLKIF_PROTOCOL_NATIVE:
-		memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
-		       &resp, sizeof(resp));
+		resp = RING_GET_RESPONSE(&blk_rings->native,
+					 blk_rings->native.rsp_prod_pvt);
 		break;
 	case BLKIF_PROTOCOL_X86_32:
-		memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
-		       &resp, sizeof(resp));
+		resp = RING_GET_RESPONSE(&blk_rings->x86_32,
+					 blk_rings->x86_32.rsp_prod_pvt);
 		break;
 	case BLKIF_PROTOCOL_X86_64:
-		memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
-		       &resp, sizeof(resp));
+		resp = RING_GET_RESPONSE(&blk_rings->x86_64,
+					 blk_rings->x86_64.rsp_prod_pvt);
 		break;
 	default:
 		BUG();
 	}
+
+	resp->id        = id;
+	resp->operation = op;
+	resp->status    = st;
+
 	blk_rings->common.rsp_prod_pvt++;
 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
 	spin_unlock_irqrestore(&ring->blk_ring_lock, flags);
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index 638597b17a38c..ecb35fe8ca8db 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -75,9 +75,8 @@ extern unsigned int xenblk_max_queues;
 struct blkif_common_request {
 	char dummy;
 };
-struct blkif_common_response {
-	char dummy;
-};
+
+/* i386 protocol version */
 
 struct blkif_x86_32_request_rw {
 	uint8_t        nr_segments;  /* number of segments                   */
@@ -129,14 +128,6 @@ struct blkif_x86_32_request {
 	} u;
 } __attribute__((__packed__));
 
-/* i386 protocol version */
-#pragma pack(push, 4)
-struct blkif_x86_32_response {
-	uint64_t        id;              /* copied from request */
-	uint8_t         operation;       /* copied from request */
-	int16_t         status;          /* BLKIF_RSP_???       */
-};
-#pragma pack(pop)
 /* x86_64 protocol version */
 
 struct blkif_x86_64_request_rw {
@@ -193,18 +184,12 @@ struct blkif_x86_64_request {
 	} u;
 } __attribute__((__packed__));
 
-struct blkif_x86_64_response {
-	uint64_t       __attribute__((__aligned__(8))) id;
-	uint8_t         operation;       /* copied from request */
-	int16_t         status;          /* BLKIF_RSP_???       */
-};
-
 DEFINE_RING_TYPES(blkif_common, struct blkif_common_request,
-		  struct blkif_common_response);
+		  struct blkif_response);
 DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request,
-		  struct blkif_x86_32_response);
+		  struct blkif_response __packed);
 DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request,
-		  struct blkif_x86_64_response);
+		  struct blkif_response);
 
 union blkif_back_rings {
 	struct blkif_back_ring        native;

From 8e8320c9315c47a6a090188720ccff32a6a6ba18 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 20 Jun 2017 17:56:13 -0600
Subject: [PATCH 5/5] blk-mq: fix performance regression with shared tags

If we have shared tags enabled, then every IO completion will trigger
a full loop of every queue belonging to a tag set, and every hardware
queue for each of those queues, even if nothing needs to be done.
This causes a massive performance regression if you have a lot of
shared devices.

Instead of doing this huge full scan on every IO, add an atomic
counter to the main queue that tracks how many hardware queues have
been marked as needing a restart. With that, we can avoid looking for
restartable queues, if we don't have to.

Max reports that this restores performance. Before this patch, 4K
IOPS was limited to 22-23K IOPS. With the patch, we are running at
950-970K IOPS.

Fixes: 6d8c6c0f97ad ("blk-mq: Restart a single queue if tag sets are shared")
Reported-by: Max Gurtovoy <maxg@mellanox.com>
Tested-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
Tested-by: Bart Van Assche <bart.vanassche@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-sched.c   | 58 +++++++++++++++++++++++++++++++++---------
 block/blk-mq-sched.h   |  9 -------
 block/blk-mq.c         | 16 +++++++++---
 include/linux/blkdev.h |  2 ++
 4 files changed, 61 insertions(+), 24 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 1f5b692526ae1..0ded5e8463356 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -68,6 +68,45 @@ static void blk_mq_sched_assign_ioc(struct request_queue *q,
 		__blk_mq_sched_assign_ioc(q, rq, bio, ioc);
 }
 
+/*
+ * Mark a hardware queue as needing a restart. For shared queues, maintain
+ * a count of how many hardware queues are marked for restart.
+ */
+static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
+{
+	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+		return;
+
+	if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
+		struct request_queue *q = hctx->queue;
+
+		if (!test_and_set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+			atomic_inc(&q->shared_hctx_restart);
+	} else
+		set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+}
+
+static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
+{
+	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+		return false;
+
+	if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
+		struct request_queue *q = hctx->queue;
+
+		if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+			atomic_dec(&q->shared_hctx_restart);
+	} else
+		clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+
+	if (blk_mq_hctx_has_pending(hctx)) {
+		blk_mq_run_hw_queue(hctx, true);
+		return true;
+	}
+
+	return false;
+}
+
 struct request *blk_mq_sched_get_request(struct request_queue *q,
 					 struct bio *bio,
 					 unsigned int op,
@@ -266,18 +305,6 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
 	return true;
 }
 
-static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
-{
-	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
-		clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
-		if (blk_mq_hctx_has_pending(hctx)) {
-			blk_mq_run_hw_queue(hctx, true);
-			return true;
-		}
-	}
-	return false;
-}
-
 /**
  * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list
  * @pos:    loop cursor.
@@ -309,6 +336,13 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
 	unsigned int i, j;
 
 	if (set->flags & BLK_MQ_F_TAG_SHARED) {
+		/*
+		 * If this is 0, then we know that no hardware queues
+		 * have RESTART marked. We're done.
+		 */
+		if (!atomic_read(&queue->shared_hctx_restart))
+			return;
+
 		rcu_read_lock();
 		list_for_each_entry_rcu_rr(q, queue, &set->tag_list,
 					   tag_set_list) {
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index edafb5383b7bb..5007edece51ac 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -115,15 +115,6 @@ static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
 	return false;
 }
 
-/*
- * Mark a hardware queue as needing a restart.
- */
-static inline void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
-{
-	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
-		set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
-}
-
 static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
 {
 	return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index bb66c96850b18..958cedaff8b82 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2103,20 +2103,30 @@ static void blk_mq_map_swqueue(struct request_queue *q,
 	}
 }
 
+/*
+ * Caller needs to ensure that we're either frozen/quiesced, or that
+ * the queue isn't live yet.
+ */
 static void queue_set_hctx_shared(struct request_queue *q, bool shared)
 {
 	struct blk_mq_hw_ctx *hctx;
 	int i;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
-		if (shared)
+		if (shared) {
+			if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+				atomic_inc(&q->shared_hctx_restart);
 			hctx->flags |= BLK_MQ_F_TAG_SHARED;
-		else
+		} else {
+			if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+				atomic_dec(&q->shared_hctx_restart);
 			hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
+		}
 	}
 }
 
-static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, bool shared)
+static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
+					bool shared)
 {
 	struct request_queue *q;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b74a3edcb3da8..1ddd36bd2173b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -391,6 +391,8 @@ struct request_queue {
 	int			nr_rqs[2];	/* # allocated [a]sync rqs */
 	int			nr_rqs_elvpriv;	/* # allocated rqs w/ elvpriv */
 
+	atomic_t		shared_hctx_restart;
+
 	struct blk_queue_stats	*stats;
 	struct rq_wb		*rq_wb;