From 222f58ac68cd6f98a52faf9f26a219c996a5aa17 Mon Sep 17 00:00:00 2001 From: Azeem Shaikh Date: Mon, 3 Jul 2023 17:21:58 +0000 Subject: [PATCH 001/113] kyber: Replace strlcpy with strscpy strlcpy() reads the entire source buffer first. This read may exceed the destination size limit. This is both inefficient and can lead to linear read overflows if a source string is not NUL-terminated [1]. In an effort to remove strlcpy() completely [2], replace strlcpy() here with strscpy(). No return values were used, so direct replacement is safe. [1] https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy [2] https://github.com/KSPP/linux/issues/89 Signed-off-by: Azeem Shaikh Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20230703172159.3668349-2-azeemshaikh38@gmail.com Signed-off-by: Jens Axboe --- include/trace/events/kyber.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/trace/events/kyber.h b/include/trace/events/kyber.h index bf7533f171ff9..9d44781efc1c4 100644 --- a/include/trace/events/kyber.h +++ b/include/trace/events/kyber.h @@ -31,8 +31,8 @@ TRACE_EVENT(kyber_latency, TP_fast_assign( __entry->dev = dev; - strlcpy(__entry->domain, domain, sizeof(__entry->domain)); - strlcpy(__entry->type, type, sizeof(__entry->type)); + strscpy(__entry->domain, domain, sizeof(__entry->domain)); + strscpy(__entry->type, type, sizeof(__entry->type)); __entry->percentile = percentile; __entry->numerator = numerator; __entry->denominator = denominator; @@ -59,7 +59,7 @@ TRACE_EVENT(kyber_adjust, TP_fast_assign( __entry->dev = dev; - strlcpy(__entry->domain, domain, sizeof(__entry->domain)); + strscpy(__entry->domain, domain, sizeof(__entry->domain)); __entry->depth = depth; ), @@ -81,7 +81,7 @@ TRACE_EVENT(kyber_throttled, TP_fast_assign( __entry->dev = dev; - strlcpy(__entry->domain, domain, sizeof(__entry->domain)); + strscpy(__entry->domain, domain, sizeof(__entry->domain)); ), TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), From 16291561e125ef50cb1875227fce9c1583f0de3c Mon Sep 17 00:00:00 2001 From: Azeem Shaikh Date: Mon, 3 Jul 2023 17:21:59 +0000 Subject: [PATCH 002/113] blk-wbt: Replace strlcpy with strscpy strlcpy() reads the entire source buffer first. This read may exceed the destination size limit. This is both inefficient and can lead to linear read overflows if a source string is not NUL-terminated [1]. In an effort to remove strlcpy() completely [2], replace strlcpy() here with strscpy(). No return values were used, so direct replacement is safe. 
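For reference, a minimal sketch (illustrative only, not part of either patch) of the semantic difference between the two calls:

	char dst[8];

	/*
	 * strlcpy() walks all of src to compute its length and returns
	 * strlen(src), so src must be NUL-terminated or the read runs
	 * past the end of the source buffer.
	 */
	size_t n = strlcpy(dst, "much-longer-source", sizeof(dst)); /* n == 18 */

	/*
	 * strscpy() reads at most sizeof(dst) bytes from src and returns
	 * the number of bytes copied, or -E2BIG if the result was
	 * truncated -- and no caller converted here used the return value.
	 */
	ssize_t m = strscpy(dst, "much-longer-source", sizeof(dst)); /* m == -E2BIG */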
[1] https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy [2] https://github.com/KSPP/linux/issues/89 Signed-off-by: Azeem Shaikh Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20230703172159.3668349-3-azeemshaikh38@gmail.com Signed-off-by: Jens Axboe --- include/trace/events/wbt.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/trace/events/wbt.h b/include/trace/events/wbt.h index 9c66e59d859cb..4661f0d27062d 100644 --- a/include/trace/events/wbt.h +++ b/include/trace/events/wbt.h @@ -33,7 +33,7 @@ TRACE_EVENT(wbt_stat, ), TP_fast_assign( - strlcpy(__entry->name, bdi_dev_name(bdi), + strscpy(__entry->name, bdi_dev_name(bdi), ARRAY_SIZE(__entry->name)); __entry->rmean = stat[0].mean; __entry->rmin = stat[0].min; @@ -68,7 +68,7 @@ TRACE_EVENT(wbt_lat, ), TP_fast_assign( - strlcpy(__entry->name, bdi_dev_name(bdi), + strscpy(__entry->name, bdi_dev_name(bdi), ARRAY_SIZE(__entry->name)); __entry->lat = div_u64(lat, 1000); ), @@ -105,7 +105,7 @@ TRACE_EVENT(wbt_step, ), TP_fast_assign( - strlcpy(__entry->name, bdi_dev_name(bdi), + strscpy(__entry->name, bdi_dev_name(bdi), ARRAY_SIZE(__entry->name)); __entry->msg = msg; __entry->step = step; @@ -141,7 +141,7 @@ TRACE_EVENT(wbt_timer, ), TP_fast_assign( - strlcpy(__entry->name, bdi_dev_name(bdi), + strscpy(__entry->name, bdi_dev_name(bdi), ARRAY_SIZE(__entry->name)); __entry->status = status; __entry->step = step; From ffe357c868e7796f20bc0eac61f5b952731c0fa8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 13 Jul 2023 21:29:35 +0200 Subject: [PATCH 003/113] nbd: automatically load module on genl access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a module alias to nbd.ko that allows the generic netlink core to automatically load the module when netlink messages for nbd are received. This frees the user from manually having to load the module before using nbd functionality via netlink. If the system policy allows it, this can even be used to load the nbd module from containers which would otherwise not have access to the necessary module files to do a normal "modprobe nbd". For example this avoids the following error when using nbd-client: $ nbd-client localhost 10809 /dev/nbd0 ... Error: Couldn't resolve the nbd netlink family, make sure the nbd module is loaded and your nbd driver supports the netlink interface. Signed-off-by: Thomas Weißschuh Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20230713-b4-nbd-genl-v3-1-226cbddba04b@weissschuh.net Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 8576d696c7a22..a346dbd73543f 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -2336,6 +2336,7 @@ static struct genl_family nbd_genl_family __ro_after_init = { .mcgrps = nbd_mcast_grps, .n_mcgrps = ARRAY_SIZE(nbd_mcast_grps), }; +MODULE_ALIAS_GENL_FAMILY(NBD_GENL_FAMILY_NAME); static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply) { From c4e21bcd0f9d01f9c5d6c52007f5541871a5b1de Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 7 Jul 2023 11:42:38 +0200 Subject: [PATCH 004/113] block: cleanup queue_wc_store Get rid of the local variable in queue_wc_store() and set or clear the QUEUE_FLAG_WC flag directly from the if / else if branches. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230707094239.107968-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index afc797fb0dfc4..0cde6598fb2f4 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -449,21 +449,13 @@ static ssize_t queue_wc_show(struct request_queue *q, char *page) static ssize_t queue_wc_store(struct request_queue *q, const char *page, size_t count) { - int set = -1; - if (!strncmp(page, "write back", 10)) - set = 1; + blk_queue_flag_set(QUEUE_FLAG_WC, q); else if (!strncmp(page, "write through", 13) || !strncmp(page, "none", 4)) - set = 0; - - if (set == -1) - return -EINVAL; - - if (set) - blk_queue_flag_set(QUEUE_FLAG_WC, q); - else blk_queue_flag_clear(QUEUE_FLAG_WC, q); + else + return -EINVAL; return count; } From 43c9835b144c7ce29efe142d662529662a9eb376 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 7 Jul 2023 11:42:39 +0200 Subject: [PATCH 005/113] block: don't allow enabling a cache on devices that don't support it Currently the write_cache attribute allows enabling the QUEUE_FLAG_WC flag on devices that never claimed the capability. Fix that by adding a QUEUE_FLAG_HW_WC flag that is set by blk_queue_write_cache and guards re-enabling the cache through sysfs. Note that any rescan that calls blk_queue_write_cache will still re-enable the write cache as in the current code. Fixes: 93e9d8e836cb ("block: add ability to flag write back caching on a device") Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230707094239.107968-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 7 +++++-- block/blk-sysfs.c | 11 +++++++---- include/linux/blkdev.h | 1 + 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 4dd59059b788e..0046b447268f9 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -830,10 +830,13 @@ EXPORT_SYMBOL(blk_set_queue_depth); */ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) { - if (wc) + if (wc) { + blk_queue_flag_set(QUEUE_FLAG_HW_WC, q); blk_queue_flag_set(QUEUE_FLAG_WC, q); - else + } else { + blk_queue_flag_clear(QUEUE_FLAG_HW_WC, q); blk_queue_flag_clear(QUEUE_FLAG_WC, q); + } if (fua) blk_queue_flag_set(QUEUE_FLAG_FUA, q); else diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 0cde6598fb2f4..63e4812623361 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -449,13 +449,16 @@ static ssize_t queue_wc_show(struct request_queue *q, char *page) static ssize_t queue_wc_store(struct request_queue *q, const char *page, size_t count) { - if (!strncmp(page, "write back", 10)) + if (!strncmp(page, "write back", 10)) { + if (!test_bit(QUEUE_FLAG_HW_WC, &q->queue_flags)) + return -EINVAL; blk_queue_flag_set(QUEUE_FLAG_WC, q); - else if (!strncmp(page, "write through", 13) || - !strncmp(page, "none", 4)) + } else if (!strncmp(page, "write through", 13) || + !strncmp(page, "none", 4)) { blk_queue_flag_clear(QUEUE_FLAG_WC, q); - else + } else { return -EINVAL; + } return count; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ed44a997f629f..2f5371b8482c0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -538,6 +538,7 @@ struct request_queue { #define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ #define QUEUE_FLAG_SYNCHRONOUS 11 /* always completes in submit context */ #define QUEUE_FLAG_SAME_FORCE 12 /* 
force complete on same CPU */ +#define QUEUE_FLAG_HW_WC 18 /* Write back caching supported */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ #define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */ #define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ From 660e802c76c89e871c29cd3174c07c8d23e39c35 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 17 Jul 2023 12:00:55 +0800 Subject: [PATCH 006/113] blk-mq: use percpu csd to remote complete instead of per-rq csd If a request needs to be completed remotely, we insert it into a percpu llist and call smp_call_function_single_async() if the llist was empty beforehand. We don't need a per-rq csd; a percpu csd is enough, and dropping the per-rq one shrinks struct request by 24 bytes. This is cleaner and still correct: the block softirq is guaranteed to be scheduled to consume the list whenever a new request is added to the percpu list, whether smp_call_function_single_async() returns -EBUSY or 0. Signed-off-by: Chengming Zhou Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230717040058.3993930-2-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 ++++++------ include/linux/blk-mq.h | 5 +---- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index d50b1d62a3d92..d986548696151 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -43,6 +43,7 @@ #include "blk-ioprio.h" static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); +static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd); static void blk_mq_insert_request(struct request *rq, blk_insert_t flags); static void blk_mq_request_bypass_insert(struct request *rq, @@ -1157,15 +1158,11 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq) static void blk_mq_complete_send_ipi(struct request *rq) { - struct llist_head *list; unsigned int cpu; cpu = rq->mq_ctx->cpu; - list = &per_cpu(blk_cpu_done, cpu); - if (llist_add(&rq->ipi_list, list)) { - INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq); - smp_call_function_single_async(cpu, &rq->csd); - } + if (llist_add(&rq->ipi_list, &per_cpu(blk_cpu_done, cpu))) + smp_call_function_single_async(cpu, &per_cpu(blk_cpu_csd, cpu)); } static void blk_mq_raise_softirq(struct request *rq) @@ -4829,6 +4826,9 @@ static int __init blk_mq_init(void) for_each_possible_cpu(i) init_llist_head(&per_cpu(blk_cpu_done, i)); + for_each_possible_cpu(i) + INIT_CSD(&per_cpu(blk_cpu_csd, i), + __blk_mq_complete_request_remote, NULL); open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b96e00499f9ee..67f8108576340 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -182,10 +182,7 @@ struct request { rq_end_io_fn *saved_end_io; } flush; - union { - struct __call_single_data csd; - u64 fifo_time; - }; + u64 fifo_time; /* * completion callback. From 28b241237470981a96fbd82077c8044466b61e5f Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 17 Jul 2023 12:00:56 +0800 Subject: [PATCH 007/113] blk-flush: fix rq->flush.seq for post-flush requests If the policy == (REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH), it means that the data sequence and post-flush sequence need to be done for this request. The rq->flush.seq should record what sequences have been done (or don't need to be done). So in this case pre-flush doesn't need to be done, and we should init rq->flush.seq to REQ_FSEQ_PREFLUSH, not REQ_FSEQ_POSTFLUSH. 
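For reference, the sequence bits involved are defined in block/blk-flush.c as quoted below; the snippet restates the corrected initialization (illustrative, mirroring the one-line fix in the diff that follows):

	/* bits in rq->flush.seq: a set bit means "done or not needed" */
	REQ_FSEQ_PREFLUSH  = (1 << 0),
	REQ_FSEQ_DATA      = (1 << 1),
	REQ_FSEQ_POSTFLUSH = (1 << 2),

	/*
	 * policy == REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH: the data and the
	 * post-flush steps still have to run, so only the pre-flush step
	 * may be marked as already satisfied:
	 */
	blk_rq_init_flush(rq);
	rq->flush.seq |= REQ_FSEQ_PREFLUSH;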
Fixes: 615939a2ae73 ("blk-mq: defer to the normal submission path for post-flush requests") Signed-off-by: Chengming Zhou Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230717040058.3993930-3-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/blk-flush.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index 8220517c2d67d..fdc489e0ea162 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -443,7 +443,7 @@ bool blk_insert_flush(struct request *rq) * the post flush, and then just pass the command on. */ blk_rq_init_flush(rq); - rq->flush.seq |= REQ_FSEQ_POSTFLUSH; + rq->flush.seq |= REQ_FSEQ_PREFLUSH; spin_lock_irq(&fq->mq_flush_lock); list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); spin_unlock_irq(&fq->mq_flush_lock); From b175c86739d38e41044d3136065f092a6d95aee6 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 17 Jul 2023 12:00:57 +0800 Subject: [PATCH 008/113] blk-flush: count inflight flush_data requests The flush state machine uses a doubly linked list to link all inflight flush_data requests, to avoid issuing separate post-flushes for flush_data requests that shared a PREFLUSH. So we can't reuse rq->queuelist; this is why we need rq->flush.list. In preparation for the next patch, which reuses rq->queuelist for the flush state machine, change the doubly linked list to an unsigned long counter that counts all inflight flush_data requests. This is fine since we only need to know whether any flush_data request is inflight, and a plain counter answers that. Signed-off-by: Chengming Zhou Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230717040058.3993930-4-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/blk-flush.c | 9 +++++---- block/blk.h | 5 ++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index fdc489e0ea162..fedb39031647e 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -187,7 +187,8 @@ static void blk_flush_complete_seq(struct request *rq, break; case REQ_FSEQ_DATA: - list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); + list_del_init(&rq->flush.list); + fq->flush_data_in_flight++; spin_lock(&q->requeue_lock); list_add(&rq->queuelist, &q->requeue_list); spin_unlock(&q->requeue_lock); @@ -299,7 +300,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, return; /* C2 and C3 */ - if (!list_empty(&fq->flush_data_in_flight) && + if (fq->flush_data_in_flight && time_before(jiffies, fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) return; @@ -374,6 +375,7 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, * the comment in flush_end_io(). 
*/ spin_lock_irqsave(&fq->mq_flush_lock, flags); + fq->flush_data_in_flight--; blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error); spin_unlock_irqrestore(&fq->mq_flush_lock, flags); @@ -445,7 +447,7 @@ bool blk_insert_flush(struct request *rq) blk_rq_init_flush(rq); rq->flush.seq |= REQ_FSEQ_PREFLUSH; spin_lock_irq(&fq->mq_flush_lock); - list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); + fq->flush_data_in_flight++; spin_unlock_irq(&fq->mq_flush_lock); return false; default: @@ -496,7 +498,6 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, INIT_LIST_HEAD(&fq->flush_queue[0]); INIT_LIST_HEAD(&fq->flush_queue[1]); - INIT_LIST_HEAD(&fq->flush_data_in_flight); return fq; diff --git a/block/blk.h b/block/blk.h index 608c5dcc516b5..686712e138352 100644 --- a/block/blk.h +++ b/block/blk.h @@ -15,15 +15,14 @@ struct elevator_type; extern struct dentry *blk_debugfs_root; struct blk_flush_queue { + spinlock_t mq_flush_lock; unsigned int flush_pending_idx:1; unsigned int flush_running_idx:1; blk_status_t rq_status; unsigned long flush_pending_since; struct list_head flush_queue[2]; - struct list_head flush_data_in_flight; + unsigned long flush_data_in_flight; struct request *flush_rq; - - spinlock_t mq_flush_lock; }; bool is_flush_rq(struct request *req); From 81ada09cc25e4bf2de7d2951925fb409338a545d Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 17 Jul 2023 12:00:58 +0800 Subject: [PATCH 009/113] blk-flush: reuse rq queuelist in flush state machine Since we don't need to maintain the inflight flush_data request list anymore, we can reuse rq->queuelist for the flush pending list. Note that in mq_flush_data_end_io() we need to re-initialize rq->queuelist before reusing it in the state machine at completion time, since rq->rq_next shares the same storage and the driver may have corrupted rq->queuelist through it. This patch decreases the size of struct request by 16 bytes. Signed-off-by: Chengming Zhou Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20230717040058.3993930-5-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/blk-flush.c | 17 ++++++++++------- include/linux/blk-mq.h | 1 - 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index fedb39031647e..e73dc22d05c1d 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -183,14 +183,13 @@ static void blk_flush_complete_seq(struct request *rq, /* queue for flush */ if (list_empty(pending)) fq->flush_pending_since = jiffies; - list_move_tail(&rq->flush.list, pending); + list_move_tail(&rq->queuelist, pending); break; case REQ_FSEQ_DATA: - list_del_init(&rq->flush.list); fq->flush_data_in_flight++; spin_lock(&q->requeue_lock); - list_add(&rq->queuelist, &q->requeue_list); + list_move(&rq->queuelist, &q->requeue_list); spin_unlock(&q->requeue_lock); blk_mq_kick_requeue_list(q); break; @@ -202,7 +201,7 @@ static void blk_flush_complete_seq(struct request *rq, * flush data request completion path. Restore @rq for * normal completion and end it. 
*/ - list_del_init(&rq->flush.list); + list_del_init(&rq->queuelist); blk_flush_restore_request(rq); blk_mq_end_request(rq, error); break; @@ -258,7 +257,7 @@ static enum rq_end_io_ret flush_end_io(struct request *flush_rq, fq->flush_running_idx ^= 1; /* and push the waiting requests to the next stage */ - list_for_each_entry_safe(rq, n, running, flush.list) { + list_for_each_entry_safe(rq, n, running, queuelist) { unsigned int seq = blk_flush_cur_seq(rq); BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); @@ -292,7 +291,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, { struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; struct request *first_rq = - list_first_entry(pending, struct request, flush.list); + list_first_entry(pending, struct request, queuelist); struct request *flush_rq = fq->flush_rq; /* C1 described at the top of this file */ @@ -376,6 +375,11 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, */ spin_lock_irqsave(&fq->mq_flush_lock, flags); fq->flush_data_in_flight--; + /* + * May have been corrupted by rq->rq_next reuse, we need to + * re-initialize rq->queuelist before reusing it here. + */ + INIT_LIST_HEAD(&rq->queuelist); blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error); spin_unlock_irqrestore(&fq->mq_flush_lock, flags); @@ -386,7 +390,6 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, static void blk_rq_init_flush(struct request *rq) { rq->flush.seq = 0; - INIT_LIST_HEAD(&rq->flush.list); rq->rq_flags |= RQF_FLUSH_SEQ; rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ rq->end_io = mq_flush_data_end_io; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 67f8108576340..01e8c31db6658 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -178,7 +178,6 @@ struct request { struct { unsigned int seq; - struct list_head list; rq_end_io_fn *saved_end_io; } flush; From 8f63fef5867fb5e8c29d9c14b6d739bfc1869d32 Mon Sep 17 00:00:00 2001 From: Nitesh Shetty Date: Wed, 19 Jul 2023 17:46:08 +0530 Subject: [PATCH 010/113] block: refactor to use helper Reduce some code by making use of bio_integrity_bytes(). Signed-off-by: Nitesh Shetty Reviewed-by: "Martin K. Petersen" Link: https://lore.kernel.org/r/20230719121608.32105-1-nj.shetty@samsung.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 4533eb4916610..8f0af7ac8573b 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -199,7 +199,6 @@ bool bio_integrity_prep(struct bio *bio) unsigned long start, end; unsigned int len, nr_pages; unsigned int bytes, offset, i; - unsigned int intervals; blk_status_t status; if (!bi) return true; @@ -224,10 +223,9 @@ bool bio_integrity_prep(struct bio *bio) !(bi->flags & BLK_INTEGRITY_GENERATE)) return true; } - intervals = bio_integrity_intervals(bi, bio_sectors(bio)); /* Allocate kernel buffer for protection data */ - len = intervals * bi->tuple_size; + len = bio_integrity_bytes(bi, bio_sectors(bio)); buf = kmalloc(len, GFP_NOIO); status = BLK_STS_RESOURCE; if (unlikely(buf == NULL)) { From cd1d83e24e689f25de7e34bea697971750138d5f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:26 -0700 Subject: [PATCH 011/113] block: tidy up the bio full checks in bio_add_hw_page bio_add_hw_page already checks if the number of bytes trying to be added even fits into the max_hw_sectors limit of the queue. Remove the call to bio_full and just do a check against the smaller of the number of segments in the bio and the queue max segments limit, and do this cheap check before the more expensive gap-to-previous check. Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Link: https://lore.kernel.org/r/20230724165433.117645-2-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/block/bio.c b/block/bio.c index 8672179213b93..72488ecea47ac 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1014,6 +1014,10 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page)) return len; + if (bio->bi_vcnt >= + min(bio->bi_max_vecs, queue_max_segments(q))) + return 0; + /* * If the queue doesn't support SG gaps and adding this segment * would create a gap, disallow it. @@ -1023,12 +1027,6 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, return 0; } - if (bio_full(bio, len)) - return 0; - - if (bio->bi_vcnt >= queue_max_segments(q)) - return 0; - bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset); bio->bi_vcnt++; bio->bi_iter.bi_size += len; From 6850b2dd5c25f27f7b74414553f047d4c12dd66c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:27 -0700 Subject: [PATCH 012/113] block: use SECTOR_SHIFT in bio_add_hw_page Use the SECTOR_SHIFT constant instead of the magic number 9. Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230724165433.117645-3-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index 72488ecea47ac..445be4bdd99bd 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1007,7 +1007,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; - if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors) + if (((bio->bi_iter.bi_size + len) >> SECTOR_SHIFT) > max_sectors) return 0; if (bio->bi_vcnt > 0) { From 939e1a370330841b2c0292a483d7b38f3ee45f88 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:28 -0700 Subject: [PATCH 013/113] block: move the BIO_CLONED checks out of __bio_try_merge_page __bio_try_merge_page is way too low-level a helper to assert that the bio is not cloned. Move the check into bio_add_page and bio_iov_iter_get_pages instead, which are the high level entry points that should enforce this invariant. bio_add_hw_page already has this check, covering the third (indirect) caller of __bio_try_merge_page. 
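To see what the moved check protects against, an illustrative sketch (not from the patch; bio_alloc_clone() and fs_bio_set are the stock kernel APIs):

	/*
	 * A clone shares the parent's bvec array (clone->bi_io_vec ==
	 * parent->bi_io_vec), so appending a page to the clone would
	 * grow bi_vcnt into vectors the parent still owns.
	 */
	struct bio *clone = bio_alloc_clone(bdev, parent, GFP_NOIO, &fs_bio_set);

	/* With this patch the high-level entry point fails early instead
	 * of corrupting the shared array: */
	if (bio_add_page(clone, page, len, 0) == 0)
		;	/* WARN_ON_ONCE(BIO_CLONED) fired, nothing added */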
Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230724165433.117645-4-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/block/bio.c b/block/bio.c index 445be4bdd99bd..3ac72e60e7f11 100644 --- a/block/bio.c +++ b/block/bio.c @@ -945,9 +945,6 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, static bool __bio_try_merge_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off, bool *same_page) { - if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) - return false; - if (bio->bi_vcnt > 0) { struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; @@ -1127,6 +1124,9 @@ int bio_add_page(struct bio *bio, struct page *page, { bool same_page = false; + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return 0; + if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { if (bio_full(bio, len)) return 0; @@ -1335,6 +1335,9 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { int ret = 0; + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return -EIO; + if (iov_iter_is_bvec(iter)) { bio_iov_bvec_set(bio, iter); iov_iter_advance(iter, bio->bi_iter.bi_size); From 0eca8b6f97ac705c5806f7d062207379094fb114 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:29 -0700 Subject: [PATCH 014/113] block: move the bi_vcnt check out of __bio_try_merge_page Move the bi_vcnt out of __bio_try_merge_page and into the two callers that don't already have it in preparation for additional changes to __bio_try_merge_page. Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230724165433.117645-5-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 45 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/block/bio.c b/block/bio.c index 3ac72e60e7f11..3dcbe98580dce 100644 --- a/block/bio.c +++ b/block/bio.c @@ -945,20 +945,17 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, static bool __bio_try_merge_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off, bool *same_page) { - if (bio->bi_vcnt > 0) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; - - if (page_is_mergeable(bv, page, len, off, same_page)) { - if (bio->bi_iter.bi_size > UINT_MAX - len) { - *same_page = false; - return false; - } - bv->bv_len += len; - bio->bi_iter.bi_size += len; - return true; - } + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (!page_is_mergeable(bv, page, len, off, same_page)) + return false; + if (bio->bi_iter.bi_size > UINT_MAX - len) { + *same_page = false; + return false; } - return false; + bv->bv_len += len; + bio->bi_iter.bi_size += len; + return true; } /* @@ -1127,11 +1124,13 @@ int bio_add_page(struct bio *bio, struct page *page, if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; - if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { - if (bio_full(bio, len)) - return 0; - __bio_add_page(bio, page, len, offset); - } + if (bio->bi_vcnt > 0 && + __bio_try_merge_page(bio, page, len, offset, &same_page)) + return len; + + if (bio_full(bio, len)) + return 0; + __bio_add_page(bio, page, len, offset); return len; } EXPORT_SYMBOL(bio_add_page); @@ -1205,13 +1204,13 @@ static int bio_iov_add_page(struct bio *bio, struct page *page, { bool same_page = false; - if (!__bio_try_merge_page(bio, page, 
len, offset, &same_page)) { - __bio_add_page(bio, page, len, offset); + if (bio->bi_vcnt > 0 && + __bio_try_merge_page(bio, page, len, offset, &same_page)) { + if (same_page) + bio_release_page(bio, page); return 0; } - - if (same_page) - bio_release_page(bio, page); + __bio_add_page(bio, page, len, offset); return 0; } From 613699050a49760f1d70c74f71bd0b013ca3c356 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:30 -0700 Subject: [PATCH 015/113] block: move the bi_size overflow check in __bio_try_merge_page Checking for availability in bi_size in a function that attempts to merge into an existing segment is a bit odd, as the limit also applies when adding a new segment. This code works fine as we always call __bio_try_merge_page, but contributes to sub-optimal calling conventions and doesn't lead to clear code. Move it to two of the callers instead, the third one already has a more strict check that includes max_hw_segments anyway. Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230724165433.117645-6-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/block/bio.c b/block/bio.c index 3dcbe98580dce..17f57fd2cff2f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -949,10 +949,6 @@ static bool __bio_try_merge_page(struct bio *bio, struct page *page, if (!page_is_mergeable(bv, page, len, off, same_page)) return false; - if (bio->bi_iter.bi_size > UINT_MAX - len) { - *same_page = false; - return false; - } bv->bv_len += len; bio->bi_iter.bi_size += len; return true; @@ -1123,6 +1119,8 @@ int bio_add_page(struct bio *bio, struct page *page, if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; + if (bio->bi_iter.bi_size > UINT_MAX - len) + return 0; if (bio->bi_vcnt > 0 && __bio_try_merge_page(bio, page, len, offset, &same_page)) @@ -1204,6 +1202,9 @@ static int bio_iov_add_page(struct bio *bio, struct page *page, { bool same_page = false; + if (WARN_ON_ONCE(bio->bi_iter.bi_size > UINT_MAX - len)) + return -EIO; + if (bio->bi_vcnt > 0 && __bio_try_merge_page(bio, page, len, offset, &same_page)) { if (same_page) From 80232b520314214d846eb0a65faef8b51b702fa7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:31 -0700 Subject: [PATCH 016/113] block: downgrade a bio_full call in bio_add_page bio_add_page already checks that there is space in bi_size a little earlier. So after we failed to add to an existing segment, just check that there is another one available instead of duplicating the bi_size check. 
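For context (not part of the diff), bio_full() as defined in block/bio.c performs exactly two checks, and bio_add_page() has already done the bi_size half a few lines earlier, so only the vector-count half is still needed:

	static inline bool bio_full(struct bio *bio, unsigned len)
	{
		/* no free slot left in the bvec array? */
		if (bio->bi_vcnt >= bio->bi_max_vecs)
			return true;
		/* would adding len bytes overflow bi_size? */
		if (bio->bi_iter.bi_size > UINT_MAX - len)
			return true;
		return false;
	}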
Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Link: https://lore.kernel.org/r/20230724165433.117645-7-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index 17f57fd2cff2f..d8e0e8de8cf4f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1126,7 +1126,7 @@ int bio_add_page(struct bio *bio, struct page *page, __bio_try_merge_page(bio, page, len, offset, &same_page)) return len; - if (bio_full(bio, len)) + if (bio->bi_vcnt >= bio->bi_max_vecs) return 0; __bio_add_page(bio, page, len, offset); return len; From 858c708d9efb7e8e5c6320793b778cc17cf8368a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:32 -0700 Subject: [PATCH 017/113] block: move the bi_size update out of __bio_try_merge_page The update of bi_size is the only thing in __bio_try_merge_page that needs a bio. Move it to the callers, and merge __bio_try_merge_page and page_is_mergeable into a single bvec_try_merge_page that only takes the current bvec instead of a full bio. This will allow reusing this function for supporting multi-page integrity payload bvecs. Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Link: https://lore.kernel.org/r/20230724165433.117645-8-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 57 +++++++++++++++++++---------------------------------- 1 file changed, 20 insertions(+), 37 deletions(-) diff --git a/block/bio.c b/block/bio.c index d8e0e8de8cf4f..23b7a001b5005 100644 --- a/block/bio.c +++ b/block/bio.c @@ -903,9 +903,8 @@ static inline bool bio_full(struct bio *bio, unsigned len) return false; } -static inline bool page_is_mergeable(const struct bio_vec *bv, - struct page *page, unsigned int len, unsigned int off, - bool *same_page) +static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, + unsigned int len, unsigned int off, bool *same_page) { size_t bv_end = bv->bv_offset + bv->bv_len; phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1; @@ -919,38 +918,14 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, return false; *same_page = ((vec_end_addr & PAGE_MASK) == page_addr); - if (*same_page) - return true; - else if (IS_ENABLED(CONFIG_KMSAN)) - return false; - return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE); -} - -/** - * __bio_try_merge_page - try appending data to an existing bvec. - * @bio: destination bio - * @page: start page to add - * @len: length of the data to add - * @off: offset of the data relative to @page - * @same_page: return if the segment has been merged inside the same page - * - * Try to add the data at @page + @off to the last bvec of @bio. This is a - * useful optimisation for file systems with a block size smaller than the - * page size. - * - * Warn if (@len, @off) crosses pages in case that @same_page is true. - * - * Return %true on success or %false on failure. 
- */ -static bool __bio_try_merge_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int off, bool *same_page) -{ - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + if (!*same_page) { + if (IS_ENABLED(CONFIG_KMSAN)) + return false; + if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE) + return false; + } - if (!page_is_mergeable(bv, page, len, off, same_page)) - return false; bv->bv_len += len; - bio->bi_iter.bi_size += len; return true; } @@ -972,7 +947,7 @@ static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio, return false; if (bv->bv_len + len > queue_max_segment_size(q)) return false; - return __bio_try_merge_page(bio, page, len, offset, same_page); + return bvec_try_merge_page(bv, page, len, offset, same_page); } /** @@ -1001,8 +976,11 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, return 0; if (bio->bi_vcnt > 0) { - if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page)) + if (bio_try_merge_hw_seg(q, bio, page, len, offset, + same_page)) { + bio->bi_iter.bi_size += len; return len; + } if (bio->bi_vcnt >= min(bio->bi_max_vecs, queue_max_segments(q))) @@ -1123,8 +1101,11 @@ int bio_add_page(struct bio *bio, struct page *page, return 0; if (bio->bi_vcnt > 0 && - __bio_try_merge_page(bio, page, len, offset, &same_page)) + bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], + page, len, offset, &same_page)) { + bio->bi_iter.bi_size += len; return len; + } if (bio->bi_vcnt >= bio->bi_max_vecs) return 0; @@ -1206,7 +1187,9 @@ static int bio_iov_add_page(struct bio *bio, struct page *page, return -EIO; if (bio->bi_vcnt > 0 && - __bio_try_merge_page(bio, page, len, offset, &same_page)) { + bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], + page, len, offset, &same_page)) { + bio->bi_iter.bi_size += len; if (same_page) bio_release_page(bio, page); return 0; From ae42f0b3bf65912e122fc2e8d5f6d94b51156dba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:33 -0700 Subject: [PATCH 018/113] block: don't pass a bio to bio_try_merge_hw_seg There is no good reason to pass the bio to bio_try_merge_hw_seg. Just pass the current bvec and rename the function to bvec_try_merge_hw_page. This will allow reusing this function for supporting multi-page integrity payload bvecs. Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Link: https://lore.kernel.org/r/20230724165433.117645-9-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/block/bio.c b/block/bio.c index 23b7a001b5005..c92dda962449b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -934,11 +934,10 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, * size limit. This is not for normal read/write bios, but for passthrough * or Zone Append operations that we can't split. 
*/ -static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio, - struct page *page, unsigned len, - unsigned offset, bool *same_page) +static bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, + struct page *page, unsigned len, unsigned offset, + bool *same_page) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; unsigned long mask = queue_segment_boundary(q); phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset; phys_addr_t addr2 = page_to_phys(page) + offset + len - 1; @@ -967,8 +966,6 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page) { - struct bio_vec *bvec; - if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; @@ -976,7 +973,9 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, return 0; if (bio->bi_vcnt > 0) { - if (bio_try_merge_hw_seg(q, bio, page, len, offset, + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (bvec_try_merge_hw_page(q, bv, page, len, offset, same_page)) { bio->bi_iter.bi_size += len; return len; @@ -990,8 +989,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, * If the queue doesn't support SG gaps and adding this segment * would create a gap, disallow it. */ - bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; - if (bvec_gap_to_prev(&q->limits, bvec, offset)) + if (bvec_gap_to_prev(&q->limits, bv, offset)) return 0; } From b5ca9acff553874aaf1faf176e076cbd7cc4aa0e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 21 Jul 2023 10:27:28 -0700 Subject: [PATCH 019/113] scsi: Inline scsi_kick_queue() Inline scsi_kick_queue() to prepare for modifying the second argument passed to blk_mq_run_hw_queues(). Reviewed-by: Christoph Hellwig Cc: "Martin K. Petersen" Signed-off-by: Bart Van Assche Reviewed-by: "Martin K. Petersen" Link: https://lore.kernel.org/r/20230721172731.955724-2-bvanassche@acm.org Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index ad9afae49544a..414d29eef9686 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -300,11 +300,6 @@ void scsi_device_unbusy(struct scsi_device *sdev, struct scsi_cmnd *cmd) cmd->budget_token = -1; } -static void scsi_kick_queue(struct request_queue *q) -{ - blk_mq_run_hw_queues(q, false); -} - /* * Kick the queue of SCSI device @sdev if @sdev != current_sdev. Called with * interrupts disabled. @@ -340,7 +335,7 @@ static void scsi_single_lun_run(struct scsi_device *current_sdev) * but in most cases, we will be first. Ideally, each LU on the * target would get some limited time or requests on the target. */ - scsi_kick_queue(current_sdev->request_queue); + blk_mq_run_hw_queues(current_sdev->request_queue, false); spin_lock_irqsave(shost->host_lock, flags); if (!starget->starget_sdev_user) @@ -427,7 +422,7 @@ static void scsi_starved_list_run(struct Scsi_Host *shost) continue; spin_unlock_irqrestore(shost->host_lock, flags); - scsi_kick_queue(slq); + blk_mq_run_hw_queues(slq, false); blk_put_queue(slq); spin_lock_irqsave(shost->host_lock, flags); From d42e2e3448a99c41c8489766eeb732d8d741d5be Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 21 Jul 2023 10:27:29 -0700 Subject: [PATCH 020/113] scsi: Remove a blk_mq_run_hw_queues() call blk_mq_kick_requeue_list() calls blk_mq_run_hw_queues() asynchronously. Leave out the direct blk_mq_run_hw_queues() call. 
This patch causes scsi_run_queue() to call blk_mq_run_hw_queues() asynchronously instead of synchronously. Since scsi_run_queue() is not called from the hot I/O submission path, this patch does not affect the hot path. This patch prepares for allowing blk_mq_run_hw_queue() to sleep if BLK_MQ_F_BLOCKING has been set. scsi_run_queue() may be called from atomic context and must not sleep. Hence the removal of the blk_mq_run_hw_queues(q, false) call. See also scsi_unblock_requests(). Cc: "Martin K. Petersen" Signed-off-by: Bart Van Assche Reviewed-by: "Martin K. Petersen" Link: https://lore.kernel.org/r/20230721172731.955724-3-bvanassche@acm.org Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 414d29eef9686..d4c514ab9fe87 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -447,8 +447,8 @@ static void scsi_run_queue(struct request_queue *q) if (!list_empty(&sdev->host->starved_list)) scsi_starved_list_run(sdev->host); + /* Note: blk_mq_kick_requeue_list() runs the queue asynchronously. */ blk_mq_kick_requeue_list(q); - blk_mq_run_hw_queues(q, false); } void scsi_requeue_run_queue(struct work_struct *work) From 65a558f66c308251e256317957b75d1e643c33c3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 21 Jul 2023 10:27:30 -0700 Subject: [PATCH 021/113] block: Improve performance for BLK_MQ_F_BLOCKING drivers blk_mq_run_hw_queue() runs the queue asynchronously if BLK_MQ_F_BLOCKING has been set. This is suboptimal since running the queue asynchronously is slower than running the queue synchronously. This patch modifies blk_mq_run_hw_queue() as follows if BLK_MQ_F_BLOCKING has been set: - Run the queue synchronously if it is allowed to sleep. - Run the queue asynchronously if it is not allowed to sleep. Additionally, blk_mq_run_hw_queue(hctx, false) calls are modified into blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING) if the caller may be invoked from atomic context. 
The following caller chains have been reviewed: blk_mq_run_hw_queue(hctx, false) blk_mq_get_tag() /* may sleep, hence the functions it calls may also sleep */ blk_execute_rq() /* may sleep */ blk_mq_run_hw_queues(q, async=false) blk_freeze_queue_start() /* may sleep */ blk_mq_requeue_work() /* may sleep */ scsi_kick_queue() scsi_requeue_run_queue() /* may sleep */ scsi_run_host_queues() scsi_ioctl_reset() /* may sleep */ blk_mq_insert_requests(hctx, ctx, list, run_queue_async=false) blk_mq_dispatch_plug_list(plug, from_sched=false) blk_mq_flush_plug_list(plug, from_schedule=false) __blk_flush_plug(plug, from_schedule=false) blk_add_rq_to_plug() blk_mq_submit_bio() /* may sleep if REQ_NOWAIT has not been set */ blk_mq_plug_issue_direct() blk_mq_flush_plug_list() /* see above */ blk_mq_dispatch_plug_list(plug, from_sched=false) blk_mq_flush_plug_list() /* see above */ blk_mq_try_issue_directly() blk_mq_submit_bio() /* may sleep if REQ_NOWAIT has not been set */ blk_mq_try_issue_list_directly(hctx, list) blk_mq_insert_requests() /* see above */ Cc: Christoph Hellwig Cc: Ming Lei Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20230721172731.955724-4-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 16 ++++++++++------ drivers/scsi/scsi_lib.c | 3 ++- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index d986548696151..687ec3f4f10d2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1323,7 +1323,7 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head) } blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); - blk_mq_run_hw_queue(hctx, false); + blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); @@ -2222,6 +2222,8 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) */ WARN_ON_ONCE(!async && in_interrupt()); + might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING); + /* * When queue is quiesced, we may be switching io scheduler, or * updating nr_hw_queues, or other things, and we can't run queue @@ -2237,8 +2239,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) if (!need_run) return; - if (async || (hctx->flags & BLK_MQ_F_BLOCKING) || - !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { + if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { blk_mq_delay_run_hw_queue(hctx, 0); return; } @@ -2373,7 +2374,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) { clear_bit(BLK_MQ_S_STOPPED, &hctx->state); - blk_mq_run_hw_queue(hctx, false); + blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); } EXPORT_SYMBOL(blk_mq_start_hw_queue); @@ -2403,7 +2404,8 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) unsigned long i; queue_for_each_hw_ctx(q, hctx, i) - blk_mq_start_stopped_hw_queue(hctx, async); + blk_mq_start_stopped_hw_queue(hctx, async || + (hctx->flags & BLK_MQ_F_BLOCKING)); } EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); @@ -2461,6 +2463,8 @@ static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, list_for_each_entry(rq, list, queuelist) { BUG_ON(rq->mq_ctx != ctx); trace_block_rq_insert(rq); + if (rq->cmd_flags & REQ_NOWAIT) + run_queue_async = true; } spin_lock(&ctx->lock); @@ -2621,7 +2625,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) { blk_mq_insert_request(rq, 0); - blk_mq_run_hw_queue(hctx, false); + blk_mq_run_hw_queue(hctx, 
rq->cmd_flags & REQ_NOWAIT); return; } diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index d4c514ab9fe87..59176946ab560 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -335,7 +335,8 @@ static void scsi_single_lun_run(struct scsi_device *current_sdev) * but in most cases, we will be first. Ideally, each LU on the * target would get some limited time or requests on the target. */ - blk_mq_run_hw_queues(current_sdev->request_queue, false); + blk_mq_run_hw_queues(current_sdev->request_queue, + shost->queuecommand_may_block); spin_lock_irqsave(shost->host_lock, flags); if (!starget->starget_sdev_user) From 51d74ec9b62f5813767a60226acaf943e26e7d7a Mon Sep 17 00:00:00 2001 From: Jinyoung Choi Date: Tue, 25 Jul 2023 14:18:39 +0900 Subject: [PATCH 022/113] block: cleanup bio_integrity_prep If a problem occurs in the process of creating an integrity payload, the status of the bio is always BLK_STS_RESOURCE. Reviewed-by: Christoph Hellwig Signed-off-by: Jinyoung Choi Reviewed-by: "Martin K. Petersen" Link: https://lore.kernel.org/r/20230725051839epcms2p8e4d20ad6c51326ad032e8406f59d0aaa@epcms2p8 Signed-off-by: Jens Axboe --- block/bio-integrity.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 8f0af7ac8573b..045553a164e0c 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -199,7 +199,6 @@ bool bio_integrity_prep(struct bio *bio) unsigned long start, end; unsigned int len, nr_pages; unsigned int bytes, offset, i; - blk_status_t status; if (!bi) return true; @@ -227,7 +226,6 @@ bool bio_integrity_prep(struct bio *bio) /* Allocate kernel buffer for protection data */ len = bio_integrity_bytes(bi, bio_sectors(bio)); buf = kmalloc(len, GFP_NOIO); - status = BLK_STS_RESOURCE; if (unlikely(buf == NULL)) { printk(KERN_ERR "could not allocate integrity buffer\n"); goto err_end_io; @@ -242,7 +240,6 @@ bool bio_integrity_prep(struct bio *bio) if (IS_ERR(bip)) { printk(KERN_ERR "could not allocate data integrity bioset\n"); kfree(buf); - status = BLK_STS_RESOURCE; goto err_end_io; } @@ -270,7 +267,6 @@ bool bio_integrity_prep(struct bio *bio) if (ret == 0) { printk(KERN_ERR "could not attach integrity payload\n"); - status = BLK_STS_RESOURCE; goto err_end_io; } @@ -292,7 +288,7 @@ bool bio_integrity_prep(struct bio *bio) return true; err_end_io: - bio->bi_status = status; + bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); return false; From a865b96c513bcaeec49669010d67c40aa8e58619 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 May 2023 21:20:32 +0800 Subject: [PATCH 023/113] Revert "md: unlock mddev before reap sync_thread in action_store" This reverts commit 9dfbdafda3b34e262e43e786077bab8e476a89d1, because it introduces a defect where sync_thread can still be running while MD_RECOVERY_RUNNING is cleared, which causes unexpected problems, for example the following list_add corruption:

list_add corruption. prev->next should be next (ffff0001ac1daba0), but was ffff0000ce1a02a0. (prev=ffff0000ce1a02a0).

Call trace:
 __list_add_valid+0xfc/0x140
 insert_work+0x78/0x1a0
 __queue_work+0x500/0xcf4
 queue_work_on+0xe8/0x12c
 md_check_recovery+0xa34/0xf30
 raid10d+0xb8/0x900 [raid10]
 md_thread+0x16c/0x2cc
 kthread+0x1a4/0x1ec
 ret_from_fork+0x10/0x18

This is because the work is requeued while it is still inside the workqueue:

t1:                              t2:
action_store
 mddev_lock
 if (mddev->sync_thread)
  mddev_unlock
  md_unregister_thread
  // first sync_thread is done
                                 md_check_recovery
                                  mddev_try_lock
                                  /*
                                   * once MD_RECOVERY_DONE is set, new sync_thread
                                   * can start.
                                   */
                                  set_bit(MD_RECOVERY_RUNNING, &mddev->recovery)
                                  INIT_WORK(&mddev->del_work, md_start_sync)
                                  queue_work(md_misc_wq, &mddev->del_work)
                                   test_and_set_bit(WORK_STRUCT_PENDING_BIT, ...)
                                   // set pending bit
                                   insert_work
                                    list_add_tail
  mddev_unlock
 mddev_lock_nointr
 md_reap_sync_thread
 // MD_RECOVERY_RUNNING is cleared
 mddev_unlock

t3:
// before the work queued from t2 has started
md_check_recovery
// MD_RECOVERY_RUNNING is not set, a new sync_thread can be started
INIT_WORK(&mddev->del_work, md_start_sync)
 work->data = 0
 // work pending bit is cleared
 queue_work(md_misc_wq, &mddev->del_work)
  insert_work
   list_add_tail
// list is corrupted

The above commit is reverted to fix the problem; the deadlock it originally addressed will be fixed in the following patches. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230529132037.2124527-2-yukuai1@huaweicloud.com --- drivers/md/dm-raid.c | 1 - drivers/md/md.c | 19 ++----------------- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 8846bf510a35e..1f22bef278418 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3725,7 +3725,6 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); } } else if (decipher_sync_action(mddev, mddev->recovery) != st_idle) diff --git a/drivers/md/md.c b/drivers/md/md.c index 2e38ef421d69f..d445d5fb7a01c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4764,19 +4764,6 @@ action_store(struct mddev *mddev, const char *page, size_t len) if (work_pending(&mddev->del_work)) flush_workqueue(md_misc_wq); if (mddev->sync_thread) { - sector_t save_rp = mddev->reshape_position; - - mddev_unlock(mddev); - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(&mddev->sync_thread); - mddev_lock_nointr(mddev); - /* - * set RECOVERY_INTR again and restore reshape - * position in case others changed them after - * got lock, eg, reshape_position_store and - * md_check_recovery. 
- */ - mddev->reshape_position = save_rp; set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_reap_sync_thread(mddev); } @@ -6176,7 +6163,6 @@ static void __md_stop_writes(struct mddev *mddev) flush_workqueue(md_misc_wq); if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); } @@ -9327,7 +9313,6 @@ void md_check_recovery(struct mddev *mddev) * ->spare_active and clear saved_raid_disk */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -9363,7 +9348,6 @@ void md_check_recovery(struct mddev *mddev) goto unlock; } if (mddev->sync_thread) { - md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); goto unlock; } @@ -9443,7 +9427,8 @@ void md_reap_sync_thread(struct mddev *mddev) sector_t old_dev_sectors = mddev->dev_sectors; bool is_reshaped = false; - /* sync_thread should be unregistered, collect result */ + /* resync has finished, collect result */ + md_unregister_thread(&mddev->sync_thread); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && mddev->degraded != mddev->raid_disks) { From 64e5e09afc14f8cc9058b0ed5c9cc4c8cd126b85 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 May 2023 21:20:33 +0800 Subject: [PATCH 024/113] md: refactor action_store() for 'idle' and 'frozen' Prepare to handle 'idle' and 'frozen' differently to fix a deadlock, there are no functional changes except that MD_RECOVERY_RUNNING is checked again after 'reconfig_mutex' is held. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230529132037.2124527-3-yukuai1@huaweicloud.com --- drivers/md/md.c | 61 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 16 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index d445d5fb7a01c..7fa91f0e56202 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4747,6 +4747,46 @@ action_show(struct mddev *mddev, char *page) return sprintf(page, "%s\n", type); } +static void stop_sync_thread(struct mddev *mddev) +{ + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return; + + if (mddev_lock(mddev)) + return; + + /* + * Check again in case MD_RECOVERY_RUNNING is cleared before lock is + * held. 
+ */ + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { + mddev_unlock(mddev); + return; + } + + if (work_pending(&mddev->del_work)) + flush_workqueue(md_misc_wq); + + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_reap_sync_thread(mddev); + } + + mddev_unlock(mddev); +} + +static void idle_sync_thread(struct mddev *mddev) +{ + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + stop_sync_thread(mddev); +} + +static void frozen_sync_thread(struct mddev *mddev) +{ + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + stop_sync_thread(mddev); +} + static ssize_t action_store(struct mddev *mddev, const char *page, size_t len) { @@ -4754,22 +4794,11 @@ action_store(struct mddev *mddev, const char *page, size_t len) return -EINVAL; - if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { - if (cmd_match(page, "frozen")) - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - else - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && - mddev_lock(mddev) == 0) { - if (work_pending(&mddev->del_work)) - flush_workqueue(md_misc_wq); - if (mddev->sync_thread) { - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_reap_sync_thread(mddev); - } - mddev_unlock(mddev); - } - } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + if (cmd_match(page, "idle")) + idle_sync_thread(mddev); + else if (cmd_match(page, "frozen")) + frozen_sync_thread(mddev); + else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; else if (cmd_match(page, "resync")) clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); From 6f56f0c4f1241f1694a6a9438dd4f78d4513a917 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 May 2023 21:20:34 +0800 Subject: [PATCH 025/113] md: add a mutex to synchronize idle and frozen in action_store() Currently, for idle and frozen, action_store will hold 'reconfig_mutex' and call md_reap_sync_thread() to stop sync thread, however, this will cause deadlock (explained in the next patch). In order to fix the problem, following patch will release 'reconfig_mutex' and wait on 'resync_wait', like md_set_readonly() and do_md_stop() does. Consider that action_store() will set/clear 'MD_RECOVERY_FROZEN' unconditionally, which might cause unexpected problems, for example, frozen just set 'MD_RECOVERY_FROZEN' and is still in progress, while 'idle' clear 'MD_RECOVERY_FROZEN' and new sync thread is started, which might starve in progress frozen. A mutex is added to synchronize idle and frozen from action_store(). 
Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230529132037.2124527-4-yukuai1@huaweicloud.com --- drivers/md/md.c | 5 +++++ drivers/md/md.h | 3 +++ 2 files changed, 8 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 7fa91f0e56202..3d7e87cab8ad6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -643,6 +643,7 @@ void mddev_init(struct mddev *mddev) { mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); + mutex_init(&mddev->sync_mutex); mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->all_mddevs); @@ -4777,14 +4778,18 @@ static void stop_sync_thread(struct mddev *mddev) static void idle_sync_thread(struct mddev *mddev) { + mutex_lock(&mddev->sync_mutex); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); stop_sync_thread(mddev); + mutex_unlock(&mddev->sync_mutex); } static void frozen_sync_thread(struct mddev *mddev) { + mutex_lock(&mddev->sync_mutex); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); stop_sync_thread(mddev); + mutex_unlock(&mddev->sync_mutex); } static ssize_t diff --git a/drivers/md/md.h b/drivers/md/md.h index 1aef86bf3fc31..18c168bf5fabe 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -535,6 +535,9 @@ struct mddev { */ struct list_head deleting; + /* Used to synchronize idle and frozen for action_store() */ + struct mutex sync_mutex; + bool has_superblocks:1; bool fail_last_dev:1; bool serialize_policy:1; From 130443d60b1b8c7a609a2af3384dd8e60df97181 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 May 2023 21:20:35 +0800 Subject: [PATCH 026/113] md: refactor idle/frozen_sync_thread() to fix deadlock Our test found the following deadlock in raid10: 1) Issue a normal write, and such write failed: raid10_end_write_request set_bit(R10BIO_WriteError, &r10_bio->state) one_write_done reschedule_retry // later from md thread raid10d handle_write_completed list_add(&r10_bio->retry_list, &conf->bio_end_io_list) // later from md thread raid10d if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) list_move(conf->bio_end_io_list.prev, &tmp) r10_bio = list_first_entry(&tmp, struct r10bio, retry_list) raid_end_bio_io(r10_bio) Dependency chain 1: normal io is waiting for updating superblock 2) Trigger a recovery: raid10_sync_request raise_barrier Dependency chain 2: sync thread is waiting for normal io 3) echo idle/frozen to sync_action: action_store mddev_lock md_unregister_thread kthread_stop Dependency chain 3: drop 'reconfig_mutex' is waiting for sync thread 4) md thread can't update superblock: raid10d md_check_recovery if (mddev_trylock(mddev)) md_update_sb Dependency chain 4: update superblock is waiting for 'reconfig_mutex' Hence a cyclic dependency exists; in order to fix the problem, we must break one of the chains. Dependencies 1 and 2 can't be broken because they are fundamental to the design. Breaking dependency 4 may be possible if it can be guaranteed that no io is inflight; however, this requires a new mechanism which seems complex. Breaking dependency 3 is a good choice, because idle/frozen only requires the sync thread to finish, which can be done asynchronously and is already implemented, and 'reconfig_mutex' is not needed anymore. This patch switches 'idle' and 'frozen' to waiting for the sync thread to be done asynchronously, and also adds a sequence counter to record how many times the sync thread has been done, so that 'idle' won't keep waiting on a newly started sync thread. Note that raid456 has a similar deadlock ([1]), and it's verified [2] that this deadlock can be fixed by this patch as well.
[1] https://lore.kernel.org/linux-raid/5ed54ffc-ce82-bf66-4eff-390cb23bc1ac@molgen.mpg.de/T/#t [2] https://lore.kernel.org/linux-raid/e9067438-d713-f5f3-0d3d-9e6b0e9efa0e@huaweicloud.com/ Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230529132037.2124527-5-yukuai1@huaweicloud.com --- drivers/md/md.c | 23 +++++++++++++++++++---- drivers/md/md.h | 2 ++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 3d7e87cab8ad6..920701ab95054 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -651,6 +651,7 @@ void mddev_init(struct mddev *mddev) timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); + atomic_set(&mddev->sync_seq, 0); spin_lock_init(&mddev->lock); atomic_set(&mddev->flush_pending, 0); init_waitqueue_head(&mddev->sb_wait); @@ -4768,19 +4769,27 @@ static void stop_sync_thread(struct mddev *mddev) if (work_pending(&mddev->del_work)) flush_workqueue(md_misc_wq); - if (mddev->sync_thread) { - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_reap_sync_thread(mddev); - } + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + /* + * Thread might be blocked waiting for metadata update which will now + * never happen + */ + md_wakeup_thread_directly(mddev->sync_thread); mddev_unlock(mddev); } static void idle_sync_thread(struct mddev *mddev) { + int sync_seq = atomic_read(&mddev->sync_seq); + mutex_lock(&mddev->sync_mutex); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); stop_sync_thread(mddev); + + wait_event(resync_wait, sync_seq != atomic_read(&mddev->sync_seq) || + !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); + mutex_unlock(&mddev->sync_mutex); } @@ -4789,6 +4798,10 @@ static void frozen_sync_thread(struct mddev *mddev) mutex_lock(&mddev->sync_mutex); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); stop_sync_thread(mddev); + + wait_event(resync_wait, mddev->sync_thread == NULL && + !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); + mutex_unlock(&mddev->sync_mutex); } @@ -9463,6 +9476,8 @@ void md_reap_sync_thread(struct mddev *mddev) /* resync has finished, collect result */ md_unregister_thread(&mddev->sync_thread); + atomic_inc(&mddev->sync_seq); + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && mddev->degraded != mddev->raid_disks) { diff --git a/drivers/md/md.h b/drivers/md/md.h index 18c168bf5fabe..914e6ece9af29 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -537,6 +537,8 @@ struct mddev { /* Used to synchronize idle and frozen for action_store() */ struct mutex sync_mutex; + /* The sequence number for sync thread */ + atomic_t sync_seq; bool has_superblocks:1; bool fail_last_dev:1; From 753260ed0b46d2ba0d3d6f68a6a49187bff443e4 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 May 2023 21:20:36 +0800 Subject: [PATCH 027/113] md: wake up 'resync_wait' at last in md_reap_sync_thread() md_reap_sync_thread() has just been replaced with wait_event(resync_wait, ...) in action_store(); make sure action_store() will still wait for everything to be done in md_reap_sync_thread().
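Condensed from the previous patch in this series, the waiter/waker pairing that this ordering protects looks roughly like this:

  /* action_store() side, e.g. frozen_sync_thread() */
  wait_event(resync_wait, mddev->sync_thread == NULL &&
             !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));

  /* md_reap_sync_thread() side */
  md_unregister_thread(&mddev->sync_thread);
  ...
  clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
  ...                        /* sysfs notifications, event work */
  wake_up(&resync_wait);     /* now the very last step */

Since the waiter's condition is already true once MD_RECOVERY_RUNNING is cleared, an earlier wake_up() could let action_store() return before the remaining cleanup and notifications have run.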
Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230529132037.2124527-6-yukuai1@huaweicloud.com --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 920701ab95054..e0d8e751a7828 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9522,7 +9522,6 @@ void md_reap_sync_thread(struct mddev *mddev) if (mddev_is_clustered(mddev) && is_reshaped && !test_bit(MD_CLOSING, &mddev->flags)) md_cluster_ops->update_size(mddev, old_dev_sectors); - wake_up(&resync_wait); /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); sysfs_notify_dirent_safe(mddev->sysfs_completed); @@ -9530,6 +9529,7 @@ void md_reap_sync_thread(struct mddev *mddev) md_new_event(); if (mddev->event_work.func) queue_work(md_misc_wq, &mddev->event_work); + wake_up(&resync_wait); } EXPORT_SYMBOL(md_reap_sync_thread); From f71209b1f21c838a973d858d9f6f76cd39227733 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 May 2023 21:20:37 +0800 Subject: [PATCH 028/113] md: enhance checking in md_check_recovery() For md_check_recovery(): 1) if 'MD_RECOVERY_RUNNING' is not set, register new sync_thread. 2) if 'MD_RECOVERY_RUNNING' is set: a) if 'MD_RECOVERY_DONE' is not set, don't do anything, wait for md_do_sync() to be done. b) if 'MD_RECOVERY_DONE' is set, unregister sync_thread. Current code expects that sync_thread is not NULL, otherwise a new sync_thread will be registered, which will corrupt the array. Make sure md_check_recovery() won't register a new sync_thread if 'MD_RECOVERY_RUNNING' is still set, and a new WARN_ON_ONCE() is added for the above corruption. Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230529132037.2124527-7-yukuai1@huaweicloud.com --- drivers/md/md.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index e0d8e751a7828..320d71537359a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9388,16 +9388,24 @@ void md_check_recovery(struct mddev *mddev) if (mddev->sb_flags) md_update_sb(mddev, 0); - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && - !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { - /* resync/recovery still happening */ - clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - goto unlock; - } - if (mddev->sync_thread) { + /* + * Never start a new sync thread if MD_RECOVERY_RUNNING is + * still set. + */ + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { + if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { + /* resync/recovery still happening */ + clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + goto unlock; + } + + if (WARN_ON_ONCE(!mddev->sync_thread)) + goto unlock; + md_reap_sync_thread(mddev); goto unlock; } + /* Set RUNNING before clearing NEEDED to avoid * any transients in the value of "sync_action". */ From 59cefee75bda5d4cc14f4a1ca861b69091e22c3e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Jun 2023 08:48:30 +0200 Subject: [PATCH 029/113] md-bitmap: set BITMAP_WRITE_ERROR in write_sb_page Set BITMAP_WRITE_ERROR directly in write_sb_page instead of propagating the error to the caller and setting it there.
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Himanshu Madhani Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230615064840.629492-2-hch@lst.de --- drivers/md/md-bitmap.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 1ff712889a3b3..d8469720fac23 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -279,22 +279,20 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, return 0; } -static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) +static void write_sb_page(struct bitmap *bitmap, struct page *page, int wait) { - struct md_rdev *rdev; struct mddev *mddev = bitmap->mddev; - int ret; do { - rdev = NULL; + struct md_rdev *rdev = NULL; + while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { - ret = __write_sb_page(rdev, bitmap, page); - if (ret) - return ret; + if (__write_sb_page(rdev, bitmap, page) < 0) { + set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); + return; + } } } while (wait && md_super_wait(mddev) < 0); - - return 0; } static void md_bitmap_file_kick(struct bitmap *bitmap); @@ -306,10 +304,7 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait) struct buffer_head *bh; if (bitmap->storage.file == NULL) { - switch (write_sb_page(bitmap, page, wait)) { - case -EINVAL: - set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); - } + write_sb_page(bitmap, page, wait); } else { bh = page_buffers(page); From 546ac0b2e2b15d0af7e6d10506558dded1d9d54a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Jun 2023 08:48:31 +0200 Subject: [PATCH 030/113] md-bitmap: initialize variables at declaration time in md_bitmap_file_unmap Just a small tidyup to prepare for bigger changes. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Himanshu Madhani Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230615064840.629492-3-hch@lst.de --- drivers/md/md-bitmap.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index d8469720fac23..0b2d8933cbc75 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -842,14 +842,10 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store, static void md_bitmap_file_unmap(struct bitmap_storage *store) { - struct page **map, *sb_page; - int pages; - struct file *file; - - file = store->file; - map = store->filemap; - pages = store->file_pages; - sb_page = store->sb_page; + struct file *file = store->file; + struct page *sb_page = store->sb_page; + struct page **map = store->filemap; + int pages = store->file_pages; while (pages--) if (map[pages] != sb_page) /* 0 is sb_page, release it below */ From 92348518f23f4fc81caa1a0f7f587566db67b52f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Jun 2023 08:48:32 +0200 Subject: [PATCH 031/113] md-bitmap: use %pD to print the file name in md_bitmap_file_kick Don't bother allocating an extra buffer in the I/O failure handler and instead use the printk built-in format to print the last 4 path name components. 
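For reference, %pD is the printk format for printing the path of a struct file, and a trailing digit limits the output to that many trailing path components. A hypothetical invocation, with an invented file path, to show the effect:

  pr_warn("%s: kicking failed bitmap file %pD4 from array!\n",
          bmname(bitmap), bitmap->storage.file);

  /* could print something like:
   *   md0: kicking failed bitmap file /mnt/disk/md/bitmap from array!
   * where only the last 4 components of a deeper path are shown
   */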
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Himanshu Madhani Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230615064840.629492-4-hch@lst.de --- drivers/md/md-bitmap.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 0b2d8933cbc75..e4b466522d4e7 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -870,21 +870,13 @@ static void md_bitmap_file_unmap(struct bitmap_storage *store) */ static void md_bitmap_file_kick(struct bitmap *bitmap) { - char *path, *ptr = NULL; - if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) { md_bitmap_update_sb(bitmap); if (bitmap->storage.file) { - path = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (path) - ptr = file_path(bitmap->storage.file, - path, PAGE_SIZE); - - pr_warn("%s: kicking failed bitmap file %s from array!\n", - bmname(bitmap), IS_ERR(ptr) ? "" : ptr); + pr_warn("%s: kicking failed bitmap file %pD4 from array!\n", + bmname(bitmap), bitmap->storage.file); - kfree(path); } else pr_warn("%s: disabling internal bitmap due to errors\n", bmname(bitmap)); From 5339178e5303084da7655874d1aa69a0572c9b79 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Jun 2023 08:48:33 +0200 Subject: [PATCH 032/113] md-bitmap: split file writes into a separate helper Split the file write code out of write_page into a separate helper. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Himanshu Madhani Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230615064840.629492-5-hch@lst.de --- drivers/md/md-bitmap.c | 48 +++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index e4b466522d4e7..46fbcfc9d1fca 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -296,33 +296,22 @@ static void write_sb_page(struct bitmap *bitmap, struct page *page, int wait) } static void md_bitmap_file_kick(struct bitmap *bitmap); -/* - * write out a page to a file - */ -static void write_page(struct bitmap *bitmap, struct page *page, int wait) -{ - struct buffer_head *bh; - if (bitmap->storage.file == NULL) { - write_sb_page(bitmap, page, wait); - } else { - - bh = page_buffers(page); - - while (bh && bh->b_blocknr) { - atomic_inc(&bitmap->pending_writes); - set_buffer_locked(bh); - set_buffer_mapped(bh); - submit_bh(REQ_OP_WRITE | REQ_SYNC, bh); - bh = bh->b_this_page; - } +static void write_file_page(struct bitmap *bitmap, struct page *page, int wait) +{ + struct buffer_head *bh = page_buffers(page); - if (wait) - wait_event(bitmap->write_wait, - atomic_read(&bitmap->pending_writes)==0); + while (bh && bh->b_blocknr) { + atomic_inc(&bitmap->pending_writes); + set_buffer_locked(bh); + set_buffer_mapped(bh); + submit_bh(REQ_OP_WRITE | REQ_SYNC, bh); + bh = bh->b_this_page; } - if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) - md_bitmap_file_kick(bitmap); + + if (wait) + wait_event(bitmap->write_wait, + atomic_read(&bitmap->pending_writes) == 0); } static void end_bitmap_write(struct buffer_head *bh, int uptodate) @@ -429,6 +418,17 @@ static int read_page(struct file *file, unsigned long index, * bitmap file superblock operations */ +/* + * write out a page to a file + */ +static void write_page(struct bitmap *bitmap, struct page *page, int wait) +{ + if (bitmap->storage.file) + write_file_page(bitmap, page, wait); + else + 
write_sb_page(bitmap, page, wait); +} + /* * md_bitmap_wait_writes() should be called before writing any bitmap * blocks, to ensure previous writes, particularly from From d681054c2f67cfc45042c2de25845b06bb89c148 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Jun 2023 08:48:34 +0200 Subject: [PATCH 033/113] md-bitmap: rename read_page to read_file_page Make the difference to read_sb_page clear. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Himanshu Madhani Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230615064840.629492-6-hch@lst.de --- drivers/md/md-bitmap.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 46fbcfc9d1fca..fa0f6ca7b61b0 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -348,10 +348,8 @@ static void free_buffers(struct page *page) * This usage is similar to how swap files are handled, and allows us * to write to a file with no concerns of memory allocation failing. */ -static int read_page(struct file *file, unsigned long index, - struct bitmap *bitmap, - unsigned long count, - struct page *page) +static int read_file_page(struct file *file, unsigned long index, + struct bitmap *bitmap, unsigned long count, struct page *page) { int ret = 0; struct inode *inode = file_inode(file); @@ -632,7 +630,7 @@ static int md_bitmap_read_sb(struct bitmap *bitmap) loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host); int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize; - err = read_page(bitmap->storage.file, 0, + err = read_file_page(bitmap->storage.file, 0, bitmap, bytes, sb_page); } else { err = read_sb_page(bitmap->mddev, @@ -1141,7 +1139,7 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) count = PAGE_SIZE; page = store->filemap[index]; if (file) - ret = read_page(file, index, bitmap, + ret = read_file_page(file, index, bitmap, count, page); else ret = read_sb_page( From 844dc6691ad5f53a624f4b07bf84037abbb8fce2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Jun 2023 08:48:35 +0200 Subject: [PATCH 034/113] md-bitmap: refactor md_bitmap_init_from_disk Split the confusing loop in md_bitmap_init_from_disk that iterates over all chunks but also needs to read and map the pages into three separate loops: one that iterates over the pages to read them, a second optional one to iterate over the pages to mark them invalid if the bitmaps are out of date, and a final one that actually iterates over the chunks. 
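The resulting shape of the function can be outlined as follows; this is a sketch only, the complete version is in the diff below:

  static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
  {
          /* loop 1: read every backing page, from the file or the rdevs */
          for (i = 0; i < store->file_pages; i++)
                  ret = file ? read_file_page(...) : read_sb_page(...);

          /* loop 2, only if the bitmap is stale: dirty every page and
           * write it back so that a full recovery is triggered
           */
          if (outofdate)
                  for (i = 0; i < store->file_pages; i++)
                          /* memset() the page to 0xff and write it out */;

          /* loop 3: walk the chunks and set the in-memory bits */
          for (i = 0; i < chunks; i++)
                  if (/* bit i is set on disk */)
                          md_bitmap_set_memory_bits(...);
  }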
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202306160552.smw0qbmb-lkp@intel.com/ Signed-off-by: Christoph Hellwig Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230615064840.629492-7-hch@lst.de --- drivers/md/md-bitmap.c | 141 ++++++++++++++++++++--------------------- 1 file changed, 70 insertions(+), 71 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index fa0f6ca7b61b0..db5725beaefb6 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1065,33 +1065,31 @@ void md_bitmap_unplug_async(struct bitmap *bitmap) EXPORT_SYMBOL(md_bitmap_unplug_async); static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); -/* * bitmap_init_from_disk -- called at bitmap_create time to initialize - * the in-memory bitmap from the on-disk bitmap -- also, sets up the - * memory mapping of the bitmap file - * Special cases: - * if there's no bitmap file, or if the bitmap file had been - * previously kicked from the array, we mark all the bits as - * 1's in order to cause a full resync. + +/* + * Initialize the in-memory bitmap from the on-disk bitmap and set up the memory + * mapping of the bitmap file. + * + * Special case: If there's no bitmap file, or if the bitmap file had been + * previously kicked from the array, we mark all the bits as 1's in order to + * cause a full resync. * * We ignore all bits for sectors that end earlier than 'start'. - * This is used when reading an out-of-date bitmap... + * This is used when reading an out-of-date bitmap. */ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) { - unsigned long i, chunks, index, oldindex, bit, node_offset = 0; - struct page *page = NULL; - unsigned long bit_cnt = 0; - struct file *file; - unsigned long offset; - int outofdate; - int ret = -ENOSPC; - void *paddr; + bool outofdate = test_bit(BITMAP_STALE, &bitmap->flags); + struct mddev *mddev = bitmap->mddev; + unsigned long chunks = bitmap->counts.chunks; struct bitmap_storage *store = &bitmap->storage; + struct file *file = store->file; + unsigned long node_offset = 0; + unsigned long bit_cnt = 0; + unsigned long i; + int ret; - chunks = bitmap->counts.chunks; - file = store->file; - - if (!file && !bitmap->mddev->bitmap_info.offset) { + if (!file && !mddev->bitmap_info.offset) { /* No permanent bitmap - fill with '1s'. 
*/ store->filemap = NULL; store->file_pages = 0; @@ -1106,77 +1104,79 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) return 0; } - outofdate = test_bit(BITMAP_STALE, &bitmap->flags); - if (outofdate) - pr_warn("%s: bitmap file is out of date, doing full recovery\n", bmname(bitmap)); - if (file && i_size_read(file->f_mapping->host) < store->bytes) { pr_warn("%s: bitmap file too short %lu < %lu\n", bmname(bitmap), (unsigned long) i_size_read(file->f_mapping->host), store->bytes); + ret = -ENOSPC; goto err; } - oldindex = ~0L; - offset = 0; - if (!bitmap->mddev->bitmap_info.external) - offset = sizeof(bitmap_super_t); - - if (mddev_is_clustered(bitmap->mddev)) + if (mddev_is_clustered(mddev)) node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE)); - for (i = 0; i < chunks; i++) { - int b; - index = file_page_index(&bitmap->storage, i); - bit = file_page_offset(&bitmap->storage, i); - if (index != oldindex) { /* this is a new page, read it in */ - int count; - /* unmap the old page, we're done with it */ - if (index == store->file_pages-1) - count = store->bytes - index * PAGE_SIZE; - else - count = PAGE_SIZE; - page = store->filemap[index]; - if (file) - ret = read_file_page(file, index, bitmap, - count, page); - else - ret = read_sb_page( - bitmap->mddev, - bitmap->mddev->bitmap_info.offset, - page, - index + node_offset, count); + for (i = 0; i < store->file_pages; i++) { + struct page *page = store->filemap[i]; + int count; - if (ret) - goto err; + /* unmap the old page, we're done with it */ + if (i == store->file_pages - 1) + count = store->bytes - i * PAGE_SIZE; + else + count = PAGE_SIZE; + + if (file) + ret = read_file_page(file, i, bitmap, count, page); + else + ret = read_sb_page(mddev, mddev->bitmap_info.offset, + page, i + node_offset, count); + if (ret) + goto err; + } - oldindex = index; + if (outofdate) { + pr_warn("%s: bitmap file is out of date, doing full recovery\n", + bmname(bitmap)); - if (outofdate) { - /* - * if bitmap is out of date, dirty the - * whole page and write it out - */ - paddr = kmap_atomic(page); - memset(paddr + offset, 0xff, - PAGE_SIZE - offset); - kunmap_atomic(paddr); - write_page(bitmap, page, 1); + for (i = 0; i < store->file_pages; i++) { + struct page *page = store->filemap[i]; + unsigned long offset = 0; + void *paddr; + + if (i == 0 && !mddev->bitmap_info.external) + offset = sizeof(bitmap_super_t); + /* + * If the bitmap is out of date, dirty the whole page + * and write it out + */ + paddr = kmap_atomic(page); + memset(paddr + offset, 0xff, PAGE_SIZE - offset); + kunmap_atomic(paddr); + + write_page(bitmap, page, 1); + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) { ret = -EIO; - if (test_bit(BITMAP_WRITE_ERROR, - &bitmap->flags)) - goto err; + goto err; } } + } + + for (i = 0; i < chunks; i++) { + struct page *page = filemap_get_page(&bitmap->storage, i); + unsigned long bit = file_page_offset(&bitmap->storage, i); + void *paddr; + bool was_set; + paddr = kmap_atomic(page); if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) - b = test_bit(bit, paddr); + was_set = test_bit(bit, paddr); else - b = test_bit_le(bit, paddr); + was_set = test_bit_le(bit, paddr); kunmap_atomic(paddr); - if (b) { + + if (was_set) { /* if the disk bit is set, set the memory bit */ int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift >= start); @@ -1185,7 +1185,6 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) needed); bit_cnt++; } - offset = 0; } pr_debug("%s: bitmap initialized 
from disk: read %lu pages, set %lu of %lu bits\n", From 0c3ea5cc8fbdc3515cfb0c47f5a284882f5e4d80 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Jun 2023 08:48:36 +0200 Subject: [PATCH 035/113] md-bitmap: cleanup read_sb_page Convert read_sb_page to the normal kernel coding style, calculate the target sector only once, and add a local iosize variable to make the call to sync_page_io more readable. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Himanshu Madhani Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230615064840.629492-8-hch@lst.de --- drivers/md/md-bitmap.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index db5725beaefb6..c6dd1fa5a0bee 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -139,26 +139,25 @@ static void md_bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page */ /* IO operations when bitmap is stored near all superblocks */ + +/* choose a good rdev and read the page from there */ static int read_sb_page(struct mddev *mddev, loff_t offset, - struct page *page, - unsigned long index, int size) + struct page *page, unsigned long index, int size) { - /* choose a good rdev and read the page from there */ + sector_t sector = offset + index * (PAGE_SIZE / SECTOR_SIZE); struct md_rdev *rdev; - sector_t target; rdev_for_each(rdev, mddev) { - if (! test_bit(In_sync, &rdev->flags) - || test_bit(Faulty, &rdev->flags) - || test_bit(Bitmap_sync, &rdev->flags)) - continue; + u32 iosize = roundup(size, bdev_logical_block_size(rdev->bdev)); - target = offset + index * (PAGE_SIZE/512); + if (!test_bit(In_sync, &rdev->flags) || + test_bit(Faulty, &rdev->flags) || + test_bit(Bitmap_sync, &rdev->flags)) + continue; - if (sync_page_io(rdev, target, - roundup(size, bdev_logical_block_size(rdev->bdev)), - page, REQ_OP_READ, true)) { + if (sync_page_io(rdev, sector, iosize, page, REQ_OP_READ, + true)) { page->index = index; return 0; } From f5f2d5ac9f6e807e080311ec36bdf3d6c45b40d4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Jun 2023 08:48:37 +0200 Subject: [PATCH 036/113] md-bitmap: account for mddev->bitmap_info.offset in read_sb_page Directly apply mddev->bitmap_info.offset to the sector number to read instead of doing that in both callers.
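As a worked example of the resulting sector calculation, with illustrative numbers that are not from the patch: assuming 4 KiB pages and 512-byte sectors, PAGE_SIZE / SECTOR_SIZE is 8, so for bitmap_info.offset = 8, offset = 0 and index = 2:

  sector = 8 + 0 + 2 * 8 = 24

relative to rdev->sb_start, since sync_page_io() is invoked here as a metadata operation.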
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Himanshu Madhani Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230615064840.629492-9-hch@lst.de --- drivers/md/md-bitmap.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index c6dd1fa5a0bee..ae1c6f47b9650 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -145,7 +145,8 @@ static int read_sb_page(struct mddev *mddev, loff_t offset, struct page *page, unsigned long index, int size) { - sector_t sector = offset + index * (PAGE_SIZE / SECTOR_SIZE); + sector_t sector = mddev->bitmap_info.offset + offset + + index * (PAGE_SIZE / SECTOR_SIZE); struct md_rdev *rdev; rdev_for_each(rdev, mddev) { @@ -593,7 +594,7 @@ static int md_bitmap_read_sb(struct bitmap *bitmap) unsigned long sectors_reserved = 0; int err = -EINVAL; struct page *sb_page; - loff_t offset = bitmap->mddev->bitmap_info.offset; + loff_t offset = 0; if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) { chunksize = 128 * 1024 * 1024; @@ -620,7 +621,7 @@ static int md_bitmap_read_sb(struct bitmap *bitmap) bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t); /* to 4k blocks */ bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096); - offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3)); + offset = bitmap->cluster_slot * (bm_blocks << 3); pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__, bitmap->cluster_slot, offset); } @@ -632,10 +633,8 @@ static int md_bitmap_read_sb(struct bitmap *bitmap) err = read_file_page(bitmap->storage.file, 0, bitmap, bytes, sb_page); } else { - err = read_sb_page(bitmap->mddev, - offset, - sb_page, - 0, sizeof(bitmap_super_t)); + err = read_sb_page(bitmap->mddev, offset, sb_page, 0, + sizeof(bitmap_super_t)); } if (err) return err; @@ -1128,8 +1127,8 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) if (file) ret = read_file_page(file, i, bitmap, count, page); else - ret = read_sb_page(mddev, mddev->bitmap_info.offset, - page, i + node_offset, count); + ret = read_sb_page(mddev, 0, page, i + node_offset, + count); if (ret) goto err; } From d7038f951828da19fa9aafddfa087b69032c9687 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Jun 2023 08:48:38 +0200 Subject: [PATCH 037/113] md-bitmap: don't use ->index for pages backing the bitmap file The md driver allocates pages for storing the bitmap file data, which are not page cache pages, and then stores the page granularity file offset in page->index, which is a field that isn't really valid except for page cache pages. Use a separate index for the superblock, and use the scheme used on the read side to recalculate the index for the bitmap pages instead.
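The core of the change, condensed from the diff below: page->index belongs to the page cache, so for these driver-allocated pages the write path now receives the page's index explicitly instead of trusting a field that was never valid for them:

  /* before: only correct for page cache pages */
  ps = page->index * PAGE_SIZE / SECTOR_SIZE;

  /* after: the caller knows which bitmap page it is writing */
  static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
                             unsigned long pg_index, struct page *page);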
Signed-off-by: Christoph Hellwig Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230615064840.629492-10-hch@lst.de --- drivers/md/md-bitmap.c | 65 ++++++++++++++++++++++++------------------ drivers/md/md-bitmap.h | 1 + 2 files changed, 39 insertions(+), 27 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index ae1c6f47b9650..a280bfd29f650 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -157,11 +157,8 @@ static int read_sb_page(struct mddev *mddev, loff_t offset, test_bit(Bitmap_sync, &rdev->flags)) continue; - if (sync_page_io(rdev, sector, iosize, page, REQ_OP_READ, - true)) { - page->index = index; + if (sync_page_io(rdev, sector, iosize, page, REQ_OP_READ, true)) return 0; - } } return -EIO; } @@ -225,18 +222,19 @@ static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size, } static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, - struct page *page) + unsigned long pg_index, struct page *page) { struct block_device *bdev; struct mddev *mddev = bitmap->mddev; struct bitmap_storage *store = &bitmap->storage; loff_t sboff, offset = mddev->bitmap_info.offset; - sector_t ps, doff; + sector_t ps = pg_index * PAGE_SIZE / SECTOR_SIZE; unsigned int size = PAGE_SIZE; unsigned int opt_size = PAGE_SIZE; + sector_t doff; bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; - if (page->index == store->file_pages - 1) { + if (pg_index == store->file_pages - 1) { unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1); if (last_page_size == 0) @@ -245,7 +243,6 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, opt_size = optimal_io_size(bdev, last_page_size, size); } - ps = page->index * PAGE_SIZE / SECTOR_SIZE; sboff = rdev->sb_start + offset; doff = rdev->data_offset; @@ -279,7 +276,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, return 0; } -static void write_sb_page(struct bitmap *bitmap, struct page *page, int wait) +static void write_sb_page(struct bitmap *bitmap, unsigned long pg_index, + struct page *page, bool wait) { struct mddev *mddev = bitmap->mddev; @@ -287,7 +285,7 @@ static void write_sb_page(struct bitmap *bitmap, struct page *page, int wait) struct md_rdev *rdev = NULL; while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { - if (__write_sb_page(rdev, bitmap, page) < 0) { + if (__write_sb_page(rdev, bitmap, pg_index, page) < 0) { set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); return; } @@ -397,7 +395,6 @@ static int read_file_page(struct file *file, unsigned long index, blk_cur++; bh = bh->b_this_page; } - page->index = index; wait_event(bitmap->write_wait, atomic_read(&bitmap->pending_writes)==0); @@ -419,12 +416,21 @@ static int read_file_page(struct file *file, unsigned long index, /* * write out a page to a file */ -static void write_page(struct bitmap *bitmap, struct page *page, int wait) +static void filemap_write_page(struct bitmap *bitmap, unsigned long pg_index, + bool wait) { - if (bitmap->storage.file) + struct bitmap_storage *store = &bitmap->storage; + struct page *page = store->filemap[pg_index]; + + if (mddev_is_clustered(bitmap->mddev)) { + pg_index += bitmap->cluster_slot * + DIV_ROUND_UP(store->bytes, PAGE_SIZE); + } + + if (store->file) write_file_page(bitmap, page, wait); else - write_sb_page(bitmap, page, wait); + write_sb_page(bitmap, pg_index, page, wait); } /* @@ -481,7 +487,12 @@ void md_bitmap_update_sb(struct bitmap *bitmap) sb->sectors_reserved = cpu_to_le32(bitmap->mddev-> bitmap_info.space); 
kunmap_atomic(sb); - write_page(bitmap, bitmap->storage.sb_page, 1); + + if (bitmap->storage.file) + write_file_page(bitmap, bitmap->storage.sb_page, 1); + else + write_sb_page(bitmap, bitmap->storage.sb_index, + bitmap->storage.sb_page, 1); } EXPORT_SYMBOL(md_bitmap_update_sb); @@ -533,7 +544,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap) bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (bitmap->storage.sb_page == NULL) return -ENOMEM; - bitmap->storage.sb_page->index = 0; + bitmap->storage.sb_index = 0; sb = kmap_atomic(bitmap->storage.sb_page); @@ -810,7 +821,7 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store, if (store->sb_page) { store->filemap[0] = store->sb_page; pnum = 1; - store->sb_page->index = offset; + store->sb_index = offset; } for ( ; pnum < num_pages; pnum++) { @@ -819,7 +830,6 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store, store->file_pages = pnum; return -ENOMEM; } - store->filemap[pnum]->index = pnum + offset; } store->file_pages = pnum; @@ -924,6 +934,7 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) void *kaddr; unsigned long chunk = block >> bitmap->counts.chunkshift; struct bitmap_storage *store = &bitmap->storage; + unsigned long index = file_page_index(store, chunk); unsigned long node_offset = 0; if (mddev_is_clustered(bitmap->mddev)) @@ -941,9 +952,9 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) else set_bit_le(bit, kaddr); kunmap_atomic(kaddr); - pr_debug("set file bit %lu page %lu\n", bit, page->index); + pr_debug("set file bit %lu page %lu\n", bit, index); /* record page number so it gets flushed to disk when unplug occurs */ - set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_DIRTY); + set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_DIRTY); } static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) @@ -953,6 +964,7 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) void *paddr; unsigned long chunk = block >> bitmap->counts.chunkshift; struct bitmap_storage *store = &bitmap->storage; + unsigned long index = file_page_index(store, chunk); unsigned long node_offset = 0; if (mddev_is_clustered(bitmap->mddev)) @@ -968,8 +980,8 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) else clear_bit_le(bit, paddr); kunmap_atomic(paddr); - if (!test_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_NEEDWRITE)) { - set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_PENDING); + if (!test_page_attr(bitmap, index - node_offset, BITMAP_PAGE_NEEDWRITE)) { + set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_PENDING); bitmap->allclean = 0; } } @@ -1021,7 +1033,7 @@ void md_bitmap_unplug(struct bitmap *bitmap) "md bitmap_unplug"); } clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); - write_page(bitmap, bitmap->storage.filemap[i], 0); + filemap_write_page(bitmap, i, false); writing = 1; } } @@ -1153,7 +1165,7 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) memset(paddr + offset, 0xff, PAGE_SIZE - offset); kunmap_atomic(paddr); - write_page(bitmap, page, 1); + filemap_write_page(bitmap, i, true); if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) { ret = -EIO; goto err; @@ -1374,9 +1386,8 @@ void md_bitmap_daemon_work(struct mddev *mddev) break; if (bitmap->storage.filemap && test_and_clear_page_attr(bitmap, j, - BITMAP_PAGE_NEEDWRITE)) { - write_page(bitmap, bitmap->storage.filemap[j], 0); - } + 
BITMAP_PAGE_NEEDWRITE)) + filemap_write_page(bitmap, j, false); } done: diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 8a3788c9bfef8..bb9eb418780a6 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -201,6 +201,7 @@ struct bitmap { struct file *file; /* backing disk file */ struct page *sb_page; /* cached copy of the bitmap * file superblock */ + unsigned long sb_index; struct page **filemap; /* list of cache pages for * the file */ unsigned long *filemap_attr; /* attributes associated From a34d4ef82c3c4bd8bda817e9fb53ef37c5595ddd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Jun 2023 08:48:39 +0200 Subject: [PATCH 038/113] md: make bitmap file support optional The support for write intent bitmaps in files on an external file system in md is a hot mess that abuses ->bmap to map file offsets into physical device objects, and also abuses buffer_heads in a creative way. Make this code optional so that MD can be built into future kernels without buffer_head support, and so that we can eventually deprecate it. Note this does not affect the internal bitmap support, which has none of the problems. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Himanshu Madhani Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230615064840.629492-11-hch@lst.de --- drivers/md/Kconfig | 10 ++++++++++ drivers/md/md-bitmap.c | 15 +++++++++++++++ drivers/md/md.c | 7 +++++++ 3 files changed, 32 insertions(+) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index b0a22e99bade3..9712ab9bcba52 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -50,6 +50,16 @@ config MD_AUTODETECT If unsure, say Y. +config MD_BITMAP_FILE + bool "MD bitmap file support" + default y + help + If you say Y here, support for write intent bitmaps in files on an + external file system is enabled. This is an alternative to the internal + bitmaps near the MD superblock, and very problematic code that abuses + various kernel APIs and can only work with files on a file system not + actually sitting on the MD device.
+ config MD_LINEAR tristate "Linear (append) mode (deprecated)" depends on BLK_DEV_MD diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index a280bfd29f650..a58a4c30265e6 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -295,6 +295,7 @@ static void write_sb_page(struct bitmap *bitmap, unsigned long pg_index, static void md_bitmap_file_kick(struct bitmap *bitmap); +#ifdef CONFIG_MD_BITMAP_FILE static void write_file_page(struct bitmap *bitmap, struct page *page, int wait) { struct buffer_head *bh = page_buffers(page); @@ -408,6 +409,20 @@ static int read_file_page(struct file *file, unsigned long index, ret); return ret; } +#else /* CONFIG_MD_BITMAP_FILE */ +static void write_file_page(struct bitmap *bitmap, struct page *page, int wait) +{ +} +static int read_file_page(struct file *file, unsigned long index, + struct bitmap *bitmap, unsigned long count, struct page *page) +{ + return -EIO; +} +static void free_buffers(struct page *page) +{ + put_page(page); +} +#endif /* CONFIG_MD_BITMAP_FILE */ /* * bitmap file superblock operations diff --git a/drivers/md/md.c b/drivers/md/md.c index 320d71537359a..f46996a95b0cf 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7043,6 +7043,13 @@ static int set_bitmap_file(struct mddev *mddev, int fd) if (mddev->bitmap || mddev->bitmap_info.file) return -EEXIST; /* cannot add when bitmap is present */ + + if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) { + pr_warn("%s: bitmap files not supported by this kernel\n", + mdname(mddev)); + return -EINVAL; + } + f = fget(fd); if (f == NULL) { From 0ae1c9d38426737c39085f919b9b27d2eab3802e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Jun 2023 08:48:40 +0200 Subject: [PATCH 039/113] md: deprecate bitmap file support The support for bitmaps on files is a very bad idea abusing various kernel APIs, and fundamentally requires the file to not be on the actual array, without a way to check that this is actually the case. Add a deprecation warning to see if we might be able to eventually drop it. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Himanshu Madhani Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230615064840.629492-12-hch@lst.de --- drivers/md/Kconfig | 2 +- drivers/md/md.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 9712ab9bcba52..444517d1a2336 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -51,7 +51,7 @@ config MD_AUTODETECT If unsure, say Y. config MD_BITMAP_FILE - bool "MD bitmap file support" + bool "MD bitmap file support (deprecated)" default y help If you say Y here, support for write intent bitmaps in files on an diff --git a/drivers/md/md.c b/drivers/md/md.c index f46996a95b0cf..f8774b1ef0aab 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7049,6 +7049,8 @@ static int set_bitmap_file(struct mddev *mddev, int fd) mdname(mddev)); return -EINVAL; } + pr_warn("%s: using deprecated bitmap file support\n", + mdname(mddev)); f = fget(fd); From c567c86b90d4715081adfe5eb812141a5b6b4883 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 22 Jun 2023 00:51:03 +0800 Subject: [PATCH 040/113] md: move initialization and destruction of 'io_acct_set' to md.c 'io_acct_set' is only used for raid0 and raid456; prepare to use it for raid1 and raid10, so that io accounting from different levels can be consistent.
By the way, follow-up patches will also use this io clone mechanism to make sure 'active_io' represents inflight io, not io that is dispatching, so that mddev_suspend will wait for io to be done as designed. Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230621165110.1498313-2-yukuai1@huaweicloud.com --- drivers/md/md.c | 27 ++++++++++----------------- drivers/md/md.h | 2 -- drivers/md/raid0.c | 16 ++-------------- drivers/md/raid5.c | 41 +++++++++++------------------------------ 4 files changed, 23 insertions(+), 63 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index f8774b1ef0aab..1a0844250b9b0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5876,6 +5876,13 @@ int md_run(struct mddev *mddev) goto exit_bio_set; } + if (!bioset_initialized(&mddev->io_acct_set)) { + err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE, + offsetof(struct md_io_acct, bio_clone), 0); + if (err) + goto exit_sync_set; + } + spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); if (!pers || !try_module_get(pers->owner)) { @@ -6053,6 +6060,8 @@ int md_run(struct mddev *mddev) module_put(pers->owner); md_bitmap_destroy(mddev); abort: + bioset_exit(&mddev->io_acct_set); +exit_sync_set: bioset_exit(&mddev->sync_set); exit_bio_set: bioset_exit(&mddev->bio_set); @@ -6276,6 +6285,7 @@ static void __md_stop(struct mddev *mddev) percpu_ref_exit(&mddev->active_io); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); + bioset_exit(&mddev->io_acct_set); } void md_stop(struct mddev *mddev) @@ -8641,23 +8651,6 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, } EXPORT_SYMBOL_GPL(md_submit_discard_bio); -int acct_bioset_init(struct mddev *mddev) -{ - int err = 0; - - if (!bioset_initialized(&mddev->io_acct_set)) - err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE, - offsetof(struct md_io_acct, bio_clone), 0); - return err; -} -EXPORT_SYMBOL_GPL(acct_bioset_init); - -void acct_bioset_exit(struct mddev *mddev) -{ - bioset_exit(&mddev->io_acct_set); -} -EXPORT_SYMBOL_GPL(acct_bioset_exit); - static void md_end_io_acct(struct bio *bio) { struct md_io_acct *md_io_acct = bio->bi_private; diff --git a/drivers/md/md.h b/drivers/md/md.h index 914e6ece9af29..4d771e5d3c717 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -774,8 +774,6 @@ extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_finish_reshape(struct mddev *mddev); void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, struct bio *bio, sector_t start, sector_t size); -int acct_bioset_init(struct mddev *mddev); -void acct_bioset_exit(struct mddev *mddev); void md_account_bio(struct mddev *mddev, struct bio **bio); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index d1ac73fcd8529..4106d943aae75 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -377,7 +377,6 @@ static void raid0_free(struct mddev *mddev, void *priv) struct r0conf *conf = priv; free_conf(mddev, conf); - acct_bioset_exit(mddev); } static int raid0_run(struct mddev *mddev) @@ -392,16 +391,11 @@ static int raid0_run(struct mddev *mddev) if (md_check_no_bitmap(mddev)) return -EINVAL; - if (acct_bioset_init(mddev)) { - pr_err("md/raid0:%s: alloc acct bioset failed.\n", mdname(mddev)); - return -ENOMEM; - } - /* if private is not null, we are here after takeover */ if (mddev->private == NULL) { ret = create_strip_zones(mddev, &conf); if (ret < 0) - goto
exit_acct_set; + return ret; mddev->private = conf; } conf = mddev->private; @@ -432,15 +426,9 @@ static int raid0_run(struct mddev *mddev) ret = md_integrity_register(mddev); if (ret) - goto free; + free_conf(mddev, conf); return ret; - -free: - free_conf(mddev, conf); -exit_acct_set: - acct_bioset_exit(mddev); - return ret; } /* diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 85b3004594e03..db3cec2282371 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7787,19 +7787,12 @@ static int raid5_run(struct mddev *mddev) struct md_rdev *rdev; struct md_rdev *journal_dev = NULL; sector_t reshape_offset = 0; - int i, ret = 0; + int i; long long min_offset_diff = 0; int first = 1; - if (acct_bioset_init(mddev)) { - pr_err("md/raid456:%s: alloc acct bioset failed.\n", mdname(mddev)); + if (mddev_init_writes_pending(mddev) < 0) return -ENOMEM; - } - - if (mddev_init_writes_pending(mddev) < 0) { - ret = -ENOMEM; - goto exit_acct_set; - } if (mddev->recovery_cp != MaxSector) pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", @@ -7830,8 +7823,7 @@ static int raid5_run(struct mddev *mddev) (mddev->bitmap_info.offset || mddev->bitmap_info.file)) { pr_notice("md/raid:%s: array cannot have both journal and bitmap\n", mdname(mddev)); - ret = -EINVAL; - goto exit_acct_set; + return -EINVAL; } if (mddev->reshape_position != MaxSector) { @@ -7856,15 +7848,13 @@ static int raid5_run(struct mddev *mddev) if (journal_dev) { pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n", mdname(mddev)); - ret = -EINVAL; - goto exit_acct_set; + return -EINVAL; } if (mddev->new_level != mddev->level) { pr_warn("md/raid:%s: unsupported reshape required - aborting.\n", mdname(mddev)); - ret = -EINVAL; - goto exit_acct_set; + return -EINVAL; } old_disks = mddev->raid_disks - mddev->delta_disks; /* reshape_position must be on a new-stripe boundary, and one @@ -7880,8 +7870,7 @@ static int raid5_run(struct mddev *mddev) if (sector_div(here_new, chunk_sectors * new_data_disks)) { pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n", mdname(mddev)); - ret = -EINVAL; - goto exit_acct_set; + return -EINVAL; } reshape_offset = here_new * chunk_sectors; /* here_new is the stripe we will write to */ @@ -7903,8 +7892,7 @@ static int raid5_run(struct mddev *mddev) else if (mddev->ro == 0) { pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n", mdname(mddev)); - ret = -EINVAL; - goto exit_acct_set; + return -EINVAL; } } else if (mddev->reshape_backwards ? 
(here_new * chunk_sectors + min_offset_diff <= @@ -7914,8 +7902,7 @@ static int raid5_run(struct mddev *mddev) /* Reading from the same stripe as writing to - bad */ pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n", mdname(mddev)); - ret = -EINVAL; - goto exit_acct_set; + return -EINVAL; } pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev)); /* OK, we should be able to continue; */ @@ -7939,10 +7926,8 @@ static int raid5_run(struct mddev *mddev) else conf = mddev->private; - if (IS_ERR(conf)) { - ret = PTR_ERR(conf); - goto exit_acct_set; - } + if (IS_ERR(conf)) + return PTR_ERR(conf); if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { if (!journal_dev) { @@ -8140,10 +8125,7 @@ static int raid5_run(struct mddev *mddev) free_conf(conf); mddev->private = NULL; pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); - ret = -EIO; -exit_acct_set: - acct_bioset_exit(mddev); - return ret; + return -EIO; } static void raid5_free(struct mddev *mddev, void *priv) @@ -8151,7 +8133,6 @@ static void raid5_free(struct mddev *mddev, void *priv) struct r5conf *conf = priv; free_conf(conf); - acct_bioset_exit(mddev); mddev->to_remove = &raid5_attrs_group; } From c687297b884507a4737b747957eda567063901df Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 22 Jun 2023 00:51:04 +0800 Subject: [PATCH 041/113] md: also clone new io if io accounting is disabled Currently, 'active_io' is grabbed before make_request() is called, and it's dropped immediately after make_request() returns. Hence 'active_io' actually means io is dispatching, not that io is inflight. For raid0 and raid456, where io accounting is enabled, 'active_io' will also be grabbed when the bio is cloned for io accounting, and this 'active_io' is not dropped until the io is done. Always clone a new bio so that 'active_io' will mean that io is inflight; raid1 and raid10 will switch to this method in later patches. Now that the bio will be cloned even if io accounting is disabled, also rename the related structures from '*_acct_*' to '*_clone_*'.
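Sketching the intended 'active_io' lifetime after this change, condensed from the code below:

  /* submission: md_account_bio() now always takes a reference and
   * clones the bio, whether or not io stats are enabled
   */
  percpu_ref_get(&mddev->active_io);
  md_clone_bio(mddev, &bio);   /* start_time recorded only if stats on */

  /* completion: the clone's end_io drops the reference, i.e. only
   * once the io is actually done
   */
  static void md_end_clone_io(struct bio *bio)
  {
          ...
          bio_endio(orig_bio);
          percpu_ref_put(&mddev->active_io);
  }

This is what allows mddev_suspend(), which waits for 'active_io' to drain, to reliably wait for inflight io rather than merely for submission to finish.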
Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230621165110.1498313-3-yukuai1@huaweicloud.com --- drivers/md/md.c | 63 +++++++++++++++++++++++----------------------- drivers/md/md.h | 4 +-- drivers/md/raid5.c | 18 ++++++------- 3 files changed, 42 insertions(+), 43 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 1a0844250b9b0..abb6167203938 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2306,7 +2306,7 @@ int md_integrity_register(struct mddev *mddev) pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || (mddev->level != 1 && mddev->level != 10 && - bioset_integrity_create(&mddev->io_acct_set, BIO_POOL_SIZE))) { + bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) { /* * No need to handle the failure of bioset_integrity_create, * because the function is called by md_run() -> pers->run(), @@ -5876,9 +5876,9 @@ int md_run(struct mddev *mddev) goto exit_bio_set; } - if (!bioset_initialized(&mddev->io_acct_set)) { - err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE, - offsetof(struct md_io_acct, bio_clone), 0); + if (!bioset_initialized(&mddev->io_clone_set)) { + err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, + offsetof(struct md_io_clone, bio_clone), 0); if (err) goto exit_sync_set; } @@ -6060,7 +6060,7 @@ int md_run(struct mddev *mddev) module_put(pers->owner); md_bitmap_destroy(mddev); abort: - bioset_exit(&mddev->io_acct_set); + bioset_exit(&mddev->io_clone_set); exit_sync_set: bioset_exit(&mddev->sync_set); exit_bio_set: @@ -6285,7 +6285,7 @@ static void __md_stop(struct mddev *mddev) percpu_ref_exit(&mddev->active_io); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); - bioset_exit(&mddev->io_acct_set); + bioset_exit(&mddev->io_clone_set); } void md_stop(struct mddev *mddev) @@ -8651,45 +8651,44 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, } EXPORT_SYMBOL_GPL(md_submit_discard_bio); -static void md_end_io_acct(struct bio *bio) +static void md_end_clone_io(struct bio *bio) { - struct md_io_acct *md_io_acct = bio->bi_private; - struct bio *orig_bio = md_io_acct->orig_bio; - struct mddev *mddev = md_io_acct->mddev; + struct md_io_clone *md_io_clone = bio->bi_private; + struct bio *orig_bio = md_io_clone->orig_bio; + struct mddev *mddev = md_io_clone->mddev; orig_bio->bi_status = bio->bi_status; - bio_end_io_acct(orig_bio, md_io_acct->start_time); + if (md_io_clone->start_time) + bio_end_io_acct(orig_bio, md_io_clone->start_time); + bio_put(bio); bio_endio(orig_bio); - percpu_ref_put(&mddev->active_io); } -/* - * Used by personalities that don't already clone the bio and thus can't - * easily add the timestamp to their extended bio structure. 
- */ -void md_account_bio(struct mddev *mddev, struct bio **bio) +static void md_clone_bio(struct mddev *mddev, struct bio **bio) { struct block_device *bdev = (*bio)->bi_bdev; - struct md_io_acct *md_io_acct; - struct bio *clone; - - if (!blk_queue_io_stat(bdev->bd_disk->queue)) - return; + struct md_io_clone *md_io_clone; + struct bio *clone = + bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set); + + md_io_clone = container_of(clone, struct md_io_clone, bio_clone); + md_io_clone->orig_bio = *bio; + md_io_clone->mddev = mddev; + if (blk_queue_io_stat(bdev->bd_disk->queue)) + md_io_clone->start_time = bio_start_io_acct(*bio); + + clone->bi_end_io = md_end_clone_io; + clone->bi_private = md_io_clone; + *bio = clone; +} +void md_account_bio(struct mddev *mddev, struct bio **bio) +{ percpu_ref_get(&mddev->active_io); - - clone = bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_acct_set); - md_io_acct = container_of(clone, struct md_io_acct, bio_clone); - md_io_acct->orig_bio = *bio; - md_io_acct->start_time = bio_start_io_acct(*bio); - md_io_acct->mddev = mddev; - - clone->bi_end_io = md_end_io_acct; - clone->bi_private = md_io_acct; - *bio = clone; + md_clone_bio(mddev, bio); } EXPORT_SYMBOL_GPL(md_account_bio); diff --git a/drivers/md/md.h b/drivers/md/md.h index 4d771e5d3c717..8ae9574809763 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -510,7 +510,7 @@ struct mddev { struct bio_set sync_set; /* for sync operations like * metadata and bitmap writes */ - struct bio_set io_acct_set; /* for raid0 and raid5 io accounting */ + struct bio_set io_clone_set; /* Generic flush handling. * The last to finish preflush schedules a worker to submit @@ -736,7 +736,7 @@ struct md_thread { void *private; }; -struct md_io_acct { +struct md_io_clone { struct mddev *mddev; struct bio *orig_bio; unsigned long start_time; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index db3cec2282371..1da9dd3e2f18f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5468,13 +5468,13 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf, */ static void raid5_align_endio(struct bio *bi) { - struct md_io_acct *md_io_acct = bi->bi_private; - struct bio *raid_bi = md_io_acct->orig_bio; + struct md_io_clone *md_io_clone = bi->bi_private; + struct bio *raid_bi = md_io_clone->orig_bio; struct mddev *mddev; struct r5conf *conf; struct md_rdev *rdev; blk_status_t error = bi->bi_status; - unsigned long start_time = md_io_acct->start_time; + unsigned long start_time = md_io_clone->start_time; bio_put(bi); @@ -5506,7 +5506,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) struct md_rdev *rdev; sector_t sector, end_sector, first_bad; int bad_sectors, dd_idx; - struct md_io_acct *md_io_acct; + struct md_io_clone *md_io_clone; bool did_inc; if (!in_chunk_boundary(mddev, raid_bio)) { @@ -5544,15 +5544,15 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) } align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO, - &mddev->io_acct_set); - md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone); + &mddev->io_clone_set); + md_io_clone = container_of(align_bio, struct md_io_clone, bio_clone); raid_bio->bi_next = (void *)rdev; if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue)) - md_io_acct->start_time = bio_start_io_acct(raid_bio); - md_io_acct->orig_bio = raid_bio; + md_io_clone->start_time = bio_start_io_acct(raid_bio); + md_io_clone->orig_bio = raid_bio; align_bio->bi_end_io = raid5_align_endio; - align_bio->bi_private = 
md_io_acct; + align_bio->bi_private = md_io_clone; align_bio->bi_iter.bi_sector = sector; /* No reshape active, so we can trust rdev->data_offset */ From 05048cbccab79e9fb9030274170ccd710fe69474 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 22 Jun 2023 00:51:05 +0800 Subject: [PATCH 042/113] raid5: fix missing io accounting in raid5_align_endio() Io is only accounted as done from raid5_align_endio() if the io succeeded, so the io inflight counter is leaked if such an io fails. Fix this problem by switching to use md_account_bio() for io accounting. Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230621165110.1498313-4-yukuai1@huaweicloud.com --- drivers/md/raid5.c | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1da9dd3e2f18f..32a87193bad73 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5468,26 +5468,17 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf, */ static void raid5_align_endio(struct bio *bi) { - struct md_io_clone *md_io_clone = bi->bi_private; - struct bio *raid_bi = md_io_clone->orig_bio; - struct mddev *mddev; - struct r5conf *conf; - struct md_rdev *rdev; + struct bio *raid_bi = bi->bi_private; + struct md_rdev *rdev = (void *)raid_bi->bi_next; + struct mddev *mddev = rdev->mddev; + struct r5conf *conf = mddev->private; blk_status_t error = bi->bi_status; - unsigned long start_time = md_io_clone->start_time; bio_put(bi); - - rdev = (void*)raid_bi->bi_next; raid_bi->bi_next = NULL; - mddev = rdev->mddev; - conf = mddev->private; - rdev_dec_pending(rdev, conf->mddev); if (!error) { - if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue)) - bio_end_io_acct(raid_bi, start_time); bio_endio(raid_bi); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_quiescent); @@ -5506,7 +5497,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) struct md_rdev *rdev; sector_t sector, end_sector, first_bad; int bad_sectors, dd_idx; - struct md_io_clone *md_io_clone; bool did_inc; if (!in_chunk_boundary(mddev, raid_bio)) { @@ -5543,16 +5533,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) return 0; } - align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO, - &mddev->io_clone_set); - md_io_clone = container_of(align_bio, struct md_io_clone, bio_clone); + md_account_bio(mddev, &raid_bio); raid_bio->bi_next = (void *)rdev; - if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue)) - md_io_clone->start_time = bio_start_io_acct(raid_bio); - md_io_clone->orig_bio = raid_bio; + align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO, + &mddev->bio_set); align_bio->bi_end_io = raid5_align_endio; - align_bio->bi_private = raid_bio; align_bio->bi_iter.bi_sector = sector; /* No reshape active, so we can trust rdev->data_offset */ From bb2a9acefaf9ce5bbc1e70f407e34599233d0243 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 22 Jun 2023 00:51:06 +0800 Subject: [PATCH 043/113] md/raid1: switch to use md_account_bio() for io accounting Two problems can be fixed this way: 1) 'active_io' will represent inflight io instead of io that is dispatching. 2) If io accounting is enabled or disabled while io is still inflight, bio_start_io_acct() and bio_end_io_acct() are not balanced and the io inflight counter will be leaked.
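The clone-based pattern that makes this safe is worth spelling out. A minimal sketch of the completion side, mirroring md_end_clone_io() from the previous patch: whether to account is decided once at submission time and latched in start_time, so success and failure both funnel through one callback that stays balanced and always drops the inflight reference.

	static void clone_end_io(struct bio *clone)
	{
		struct md_io_clone *mc = clone->bi_private;
		struct bio *orig = mc->orig_bio;
		struct mddev *mddev = mc->mddev;

		orig->bi_status = clone->bi_status;
		if (mc->start_time)		/* latched at submission */
			bio_end_io_acct(orig, mc->start_time);
		bio_put(clone);
		bio_endio(orig);
		percpu_ref_put(&mddev->active_io);	/* inflight reference */
	}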
Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230621165110.1498313-5-yukuai1@huaweicloud.com --- drivers/md/raid1.c | 14 ++++++-------- drivers/md/raid1.h | 1 - 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index dd25832eb0452..06fa1580501f0 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -304,8 +304,6 @@ static void call_bio_endio(struct r1bio *r1_bio) if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) bio->bi_status = BLK_STS_IOERR; - if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) - bio_end_io_acct(bio, r1_bio->start_time); bio_endio(bio); } @@ -1303,10 +1301,10 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, } r1_bio->read_disk = rdisk; - - if (!r1bio_existed && blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) - r1_bio->start_time = bio_start_io_acct(bio); - + if (!r1bio_existed) { + md_account_bio(mddev, &bio); + r1_bio->master_bio = bio; + } read_bio = bio_alloc_clone(mirror->rdev->bdev, bio, gfp, &mddev->bio_set); @@ -1500,8 +1498,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, r1_bio->sectors = max_sectors; } - if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) - r1_bio->start_time = bio_start_io_acct(bio); + md_account_bio(mddev, &bio); + r1_bio->master_bio = bio; atomic_set(&r1_bio->remaining, 1); atomic_set(&r1_bio->behind_remaining, 0); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 468f189da7a05..14d4211a123a8 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -157,7 +157,6 @@ struct r1bio { sector_t sector; int sectors; unsigned long state; - unsigned long start_time; struct mddev *mddev; /* * original bio going to /dev/mdx From 820455238366a78a44a85cc7d58a987e728464d9 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 22 Jun 2023 00:51:07 +0800 Subject: [PATCH 044/113] md/raid10: switch to use md_account_bio() for io accounting Make sure that 'active_io' will represent inflight io instead of io that is dispatching, and io accounting from all levels will be consistent. 
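One subtlety in the raid10 conversion below: md_account_bio() swaps the caller's bio pointer for the accounting clone, so the call site must refresh master_bio afterwards, and raid10_read_request() grows an io_accounting flag so the re-submission from handle_read_error() cannot account the same bio twice. A sketch of the call-site pattern, with names taken from the diff:

	if (io_accounting) {
		md_account_bio(mddev, &bio);	/* may replace 'bio' with the clone */
		r10_bio->master_bio = bio;	/* must now point at the clone */
	}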
Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230621165110.1498313-6-yukuai1@huaweicloud.com --- drivers/md/raid10.c | 20 +++++++++----------- drivers/md/raid10.h | 1 - 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5051149e27bbe..d42e9b7d26083 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -325,8 +325,6 @@ static void raid_end_bio_io(struct r10bio *r10_bio) if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) bio->bi_status = BLK_STS_IOERR; - if (r10_bio->start_time) - bio_end_io_acct(bio, r10_bio->start_time); bio_endio(bio); /* * Wake up any possible resync thread that waits for the device @@ -1172,7 +1170,7 @@ static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf, } static void raid10_read_request(struct mddev *mddev, struct bio *bio, - struct r10bio *r10_bio) + struct r10bio *r10_bio, bool io_accounting) { struct r10conf *conf = mddev->private; struct bio *read_bio; @@ -1243,9 +1241,10 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, } slot = r10_bio->read_slot; - if (!r10_bio->start_time && - blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) - r10_bio->start_time = bio_start_io_acct(bio); + if (io_accounting) { + md_account_bio(mddev, &bio); + r10_bio->master_bio = bio; + } read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set); r10_bio->devs[slot].bio = read_bio; @@ -1543,8 +1542,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, r10_bio->master_bio = bio; } - if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) - r10_bio->start_time = bio_start_io_acct(bio); + md_account_bio(mddev, &bio); + r10_bio->master_bio = bio; atomic_set(&r10_bio->remaining, 1); md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); @@ -1571,12 +1570,11 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) r10_bio->sector = bio->bi_iter.bi_sector; r10_bio->state = 0; r10_bio->read_slot = -1; - r10_bio->start_time = 0; memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->geo.raid_disks); if (bio_data_dir(bio) == READ) - raid10_read_request(mddev, bio, r10_bio); + raid10_read_request(mddev, bio, r10_bio, true); else raid10_write_request(mddev, bio, r10_bio); } @@ -2985,7 +2983,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) rdev_dec_pending(rdev, mddev); r10_bio->state = 0; - raid10_read_request(mddev, r10_bio->master_bio, r10_bio); + raid10_read_request(mddev, r10_bio->master_bio, r10_bio, false); /* * allow_barrier after re-submit to ensure no sync io * can be issued while regular io pending. diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 63e48b11b5529..2e75e88d08023 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -123,7 +123,6 @@ struct r10bio { sector_t sector; /* virtual sector number */ int sectors; unsigned long state; - unsigned long start_time; struct mddev *mddev; /* * original bio going to /dev/mdx From bdf2b52136dd19a55aaf5484cb254498c61383a5 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 22 Jun 2023 00:51:08 +0800 Subject: [PATCH 045/113] md/md-multipath: enable io accounting use md_account_bio() to enable io accounting, also make sure mddev_suspend() will wait for all io to be done. 
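A simplified sketch of why this single call covers both points (condensed from drivers/md/md.c, not the verbatim code): each accounted bio pins 'active_io', and the suspend path kills the reference and waits for it to drain.

	percpu_ref_get(&mddev->active_io);	/* md_account_bio(), per bio */
	/* ... io completes, the clone's end_io drops the reference ... */
	percpu_ref_kill(&mddev->active_io);	/* mddev_suspend(): no new io */
	wait_event(mddev->sb_wait,
		   percpu_ref_is_zero(&mddev->active_io));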
Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230621165110.1498313-7-yukuai1@huaweicloud.com --- drivers/md/md-multipath.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 92c45be203d7e..d22276870283d 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -107,6 +107,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio) && md_flush_request(mddev, bio)) return true; + md_account_bio(mddev, &bio); mp_bh = mempool_alloc(&conf->pool, GFP_NOIO); mp_bh->master_bio = bio; From 09f43cb530b03e4d58b35a39e54de658fc8d09b7 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 22 Jun 2023 00:51:09 +0800 Subject: [PATCH 046/113] md/md-linear: enable io accounting use md_account_bio() to enable io accounting, also make sure mddev_suspend() will wait for all io to be done. Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230621165110.1498313-8-yukuai1@huaweicloud.com --- drivers/md/md-linear.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 4eb72b9dd9336..71ac99646827b 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -238,6 +238,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) bio = split; } + md_account_bio(mddev, &bio); bio_set_dev(bio, tmp_dev->rdev->bdev); bio->bi_iter.bi_sector = bio->bi_iter.bi_sector - start_sector + data_offset; From dd9a68601409d905810a936a7c4e1241b604013f Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 22 Jun 2023 00:51:10 +0800 Subject: [PATCH 047/113] md/md-faulty: enable io accounting use md_account_bio() to enable io accounting, also make sure mddev_suspend() will wait for all io to be done. Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230621165110.1498313-9-yukuai1@huaweicloud.com --- drivers/md/md-faulty.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c index 50ad818978a43..a039e8e20f55a 100644 --- a/drivers/md/md-faulty.c +++ b/drivers/md/md-faulty.c @@ -204,6 +204,8 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio) failit = 1; } } + + md_account_bio(mddev, &bio); if (failit) { struct bio *b = bio_alloc_clone(conf->rdev->bdev, bio, GFP_NOIO, &mddev->bio_set); From ffb1e7a03f966065323b18c96da23a2118a19529 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Tue, 27 Jun 2023 09:43:32 +0800 Subject: [PATCH 048/113] md/raid1: prioritize adding disk to 'removed' mirror A new disk should be added to a "removed" position first instead of becoming a replacement. Commit 6090368abcb4 ("md/raid10: prioritize adding disk to 'removed' mirror") has fixed this issue for raid10. Fix it for raid1 now.
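A sketch of the selection order the raid1 fix establishes (the helpers here are illustrative stand-ins, not raid1 functions): take a fully removed slot as soon as one is seen, and only remember the first replacement-capable slot as a fallback.

	int repl_slot = -1;

	for (mirror = first; mirror <= last; mirror++) {
		if (!mirrors[mirror].rdev)
			return add_as_new_disk(mirror);	/* preferred */
		if (wants_replacement(&mirrors[mirror]) && repl_slot < 0)
			repl_slot = mirror;		/* remember only */
	}
	if (repl_slot >= 0)
		return add_as_replacement(repl_slot);	/* fallback */
	return -EEXIST;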
Signed-off-by: Li Nan Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20230627014332.3810102-1-linan666@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid1.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 06fa1580501f0..f834d99a36f6d 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1764,7 +1764,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) { struct r1conf *conf = mddev->private; int err = -EEXIST; - int mirror = 0; + int mirror = 0, repl_slot = -1; struct raid1_info *p; int first = 0; int last = conf->raid_disks - 1; @@ -1807,17 +1807,21 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) break; } if (test_bit(WantReplacement, &p->rdev->flags) && - p[conf->raid_disks].rdev == NULL) { - /* Add this device as a replacement */ - clear_bit(In_sync, &rdev->flags); - set_bit(Replacement, &rdev->flags); - rdev->raid_disk = mirror; - err = 0; - conf->fullsync = 1; - rcu_assign_pointer(p[conf->raid_disks].rdev, rdev); - break; - } + p[conf->raid_disks].rdev == NULL && repl_slot < 0) + repl_slot = mirror; } + + if (err && repl_slot >= 0) { + /* Add this device as a replacement */ + p = conf->mirrors + repl_slot; + clear_bit(In_sync, &rdev->flags); + set_bit(Replacement, &rdev->flags); + rdev->raid_disk = repl_slot; + err = 0; + conf->fullsync = 1; + rcu_assign_pointer(p[conf->raid_disks].rdev, rdev); + } + print_conf(conf); return err; } From 605eeda6e70f692311b36180f217208d367476f6 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Sat, 24 Jun 2023 01:32:34 +0800 Subject: [PATCH 049/113] md/raid10: optimize fix_read_error We dereference r10_bio->read_slot too many times in fix_read_error(). Optimize it by using a variable to store read_slot. Signed-off-by: Li Nan Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20230623173236.2513554-2-linan666@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid10.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index d42e9b7d26083..abea91a54db1d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2718,10 +2718,10 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio) { int sect = 0; /* Offset from r10_bio->sector */ - int sectors = r10_bio->sectors; + int sectors = r10_bio->sectors, slot = r10_bio->read_slot; struct md_rdev *rdev; int max_read_errors = atomic_read(&mddev->max_corr_read_errors); - int d = r10_bio->devs[r10_bio->read_slot].devnum; + int d = r10_bio->devs[slot].devnum; /* still own a reference to this rdev, so it cannot * have been cleared recently. 
@@ -2742,13 +2742,13 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 pr_notice("md/raid10:%s: %pg: Failing raid device\n", mdname(mddev), rdev->bdev); md_error(mddev, rdev); - r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; + r10_bio->devs[slot].bio = IO_BLOCKED; return; } while(sectors) { int s = sectors; - int sl = r10_bio->read_slot; + int sl = slot; int success = 0; int start; @@ -2783,7 +2783,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 sl++; if (sl == conf->copies) sl = 0; - } while (!success && sl != r10_bio->read_slot); + } while (!success && sl != slot); rcu_read_unlock(); if (!success) { @@ -2791,16 +2791,16 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 * as bad on the first device to discourage future * reads. */ - int dn = r10_bio->devs[r10_bio->read_slot].devnum; + int dn = r10_bio->devs[slot].devnum; rdev = conf->mirrors[dn].rdev; if (!rdev_set_badblocks( rdev, - r10_bio->devs[r10_bio->read_slot].addr + r10_bio->devs[slot].addr + sect, s, 0)) { md_error(mddev, rdev); - r10_bio->devs[r10_bio->read_slot].bio + r10_bio->devs[slot].bio = IO_BLOCKED; } break; @@ -2809,7 +2809,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 start = sl; /* write it back and re-read */ rcu_read_lock(); - while (sl != r10_bio->read_slot) { + while (sl != slot) { if (sl==0) sl = conf->copies; sl--; @@ -2843,7 +2843,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 rcu_read_lock(); } sl = start; - while (sl != r10_bio->read_slot) { + while (sl != slot) { if (sl==0) sl = conf->copies; sl--; From 02c67a3b72b13951c2ca134bd7065f03ec57946d Mon Sep 17 00:00:00 2001 From: Li Nan Date: Sat, 24 Jun 2023 01:32:35 +0800 Subject: [PATCH 050/113] md: remove redundant check in fix_read_error() In fix_read_error(), 'success' is checked immediately after it is assigned; if it is set to 1, the loop breaks. Checking it again in the loop condition is redundant. Clean it up.
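In general form (try_read() stands in for the sync_page_io() retry logic and is not a real md function), the cleanup turns this:

	do {
		if (try_read(d))
			success = 1;
		if (success)
			break;
		d++;
		if (d == limit)
			d = 0;
	} while (!success && d != read_disk);

into this, since the break already ends the loop on success and the condition only needs to detect a full wrap-around:

	do {
		if (try_read(d)) {
			success = 1;
			break;
		}
		d++;
		if (d == limit)
			d = 0;
	} while (d != read_disk);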
Signed-off-by: Li Nan Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20230623173236.2513554-3-linan666@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f834d99a36f6d..a68c9cccbf0d2 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2301,7 +2301,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, d++; if (d == conf->raid_disks * 2) d = 0; - } while (!success && d != read_disk); + } while (d != read_disk); if (!success) { /* Cannot read from anywhere - mark it bad */ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index abea91a54db1d..757687fb90a7a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2783,7 +2783,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 sl++; if (sl == conf->copies) sl = 0; - } while (!success && sl != slot); + } while (sl != slot); rcu_read_unlock(); if (!success) { From b39f35ebe86d88788d85f61e83c81c308cb76727 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 28 Jun 2023 09:29:30 +0800 Subject: [PATCH 051/113] md: don't quiesce in mddev_suspend() Some levels don't implement "pers->quiesce"; for example, raid0_quiesce() is empty. Now that all levels hold 'active_io' until io is done, waiting for 'active_io' to reach 0 is enough to make sure all normal io is done, and percpu_ref_kill() for 'active_io' will make sure no new normal io can be dispatched. There is no need to call "pers->quiesce" from mddev_suspend() anymore. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20230628012931.88911-2-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index abb6167203938..962dacfd98cf8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -453,7 +453,6 @@ void mddev_suspend(struct mddev *mddev) mddev->pers->prepare_suspend(mddev); wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); - mddev->pers->quiesce(mddev, 1); clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags)); @@ -472,7 +471,6 @@ void mddev_resume(struct mddev *mddev) return; percpu_ref_resurrect(&mddev->active_io); wake_up(&mddev->sb_wait); - mddev->pers->quiesce(mddev, 0); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); From e24ed04389f9619e0aaef615a8948633c182a8b0 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 28 Jun 2023 09:29:31 +0800 Subject: [PATCH 052/113] md: restore 'noio_flag' for the last mddev_resume() memalloc_noio_save() is called for the first mddev_suspend(), and repeated mddev_suspend() calls only increase 'suspended'. However, memalloc_noio_restore() is also called for the first mddev_resume(), which means that memory reclaim will be enabled before the last mddev_resume() is called, while the array is still suspended. Fix this problem by restoring 'noio_flag' for the last mddev_resume().
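The intended pairing, as a simplified sketch (the real functions also quiesce io and handle superblock updates, and order these steps differently): the NOIO scope opened by the first, outermost suspend must stay in effect until the matching last resume.

	void sketch_suspend(struct mddev *mddev)
	{
		if (mddev->suspended++)
			return;				/* nested: no-op */
		/* ... wait for active_io to drain ... */
		mddev->noio_flag = memalloc_noio_save();
	}

	void sketch_resume(struct mddev *mddev)
	{
		if (--mddev->suspended)
			return;				/* still nested */
		memalloc_noio_restore(mddev->noio_flag);
		/* ... resurrect active_io, wake waiters ... */
	}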
Fixes: 78f57ef9d50a ("md: use memalloc scope APIs in mddev_suspend()/mddev_resume()") Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20230628012931.88911-3-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 962dacfd98cf8..a3d98273b295c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -464,11 +464,13 @@ EXPORT_SYMBOL_GPL(mddev_suspend); void mddev_resume(struct mddev *mddev) { - /* entred the memalloc scope from mddev_suspend() */ - memalloc_noio_restore(mddev->noio_flag); lockdep_assert_held(&mddev->reconfig_mutex); if (--mddev->suspended) return; + + /* entred the memalloc scope from mddev_suspend() */ + memalloc_noio_restore(mddev->noio_flag); + percpu_ref_resurrect(&mddev->active_io); wake_up(&mddev->sb_wait); From 21bd9a68fef47c4f0e951be9a6fac9745cee1bab Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Wed, 5 Jul 2023 13:32:27 +0200 Subject: [PATCH 053/113] md/raid1: Avoid lock contention from wake_up() wake_up() is called unconditionally in a few paths such as make_request(), which causes lock contention under high-concurrency workloads, as in the call chain below: raid1_end_write_request wake_up __wake_up_common_lock spin_lock_irqsave Improve performance by only calling wake_up() if the waitqueue is not empty. Fio test script: [global] name=random reads and writes ioengine=libaio direct=1 readwrite=randrw rwmixread=70 iodepth=64 buffered=0 filename=/dev/md0 size=1G runtime=30 time_based randrepeat=0 norandommap refill_buffers ramp_time=10 bs=4k numjobs=400 group_reporting=1 [job1] Test results with 2 ramdisks in raid1 on an Intel Broadwell 56-core server. Before this patch With this patch READ BW=4621MB/s BW=7337MB/s WRITE BW=1980MB/s BW=3144MB/s The patch is inspired by Yu Kuai's change for raid10: https://lore.kernel.org/r/20230621105728.1268542-1-yukuai1@huaweicloud.com Cc: Yu Kuai Signed-off-by: Jack Wang Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20230705113227.148494-1-jinpu.wang@ionos.com Signed-off-by: Song Liu --- drivers/md/raid1.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a68c9cccbf0d2..23d211969565e 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -789,11 +789,17 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect return best_disk; } +static void wake_up_barrier(struct r1conf *conf) +{ + if (wq_has_sleeper(&conf->wait_barrier)) + wake_up(&conf->wait_barrier); +} + static void flush_bio_list(struct r1conf *conf, struct bio *bio) { /* flush any pending bitmap writes to disk before proceeding w/ I/O */ raid1_prepare_flush_writes(conf->mddev->bitmap); - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; @@ -970,7 +976,7 @@ static bool _wait_barrier(struct r1conf *conf, int idx, bool nowait) * In case freeze_array() is waiting for * get_unqueued_pending() == extra */ - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); /* Wait for the barrier in same barrier unit bucket to drop.
*/ /* Return false when nowait flag is set */ @@ -1013,7 +1019,7 @@ static bool wait_read_barrier(struct r1conf *conf, sector_t sector_nr, bool nowa * In case freeze_array() is waiting for * get_unqueued_pending() == extra */ - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); /* Wait for array to be unfrozen */ /* Return false when nowait flag is set */ @@ -1042,7 +1048,7 @@ static bool wait_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait) static void _allow_barrier(struct r1conf *conf, int idx) { atomic_dec(&conf->nr_pending[idx]); - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); } static void allow_barrier(struct r1conf *conf, sector_t sector_nr) @@ -1171,7 +1177,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) spin_lock_irq(&conf->device_lock); bio_list_merge(&conf->pending_bio_list, &plug->pending); spin_unlock_irq(&conf->device_lock); - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); md_wakeup_thread(mddev->thread); kfree(plug); return; @@ -1574,7 +1580,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, r1_bio_write_done(r1_bio); /* In case raid1d snuck in to freeze_array */ - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); } static bool raid1_make_request(struct mddev *mddev, struct bio *bio) From 7e85c41b9e1df9192c225afd7cfec8dcad137feb Mon Sep 17 00:00:00 2001 From: Li Nan Date: Sat, 1 Jul 2023 16:05:27 +0800 Subject: [PATCH 054/113] md/raid10: check replacement and rdev to prevent submit the same io twice After commit 4ca40c2ce099 ("md/raid10: Allow replacement device to be replace old drive."), 'rdev' and 'replacement' could appear to be identical. There are already checks for that in wait_blocked_dev() and raid10_write_request(). Add the check to raid10_handle_discard() now. Signed-off-by: Li Nan Link: https://lore.kernel.org/r/20230701080529.2684932-2-linan666@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid10.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 757687fb90a7a..60963449d3f50 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1785,6 +1785,8 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) r10_bio->devs[disk].bio = NULL; r10_bio->devs[disk].repl_bio = NULL; + if (rdev == rrdev) + rrdev = NULL; if (rdev && (test_bit(Faulty, &rdev->flags))) rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags))) From b99f8fd2d91eb734f13098aa1cf337edaca454b7 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Sat, 1 Jul 2023 16:05:28 +0800 Subject: [PATCH 055/113] md/raid10: factor out dereference_rdev_and_rrdev() Factor out a helper to get 'rdev' and 'replacement' from conf->mirrors. This just makes the code cleaner and prepares to fix the bug of io loss while 'replacement' replaces 'rdev'. There is no functional change.
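The barrier in the new helper pairs with the write side of a replacement promotion. A sketch of that write side (modelled on raid10's existing promotion code; 'p' is the raid10_info being updated): because 'rdev' is published before 'replacement' is cleared, a reader that loads 'replacement' first may see both pointers name the same rdev, but can never see both as NULL.

	p->rdev = p->replacement;	/* promote the replacement */
	smp_mb();			/* pairs with smp_mb() in the helper */
	p->replacement = NULL;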
Signed-off-by: Li Nan Link: https://lore.kernel.org/r/20230701080529.2684932-3-linan666@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid10.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 60963449d3f50..d21aeb9546d9f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1321,6 +1321,25 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, } } +static struct md_rdev *dereference_rdev_and_rrdev(struct raid10_info *mirror, + struct md_rdev **prrdev) +{ + struct md_rdev *rdev, *rrdev; + + rrdev = rcu_dereference(mirror->replacement); + /* + * Read replacement first to prevent reading both rdev and + * replacement as NULL during replacement replace rdev. + */ + smp_mb(); + rdev = rcu_dereference(mirror->rdev); + if (rdev == rrdev) + rrdev = NULL; + + *prrdev = rrdev; + return rdev; +} + static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) { int i; @@ -1464,15 +1483,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, int d = r10_bio->devs[i].devnum; struct md_rdev *rdev, *rrdev; - rrdev = rcu_dereference(conf->mirrors[d].replacement); - /* - * Read replacement first to prevent reading both rdev and - * replacement as NULL during replacement replace rdev. - */ - smp_mb(); - rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev == rrdev) - rrdev = NULL; + rdev = dereference_rdev_and_rrdev(&conf->mirrors[d], &rrdev); if (rdev && (test_bit(Faulty, &rdev->flags))) rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags))) From 673643490b9a0eb3b25633abe604f62b8f63dba1 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Sat, 1 Jul 2023 16:05:29 +0800 Subject: [PATCH 056/113] md/raid10: use dereference_rdev_and_rrdev() to get devices Commit 2ae6aaf76912 ("md/raid10: fix io loss while replacement replace rdev") reads replacement first to prevent io loss. However, the same issue exists in wait_blocked_dev() and raid10_handle_discard(), too. Fix it by using dereference_rdev_and_rrdev() to get devices.
Fixes: d30588b2731f ("md/raid10: improve raid10 discard request") Fixes: f2e7e269a752 ("md/raid10: pull the code that wait for blocked dev into one function") Signed-off-by: Li Nan Link: https://lore.kernel.org/r/20230701080529.2684932-4-linan666@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid10.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index d21aeb9546d9f..16aa9d735880a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1350,11 +1350,9 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) blocked_rdev = NULL; rcu_read_lock(); for (i = 0; i < conf->copies; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); - struct md_rdev *rrdev = rcu_dereference( - conf->mirrors[i].replacement); - if (rdev == rrdev) - rrdev = NULL; + struct md_rdev *rdev, *rrdev; + + rdev = dereference_rdev_and_rrdev(&conf->mirrors[i], &rrdev); if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { atomic_inc(&rdev->nr_pending); blocked_rdev = rdev; @@ -1789,15 +1787,12 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) */ rcu_read_lock(); for (disk = 0; disk < geo->raid_disks; disk++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); - struct md_rdev *rrdev = rcu_dereference( - conf->mirrors[disk].replacement); + struct md_rdev *rdev, *rrdev; + rdev = dereference_rdev_and_rrdev(&conf->mirrors[disk], &rrdev); r10_bio->devs[disk].bio = NULL; r10_bio->devs[disk].repl_bio = NULL; - if (rdev == rrdev) - rrdev = NULL; if (rdev && (test_bit(Faulty, &rdev->flags))) rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags))) From b4d129640f194ffc4cc64c3e97f98ae944c072e8 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 6 Jul 2023 16:37:26 +0800 Subject: [PATCH 057/113] md/md-bitmap: remove unnecessary local variable in backlog_store() The local variable is already defined at the beginning of backlog_store(); there is no need to define it again. Fixes: 8c13ab115b57 ("md/bitmap: don't set max_write_behind if there is no write mostly device") Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20230706083727.608914-2-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index a58a4c30265e6..1f33399611592 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2569,8 +2569,6 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) mddev_destroy_serial_pool(mddev, NULL, false); } else if (backlog && !mddev->serial_info_pool) { /* serial_info_pool is needed since backlog is not zero */ - struct md_rdev *rdev; - rdev_for_each(rdev, mddev) mddev_create_serial_pool(mddev, rdev, false); } From 44abfa6a95df425c0660d56043020b67e6d93ab8 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 6 Jul 2023 16:37:27 +0800 Subject: [PATCH 058/113] md/md-bitmap: hold 'reconfig_mutex' in backlog_store() Several reasons why 'reconfig_mutex' should be held: 1) rdev_for_each() is not safe to be called without the lock, because rdev can be removed concurrently. 2) mddev_destroy_serial_pool() and mddev_create_serial_pool() should not be called concurrently. 3) mddev_suspend() from mddev_destroy/create_serial_pool() should be protected by the lock.
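Condensed from the diff that follows, the resulting store-side pattern: take the lock up front (mddev_lock() is interruptible and can fail), and make every early return after that point unlock.

	rv = mddev_lock(mddev);
	if (rv)
		return rv;
	/* rdev_for_each() is now safe against concurrent removal, and
	 * the serial pool create/destroy calls are serialized. */
	...
	mddev_unlock(mddev);	/* on every return path */
	return len;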
Fixes: 10c92fca636e ("md-bitmap: create and destroy wb_info_pool with the change of backlog") Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20230706083727.608914-3-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 1f33399611592..6f9ff14971f98 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2546,6 +2546,10 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) if (backlog > COUNTER_MAX) return -EINVAL; + rv = mddev_lock(mddev); + if (rv) + return rv; + /* * Without write mostly device, it doesn't make sense to set * backlog for max_write_behind. @@ -2559,6 +2563,7 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) if (!has_write_mostly) { pr_warn_ratelimited("%s: can't set backlog, no write mostly device available\n", mdname(mddev)); + mddev_unlock(mddev); return -EINVAL; } @@ -2574,6 +2579,8 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) } if (old_mwb != backlog) md_bitmap_update_sb(mddev->bitmap); + + mddev_unlock(mddev); return len; } From 4a8b719f95c0dcd15fb7a04b806ad8139fa7c850 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Aug 2023 19:21:56 +0200 Subject: [PATCH 059/113] fs: remove emergency_thaw_bdev Fold emergency_thaw_bdev into its only caller, to prepare for buffer.c to be built only when buffer_head support is enabled. Signed-off-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20230801172201.1923299-2-hch@lst.de Signed-off-by: Jens Axboe --- fs/buffer.c | 6 ------ fs/internal.h | 6 ------ fs/super.c | 4 +++- 3 files changed, 3 insertions(+), 13 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index bd091329026c0..376f468e16662 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -562,12 +562,6 @@ static int osync_buffers_list(spinlock_t *lock, struct list_head *list) return err; } -void emergency_thaw_bdev(struct super_block *sb) -{ - while (sb->s_bdev && !thaw_bdev(sb->s_bdev)) - printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev); -} - /** * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers * @mapping: the mapping which wants those buffers written diff --git a/fs/internal.h b/fs/internal.h index f7a3dc1110264..d538d832fd608 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -23,16 +23,10 @@ struct mnt_idmap; */ #ifdef CONFIG_BLOCK extern void __init bdev_cache_init(void); - -void emergency_thaw_bdev(struct super_block *sb); #else static inline void bdev_cache_init(void) { } -static inline int emergency_thaw_bdev(struct super_block *sb) -{ - return 0; -} #endif /* CONFIG_BLOCK */ /* diff --git a/fs/super.c b/fs/super.c index e781226e28800..bc666e7ee1a98 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1029,7 +1029,9 @@ static void do_thaw_all_callback(struct super_block *sb) { down_write(&sb->s_umount); if (sb->s_root && sb->s_flags & SB_BORN) { - emergency_thaw_bdev(sb); + if (IS_ENABLED(CONFIG_BLOCK)) + while (sb->s_bdev && !thaw_bdev(sb->s_bdev)) + pr_warn("Emergency Thaw on %pg\n", sb->s_bdev); thaw_super_locked(sb); } else { up_write(&sb->s_umount); From 2ba39cc46bfe463cb9673bf62a04c4c21942f1f2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Aug 2023 19:21:57 +0200 Subject: [PATCH 060/113] fs: rename and move block_page_mkwrite_return block_page_mkwrite_return is neither block nor
mkwrite specific, and should not be under CONFIG_BLOCK. Move it to mm.h and rename it to vmf_fs_error. Signed-off-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20230801172201.1923299-3-hch@lst.de Signed-off-by: Jens Axboe --- fs/ext4/inode.c | 2 +- fs/f2fs/file.c | 2 +- fs/gfs2/file.c | 16 ++++++++-------- fs/iomap/buffered-io.c | 2 +- fs/nilfs2/file.c | 2 +- fs/udf/file.c | 2 +- include/linux/buffer_head.h | 12 ------------ include/linux/mm.h | 18 ++++++++++++++++++ 8 files changed, 31 insertions(+), 25 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 43775a6ca5054..6eea0886b8855 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -6140,7 +6140,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_alloc; out_ret: - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); out: filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(inode->i_sb); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 093039dee9920..9b3871fb9bfc4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -159,7 +159,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) sb_end_pagefault(inode->i_sb); err: - return block_page_mkwrite_return(err); + return vmf_fs_error(err); } static const struct vm_operations_struct f2fs_file_vm_ops = { diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 1bf3c4453516f..897ef62d6d77a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -432,7 +432,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); err = gfs2_glock_nq(&gh); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_uninit; } @@ -474,7 +474,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) err = gfs2_rindex_update(sdp); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_unlock; } @@ -482,12 +482,12 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) ap.target = data_blocks + ind_blocks; err = gfs2_quota_lock_check(ip, &ap); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_unlock; } err = gfs2_inplace_reserve(ip, &ap); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_quota_unlock; } @@ -500,7 +500,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) } err = gfs2_trans_begin(sdp, rblocks, 0); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_trans_fail; } @@ -508,7 +508,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) if (gfs2_is_stuffed(ip)) { err = gfs2_unstuff_dinode(ip); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_trans_end; } } @@ -524,7 +524,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) err = gfs2_allocate_page_backing(page, length); if (err) - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); out_page_locked: if (ret != VM_FAULT_LOCKED) @@ -558,7 +558,7 @@ static vm_fault_t gfs2_fault(struct vm_fault *vmf) gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); err = gfs2_glock_nq(&gh); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_uninit; } ret = filemap_fault(vmf); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index adb92cdb24b00..0607790827b48 100644 --- 
a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1286,7 +1286,7 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) return VM_FAULT_LOCKED; out_unlock: folio_unlock(folio); - return block_page_mkwrite_return(ret); + return vmf_fs_error(ret); } EXPORT_SYMBOL_GPL(iomap_page_mkwrite); diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index a9eb3487efb2c..740ce26d1e765 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -108,7 +108,7 @@ static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf) wait_for_stable_page(page); out: sb_end_pagefault(inode->i_sb); - return block_page_mkwrite_return(ret); + return vmf_fs_error(ret); } static const struct vm_operations_struct nilfs_file_vm_ops = { diff --git a/fs/udf/file.c b/fs/udf/file.c index 243840dc83add..c0e2080e639ee 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -67,7 +67,7 @@ static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf) err = block_commit_write(page, 0, end); if (err < 0) { unlock_page(page); - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_unlock; } out_dirty: diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 6cb3e9af78c9e..7002a9ff63a3d 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -291,18 +291,6 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size); int block_commit_write(struct page *page, unsigned from, unsigned to); int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); -/* Convert errno to return value from ->page_mkwrite() call */ -static inline vm_fault_t block_page_mkwrite_return(int err) -{ - if (err == 0) - return VM_FAULT_LOCKED; - if (err == -EFAULT || err == -EAGAIN) - return VM_FAULT_NOPAGE; - if (err == -ENOMEM) - return VM_FAULT_OOM; - /* -ENOSPC, -EDQUOT, -EIO ... */ - return VM_FAULT_SIGBUS; -} sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); int block_truncate_page(struct address_space *, loff_t, get_block_t *); diff --git a/include/linux/mm.h b/include/linux/mm.h index 2dd73e4f3d8e3..75777eae1c9c2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3386,6 +3386,24 @@ static inline vm_fault_t vmf_error(int err) return VM_FAULT_SIGBUS; } +/* + * Convert errno to return value for ->page_mkwrite() calls. + * + * This should eventually be merged with vmf_error() above, but will need a + * careful audit of all vmf_error() callers. + */ +static inline vm_fault_t vmf_fs_error(int err) +{ + if (err == 0) + return VM_FAULT_LOCKED; + if (err == -EFAULT || err == -EAGAIN) + return VM_FAULT_NOPAGE; + if (err == -ENOMEM) + return VM_FAULT_OOM; + /* -ENOSPC, -EDQUOT, -EIO ... */ + return VM_FAULT_SIGBUS; +} + struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags); From 727cfe976758b79f8d2f8051c75a5ccb14539a56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Aug 2023 19:21:58 +0200 Subject: [PATCH 061/113] block: open code __generic_file_write_iter for blkdev writes Open code __generic_file_write_iter to remove the indirect call into ->direct_IO and to prepare using the iomap based write code. 
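The shape of the open-coded path, condensed from the diff below: privileges and timestamps are handled first, then either the direct path with a buffered fallback for any unwritten tail, or plain buffered writes.

	ret = file_remove_privs(file);
	if (!ret)
		ret = file_update_time(file);
	if (ret)
		return ret;

	if (iocb->ki_flags & IOCB_DIRECT) {
		ret = blkdev_direct_write(iocb, from);
		if (ret >= 0 && iov_iter_count(from))
			ret = direct_write_fallback(iocb, from, ret,
					generic_perform_write(iocb, from));
	} else {
		ret = generic_perform_write(iocb, from);
	}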
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Christian Brauner Reviewed-by: Hannes Reinecke Reviewed-by: Luis Chamberlain Link: https://lore.kernel.org/r/20230801172201.1923299-4-hch@lst.de Signed-off-by: Jens Axboe --- block/fops.c | 45 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/block/fops.c b/block/fops.c index a286bf3325c5d..8a05d99166e3b 100644 --- a/block/fops.c +++ b/block/fops.c @@ -533,6 +533,30 @@ static int blkdev_release(struct inode *inode, struct file *filp) return 0; } +static ssize_t +blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from) +{ + size_t count = iov_iter_count(from); + ssize_t written; + + written = kiocb_invalidate_pages(iocb, count); + if (written) { + if (written == -EBUSY) + return 0; + return written; + } + + written = blkdev_direct_IO(iocb, from); + if (written > 0) { + kiocb_invalidate_post_direct_write(iocb, count); + iocb->ki_pos += written; + count -= written; + } + if (written != -EIOCBQUEUED) + iov_iter_revert(from, count - iov_iter_count(from)); + return written; +} + /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device. @@ -542,7 +566,8 @@ static int blkdev_release(struct inode *inode, struct file *filp) */ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); + struct file *file = iocb->ki_filp; + struct block_device *bdev = I_BDEV(file->f_mapping->host); struct inode *bd_inode = bdev->bd_inode; loff_t size = bdev_nr_bytes(bdev); size_t shorted = 0; @@ -569,7 +594,23 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) iov_iter_truncate(from, size); } - ret = __generic_file_write_iter(iocb, from); + ret = file_remove_privs(file); + if (ret) + return ret; + + ret = file_update_time(file); + if (ret) + return ret; + + if (iocb->ki_flags & IOCB_DIRECT) { + ret = blkdev_direct_write(iocb, from); + if (ret >= 0 && iov_iter_count(from)) + ret = direct_write_fallback(iocb, from, ret, + generic_perform_write(iocb, from)); + } else { + ret = generic_perform_write(iocb, from); + } + if (ret > 0) ret = generic_write_sync(iocb, ret); iov_iter_reexpand(from, iov_iter_count(from) + shorted); From a05f7bd9578b17521a9a5f3689f3934c082c6390 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Aug 2023 19:21:59 +0200 Subject: [PATCH 062/113] block: stop setting ->direct_IO Direct I/O on block devices now never goes through aops->direct_IO. Stop setting it and set FMODE_CAN_ODIRECT in ->open instead. Signed-off-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230801172201.1923299-5-hch@lst.de Signed-off-by: Jens Axboe --- block/fops.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/fops.c b/block/fops.c index 8a05d99166e3b..f0b822c28ddfe 100644 --- a/block/fops.c +++ b/block/fops.c @@ -428,7 +428,6 @@ const struct address_space_operations def_blk_aops = { .writepage = blkdev_writepage, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, - .direct_IO = blkdev_direct_IO, .migrate_folio = buffer_migrate_folio_norefs, .is_dirty_writeback = buffer_check_dirty_writeback, }; @@ -505,7 +504,7 @@ static int blkdev_open(struct inode *inode, struct file *filp) * during an unstable branch.
*/ filp->f_flags |= O_LARGEFILE; - filp->f_mode |= FMODE_BUF_RASYNC; + filp->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; /* * Use the file private data to store the holder for exclusive openes. From 487c607df790d366e67a7d6a30adf785cdd98e55 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Aug 2023 19:22:00 +0200 Subject: [PATCH 063/113] block: use iomap for writes to block devices Use iomap in buffer_head compat mode to write to block devices. Signed-off-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Reviewed-by: Pankaj Raghav Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20230801172201.1923299-6-hch@lst.de Signed-off-by: Jens Axboe --- block/Kconfig | 1 + block/fops.c | 31 +++++++++++++++++++++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/block/Kconfig b/block/Kconfig index 86122e459fe04..1a13ef0b1ca10 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -5,6 +5,7 @@ menuconfig BLOCK bool "Enable the block layer" if EXPERT default y + select FS_IOMAP select SBITMAP help Provide block layer support for the kernel. diff --git a/block/fops.c b/block/fops.c index f0b822c28ddfe..063ece37d44e4 100644 --- a/block/fops.c +++ b/block/fops.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "blk.h" @@ -386,6 +387,27 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); } +static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, struct iomap *srcmap) +{ + struct block_device *bdev = I_BDEV(inode); + loff_t isize = i_size_read(inode); + + iomap->bdev = bdev; + iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); + if (iomap->offset >= isize) + return -EIO; + iomap->type = IOMAP_MAPPED; + iomap->addr = iomap->offset; + iomap->length = isize - iomap->offset; + iomap->flags |= IOMAP_F_BUFFER_HEAD; + return 0; +} + +static const struct iomap_ops blkdev_iomap_ops = { + .iomap_begin = blkdev_iomap_begin, +}; + static int blkdev_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, blkdev_get_block, wbc); @@ -556,6 +578,11 @@ blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from) return written; } +static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from) +{ + return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops); +} + /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device. @@ -605,9 +632,9 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = blkdev_direct_write(iocb, from); if (ret >= 0 && iov_iter_count(from)) ret = direct_write_fallback(iocb, from, ret, - generic_perform_write(iocb, from)); + blkdev_buffered_write(iocb, from)); } else { - ret = generic_perform_write(iocb, from); + ret = blkdev_buffered_write(iocb, from); } if (ret > 0) From 925c86a19bacf8ce10eb666328fb3fa5aff7b951 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Aug 2023 19:22:01 +0200 Subject: [PATCH 064/113] fs: add CONFIG_BUFFER_HEAD Add a new config option that controls building the buffer_head code, and select it from all file systems and stacking drivers that need it. 
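A sketch of the iomap tweak referred to in the next paragraph (the exact bit value here is illustrative): when CONFIG_BUFFER_HEAD is disabled the flag collapses to 0, so a statement like "iomap->flags |= IOMAP_F_BUFFER_HEAD" compiles to a no-op and never routes I/O into fs/buffer.c.

	#ifdef CONFIG_BUFFER_HEAD
	#define IOMAP_F_BUFFER_HEAD	(1U << 4)
	#else
	#define IOMAP_F_BUFFER_HEAD	0
	#endif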
For the block device nodes, an alternative iomap based buffered I/O path is provided when buffer_head support is not enabled, and iomap needs a small tweak to define the IOMAP_F_BUFFER_HEAD flag to 0 to not call into the buffer_head code when it doesn't exist. Otherwise this is just Kconfig and ifdef changes. Signed-off-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230801172201.1923299-7-hch@lst.de Signed-off-by: Jens Axboe --- block/fops.c | 70 ++++++++++++++++++++++++++++++------ drivers/md/Kconfig | 1 + fs/Kconfig | 4 +++ fs/Makefile | 2 +- fs/adfs/Kconfig | 1 + fs/affs/Kconfig | 1 + fs/befs/Kconfig | 1 + fs/bfs/Kconfig | 1 + fs/efs/Kconfig | 1 + fs/exfat/Kconfig | 1 + fs/ext2/Kconfig | 1 + fs/ext4/Kconfig | 1 + fs/f2fs/Kconfig | 1 + fs/fat/Kconfig | 1 + fs/freevxfs/Kconfig | 1 + fs/gfs2/Kconfig | 1 + fs/hfs/Kconfig | 1 + fs/hfsplus/Kconfig | 1 + fs/hpfs/Kconfig | 1 + fs/isofs/Kconfig | 1 + fs/jfs/Kconfig | 1 + fs/minix/Kconfig | 1 + fs/nilfs2/Kconfig | 1 + fs/ntfs/Kconfig | 1 + fs/ntfs3/Kconfig | 1 + fs/ocfs2/Kconfig | 1 + fs/omfs/Kconfig | 1 + fs/qnx4/Kconfig | 1 + fs/qnx6/Kconfig | 1 + fs/reiserfs/Kconfig | 1 + fs/sysv/Kconfig | 1 + fs/udf/Kconfig | 1 + fs/ufs/Kconfig | 1 + include/linux/buffer_head.h | 32 ++++++++--------- include/linux/iomap.h | 4 +++ include/trace/events/block.h | 2 ++ mm/migrate.c | 4 +-- 37 files changed, 119 insertions(+), 29 deletions(-) diff --git a/block/fops.c b/block/fops.c index 063ece37d44e4..eaa98a987213d 100644 --- a/block/fops.c +++ b/block/fops.c @@ -24,15 +24,6 @@ static inline struct inode *bdev_file_inode(struct file *file) return file->f_mapping->host; } -static int blkdev_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) -{ - bh->b_bdev = I_BDEV(inode); - bh->b_blocknr = iblock; - set_buffer_mapped(bh); - return 0; -} - static blk_opf_t dio_bio_write_op(struct kiocb *iocb) { blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; @@ -400,7 +391,7 @@ static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->type = IOMAP_MAPPED; iomap->addr = iomap->offset; iomap->length = isize - iomap->offset; - iomap->flags |= IOMAP_F_BUFFER_HEAD; + iomap->flags |= IOMAP_F_BUFFER_HEAD; /* noop for !CONFIG_BUFFER_HEAD */ return 0; } @@ -408,6 +399,16 @@ static const struct iomap_ops blkdev_iomap_ops = { .iomap_begin = blkdev_iomap_begin, }; +#ifdef CONFIG_BUFFER_HEAD +static int blkdev_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + bh->b_bdev = I_BDEV(inode); + bh->b_blocknr = iblock; + set_buffer_mapped(bh); + return 0; +} + static int blkdev_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, blkdev_get_block, wbc); @@ -453,6 +454,55 @@ const struct address_space_operations def_blk_aops = { .migrate_folio = buffer_migrate_folio_norefs, .is_dirty_writeback = buffer_check_dirty_writeback, }; +#else /* CONFIG_BUFFER_HEAD */ +static int blkdev_read_folio(struct file *file, struct folio *folio) +{ + return iomap_read_folio(folio, &blkdev_iomap_ops); +} + +static void blkdev_readahead(struct readahead_control *rac) +{ + iomap_readahead(rac, &blkdev_iomap_ops); +} + +static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc, + struct inode *inode, loff_t offset) +{ + loff_t isize = i_size_read(inode); + + if (WARN_ON_ONCE(offset >= isize)) + return -EIO; + if (offset >= wpc->iomap.offset && + offset < wpc->iomap.offset + wpc->iomap.length)
return 0; + return blkdev_iomap_begin(inode, offset, isize - offset, + IOMAP_WRITE, &wpc->iomap, NULL); +} + +static const struct iomap_writeback_ops blkdev_writeback_ops = { + .map_blocks = blkdev_map_blocks, +}; + +static int blkdev_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct iomap_writepage_ctx wpc = { }; + + return iomap_writepages(mapping, wbc, &wpc, &blkdev_writeback_ops); +} + +const struct address_space_operations def_blk_aops = { + .dirty_folio = filemap_dirty_folio, + .release_folio = iomap_release_folio, + .invalidate_folio = iomap_invalidate_folio, + .read_folio = blkdev_read_folio, + .readahead = blkdev_readahead, + .writepages = blkdev_writepages, + .is_partially_uptodate = iomap_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, + .migrate_folio = filemap_migrate_folio, +}; +#endif /* CONFIG_BUFFER_HEAD */ /* * for a block special file file_inode(file)->i_size is zero diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 444517d1a2336..2a8b081bce7dd 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -15,6 +15,7 @@ if MD config BLK_DEV_MD tristate "RAID support" select BLOCK_HOLDER_DEPRECATED if SYSFS + select BUFFER_HEAD # BLOCK_LEGACY_AUTOLOAD requirement should be removed # after relevant mdadm enhancements - to make "names=yes" # the default - are widely available. diff --git a/fs/Kconfig b/fs/Kconfig index 18d034ec79539..e8b17c81b83a8 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -18,8 +18,12 @@ config VALIDATE_FS_PARSER config FS_IOMAP bool +config BUFFER_HEAD + bool + # old blockdev_direct_IO implementation. Use iomap for new code instead config LEGACY_DIRECT_IO + depends on BUFFER_HEAD bool if BLOCK diff --git a/fs/Makefile b/fs/Makefile index e513aaee0603a..f9541f40be4e0 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -17,7 +17,7 @@ obj-y := open.o read_write.o file_table.o super.o \ fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ kernel_read_file.o mnt_idmapping.o remap_range.o -obj-$(CONFIG_BLOCK) += buffer.o mpage.o +obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o obj-$(CONFIG_PROC_FS) += proc_namespace.o obj-$(CONFIG_LEGACY_DIRECT_IO) += direct-io.o obj-y += notify/ diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig index 44738fed66251..1b97058f0c4a9 100644 --- a/fs/adfs/Kconfig +++ b/fs/adfs/Kconfig @@ -2,6 +2,7 @@ config ADFS_FS tristate "ADFS file system support" depends on BLOCK + select BUFFER_HEAD help The Acorn Disc Filing System is the standard file system of the RiscOS operating system which runs on Acorn's ARM-based Risc PC diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig index 962b86374e1c1..1ae432d266c32 100644 --- a/fs/affs/Kconfig +++ b/fs/affs/Kconfig @@ -2,6 +2,7 @@ config AFFS_FS tristate "Amiga FFS file system support" depends on BLOCK + select BUFFER_HEAD select LEGACY_DIRECT_IO help The Fast File System (FFS) is the common file system used on hard diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig index 9550b6462b814..5fcfc4024ffe6 100644 --- a/fs/befs/Kconfig +++ b/fs/befs/Kconfig @@ -2,6 +2,7 @@ config BEFS_FS tristate "BeOS file system (BeFS) support (read only)" depends on BLOCK + select BUFFER_HEAD select NLS help The BeOS File System (BeFS) is the native file system of Be, Inc's diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig index 3a757805b5856..8e7ef866b62a6 100644 --- a/fs/bfs/Kconfig +++ b/fs/bfs/Kconfig @@ -2,6 +2,7 @@ config BFS_FS tristate "BFS file system support" depends on BLOCK + select BUFFER_HEAD help Boot File System (BFS) is a file system used under SCO 
UnixWare to allow the bootloader access to the kernel image and other important diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig index 2df1bac8b375b..0833e533df9d5 100644 --- a/fs/efs/Kconfig +++ b/fs/efs/Kconfig @@ -2,6 +2,7 @@ config EFS_FS tristate "EFS file system support (read only)" depends on BLOCK + select BUFFER_HEAD help EFS is an older file system used for non-ISO9660 CD-ROMs and hard disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer diff --git a/fs/exfat/Kconfig b/fs/exfat/Kconfig index 147edeb044691..cbeca8e44d9b3 100644 --- a/fs/exfat/Kconfig +++ b/fs/exfat/Kconfig @@ -2,6 +2,7 @@ config EXFAT_FS tristate "exFAT filesystem support" + select BUFFER_HEAD select NLS select LEGACY_DIRECT_IO help diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig index 77393fda99af0..74d98965902e1 100644 --- a/fs/ext2/Kconfig +++ b/fs/ext2/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config EXT2_FS tristate "Second extended fs support" + select BUFFER_HEAD select FS_IOMAP select LEGACY_DIRECT_IO help diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 86699c8cab281..e20d59221fc05 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -28,6 +28,7 @@ config EXT3_FS_SECURITY config EXT4_FS tristate "The Extended 4 (ext4) filesystem" + select BUFFER_HEAD select JBD2 select CRC16 select CRYPTO diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 03ef087537c7c..68a1e23e1557c 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -2,6 +2,7 @@ config F2FS_FS tristate "F2FS filesystem support" depends on BLOCK + select BUFFER_HEAD select NLS select CRYPTO select CRYPTO_CRC32 diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig index afe83b4e71728..25fae1c83725b 100644 --- a/fs/fat/Kconfig +++ b/fs/fat/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config FAT_FS tristate + select BUFFER_HEAD select NLS select LEGACY_DIRECT_IO help diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig index 0e2fc08f7de49..912107ebea6f4 100644 --- a/fs/freevxfs/Kconfig +++ b/fs/freevxfs/Kconfig @@ -2,6 +2,7 @@ config VXFS_FS tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)" depends on BLOCK + select BUFFER_HEAD help FreeVxFS is a file system driver that support the VERITAS VxFS(TM) file system format. 
VERITAS VxFS(TM) is the standard file system diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index 03c966840422e..be7f87a8e11ae 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config GFS2_FS tristate "GFS2 file system support" + select BUFFER_HEAD select FS_POSIX_ACL select CRC32 select LIBCRC32C diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig index d985066006d58..5ea5cd8ecea9c 100644 --- a/fs/hfs/Kconfig +++ b/fs/hfs/Kconfig @@ -2,6 +2,7 @@ config HFS_FS tristate "Apple Macintosh file system support" depends on BLOCK + select BUFFER_HEAD select NLS select LEGACY_DIRECT_IO help diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig index 8034e7827a690..8ce4a33a9ac78 100644 --- a/fs/hfsplus/Kconfig +++ b/fs/hfsplus/Kconfig @@ -2,6 +2,7 @@ config HFSPLUS_FS tristate "Apple Extended HFS file system support" depends on BLOCK + select BUFFER_HEAD select NLS select NLS_UTF8 select LEGACY_DIRECT_IO diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig index ec975f4668775..ac1e9318e65a4 100644 --- a/fs/hpfs/Kconfig +++ b/fs/hpfs/Kconfig @@ -2,6 +2,7 @@ config HPFS_FS tristate "OS/2 HPFS file system support" depends on BLOCK + select BUFFER_HEAD select FS_IOMAP help OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS diff --git a/fs/isofs/Kconfig b/fs/isofs/Kconfig index 08ffd37b9bb8f..51434f2a471b0 100644 --- a/fs/isofs/Kconfig +++ b/fs/isofs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config ISO9660_FS tristate "ISO 9660 CDROM file system support" + select BUFFER_HEAD help This is the standard file system used on CD-ROMs. It was previously known as "High Sierra File System" and is called "hsfs" on other diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig index 51e856f0e4b8d..17488440eef1a 100644 --- a/fs/jfs/Kconfig +++ b/fs/jfs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config JFS_FS tristate "JFS filesystem support" + select BUFFER_HEAD select NLS select CRC32 select LEGACY_DIRECT_IO diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig index de2003974ff0d..90ddfad2a75e8 100644 --- a/fs/minix/Kconfig +++ b/fs/minix/Kconfig @@ -2,6 +2,7 @@ config MINIX_FS tristate "Minix file system support" depends on BLOCK + select BUFFER_HEAD help Minix is a simple operating system used in many classes about OS's. The minix file system (method to organize files on a hard disk diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig index 7d59567465e12..7dae168e346e3 100644 --- a/fs/nilfs2/Kconfig +++ b/fs/nilfs2/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config NILFS2_FS tristate "NILFS2 file system support" + select BUFFER_HEAD select CRC32 select LEGACY_DIRECT_IO help diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig index f93e69a612833..7b2509741735a 100644 --- a/fs/ntfs/Kconfig +++ b/fs/ntfs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config NTFS_FS tristate "NTFS file system support" + select BUFFER_HEAD select NLS help NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003. 
diff --git a/fs/ntfs3/Kconfig b/fs/ntfs3/Kconfig index 96cc236f7f7bd..cdfdf51e55d79 100644 --- a/fs/ntfs3/Kconfig +++ b/fs/ntfs3/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config NTFS3_FS tristate "NTFS Read-Write file system support" + select BUFFER_HEAD select NLS select LEGACY_DIRECT_IO help diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig index 3123da7cfb301..2514d36cbe015 100644 --- a/fs/ocfs2/Kconfig +++ b/fs/ocfs2/Kconfig @@ -2,6 +2,7 @@ config OCFS2_FS tristate "OCFS2 file system support" depends on INET && SYSFS && CONFIGFS_FS + select BUFFER_HEAD select JBD2 select CRC32 select QUOTA diff --git a/fs/omfs/Kconfig b/fs/omfs/Kconfig index 42b2ec35a05bf..8470f6c3e64e6 100644 --- a/fs/omfs/Kconfig +++ b/fs/omfs/Kconfig @@ -2,6 +2,7 @@ config OMFS_FS tristate "SonicBlue Optimized MPEG File System support" depends on BLOCK + select BUFFER_HEAD select CRC_ITU_T help This is the proprietary file system used by the Rio Karma music diff --git a/fs/qnx4/Kconfig b/fs/qnx4/Kconfig index 45b5b98376c43..a2eb826e76c60 100644 --- a/fs/qnx4/Kconfig +++ b/fs/qnx4/Kconfig @@ -2,6 +2,7 @@ config QNX4FS_FS tristate "QNX4 file system support (read only)" depends on BLOCK + select BUFFER_HEAD help This is the file system used by the real-time operating systems QNX 4 and QNX 6 (the latter is also called QNX RTP). diff --git a/fs/qnx6/Kconfig b/fs/qnx6/Kconfig index 6a9d6bce15862..8e865d72204e7 100644 --- a/fs/qnx6/Kconfig +++ b/fs/qnx6/Kconfig @@ -2,6 +2,7 @@ config QNX6FS_FS tristate "QNX6 file system support (read only)" depends on BLOCK && CRC32 + select BUFFER_HEAD help This is the file system used by the real-time operating systems QNX 6 (also called QNX RTP). diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig index 4d22ecfe0fab6..0e6fe26458fed 100644 --- a/fs/reiserfs/Kconfig +++ b/fs/reiserfs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config REISERFS_FS tristate "Reiserfs support (deprecated)" + select BUFFER_HEAD select CRC32 select LEGACY_DIRECT_IO help diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig index b4e23e03fbeba..67b3f90afbfd6 100644 --- a/fs/sysv/Kconfig +++ b/fs/sysv/Kconfig @@ -2,6 +2,7 @@ config SYSV_FS tristate "System V/Xenix/V7/Coherent file system support" depends on BLOCK + select BUFFER_HEAD help SCO, Xenix and Coherent are commercial Unix systems for Intel machines, and Version 7 was used on the DEC PDP-11. Saying Y diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig index 82e8bfa2dfd98..8f7ce30d47fdc 100644 --- a/fs/udf/Kconfig +++ b/fs/udf/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config UDF_FS tristate "UDF file system support" + select BUFFER_HEAD select CRC_ITU_T select NLS select LEGACY_DIRECT_IO diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig index 6d30adb6b890f..9301e7ecd0921 100644 --- a/fs/ufs/Kconfig +++ b/fs/ufs/Kconfig @@ -2,6 +2,7 @@ config UFS_FS tristate "UFS file system support (read only)" depends on BLOCK + select BUFFER_HEAD help BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD, OpenBSD and NeXTstep) use a file system called UFS. 
Some System V diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 7002a9ff63a3d..c89ef50d5112f 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -16,8 +16,6 @@ #include #include -#ifdef CONFIG_BLOCK - enum bh_state_bits { BH_Uptodate, /* Contains valid data */ BH_Dirty, /* Is dirty */ @@ -198,7 +196,6 @@ void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset); void folio_set_bh(struct buffer_head *bh, struct folio *folio, unsigned long offset); -bool try_to_free_buffers(struct folio *); struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, bool retry); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, @@ -213,10 +210,6 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate); /* Things to do with buffers at mapping->private_list */ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode); -int inode_has_buffers(struct inode *); -void invalidate_inode_buffers(struct inode *); -int remove_inode_buffers(struct inode *inode); -int sync_mapping_buffers(struct address_space *mapping); int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end, bool datasync); int generic_buffers_fsync(struct file *file, loff_t start, loff_t end, @@ -240,9 +233,6 @@ void __bforget(struct buffer_head *); void __breadahead(struct block_device *, sector_t block, unsigned int size); struct buffer_head *__bread_gfp(struct block_device *, sector_t block, unsigned size, gfp_t gfp); -void invalidate_bh_lrus(void); -void invalidate_bh_lrus_cpu(void); -bool has_bh_in_lru(int cpu, void *dummy); struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); void free_buffer_head(struct buffer_head * bh); void unlock_buffer(struct buffer_head *bh); @@ -258,8 +248,6 @@ int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait); void __bh_read_batch(int nr, struct buffer_head *bhs[], blk_opf_t op_flags, bool force_lock); -extern int buffer_heads_over_limit; - /* * Generic address_space_operations implementations for buffer_head-backed * address_spaces. 
@@ -304,8 +292,6 @@ extern int buffer_migrate_folio_norefs(struct address_space *, #define buffer_migrate_folio_norefs NULL #endif -void buffer_init(void); - /* * inline definitions */ @@ -465,7 +451,20 @@ __bread(struct block_device *bdev, sector_t block, unsigned size) bool block_dirty_folio(struct address_space *mapping, struct folio *folio); -#else /* CONFIG_BLOCK */ +#ifdef CONFIG_BUFFER_HEAD + +void buffer_init(void); +bool try_to_free_buffers(struct folio *folio); +int inode_has_buffers(struct inode *inode); +void invalidate_inode_buffers(struct inode *inode); +int remove_inode_buffers(struct inode *inode); +int sync_mapping_buffers(struct address_space *mapping); +void invalidate_bh_lrus(void); +void invalidate_bh_lrus_cpu(void); +bool has_bh_in_lru(int cpu, void *dummy); +extern int buffer_heads_over_limit; + +#else /* CONFIG_BUFFER_HEAD */ static inline void buffer_init(void) {} static inline bool try_to_free_buffers(struct folio *folio) { return true; } @@ -473,9 +472,10 @@ static inline int inode_has_buffers(struct inode *inode) { return 0; } static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } +static inline void invalidate_bh_lrus(void) {} static inline void invalidate_bh_lrus_cpu(void) {} static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; } #define buffer_heads_over_limit 0 -#endif /* CONFIG_BLOCK */ +#endif /* CONFIG_BUFFER_HEAD */ #endif /* _LINUX_BUFFER_HEAD_H */ diff --git a/include/linux/iomap.h b/include/linux/iomap.h index e2b836c2e119a..54f50d34fd9d4 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -58,7 +58,11 @@ struct vm_fault; #define IOMAP_F_DIRTY (1U << 1) #define IOMAP_F_SHARED (1U << 2) #define IOMAP_F_MERGED (1U << 3) +#ifdef CONFIG_BUFFER_HEAD #define IOMAP_F_BUFFER_HEAD (1U << 4) +#else +#define IOMAP_F_BUFFER_HEAD 0 +#endif /* CONFIG_BUFFER_HEAD */ #define IOMAP_F_XATTR (1U << 5) /* diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 40e60c33cc6f3..0e128ad514601 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -12,6 +12,7 @@ #define RWBS_LEN 8 +#ifdef CONFIG_BUFFER_HEAD DECLARE_EVENT_CLASS(block_buffer, TP_PROTO(struct buffer_head *bh), @@ -61,6 +62,7 @@ DEFINE_EVENT(block_buffer, block_dirty_buffer, TP_ARGS(bh) ); +#endif /* CONFIG_BUFFER_HEAD */ /** * block_rq_requeue - place block IO request back on a queue diff --git a/mm/migrate.c b/mm/migrate.c index 24baad2571e31..fe6f8d454aff8 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -684,7 +684,7 @@ int migrate_folio(struct address_space *mapping, struct folio *dst, } EXPORT_SYMBOL(migrate_folio); -#ifdef CONFIG_BLOCK +#ifdef CONFIG_BUFFER_HEAD /* Returns true if all buffers are successfully locked */ static bool buffer_migrate_lock_buffers(struct buffer_head *head, enum migrate_mode mode) @@ -837,7 +837,7 @@ int buffer_migrate_folio_norefs(struct address_space *mapping, return __buffer_migrate_folio(mapping, dst, src, mode, true); } EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs); -#endif +#endif /* CONFIG_BUFFER_HEAD */ int filemap_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) From a24c8b5111a139b54fd02fec23bc21789135cd2e Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 3 Aug 2023 21:07:38 +0800 Subject: [PATCH 065/113] fs/Kconfig: Fix compile error for romfs There are some 
compile errors reported by kernel test robot: arm-linux-gnueabi-ld: fs/romfs/storage.o: in function `romfs_dev_read': storage.c:(.text+0x64): undefined reference to `__brelse' arm-linux-gnueabi-ld: storage.c:(.text+0x9c): undefined reference to `__bread_gfp' arm-linux-gnueabi-ld: fs/romfs/storage.o: in function `romfs_dev_strnlen': storage.c:(.text+0x128): undefined reference to `__brelse' arm-linux-gnueabi-ld: storage.c:(.text+0x16c): undefined reference to `__bread_gfp' arm-linux-gnueabi-ld: fs/romfs/storage.o: in function `romfs_dev_strcmp': storage.c:(.text+0x22c): undefined reference to `__bread_gfp' arm-linux-gnueabi-ld: storage.c:(.text+0x27c): undefined reference to `__brelse' arm-linux-gnueabi-ld: storage.c:(.text+0x2a8): undefined reference to `__bread_gfp' arm-linux-gnueabi-ld: storage.c:(.text+0x2bc): undefined reference to `__brelse' arm-linux-gnueabi-ld: storage.c:(.text+0x2d4): undefined reference to `__brelse' arm-linux-gnueabi-ld: storage.c:(.text+0x2f4): undefined reference to `__brelse' arm-linux-gnueabi-ld: storage.c:(.text+0x304): undefined reference to `__brelse' The reason for the problem is that commit 925c86a19bac ("fs: add CONFIG_BUFFER_HEAD") added a new config option, CONFIG_BUFFER_HEAD, that controls building the buffer_head code. romfs needs the buffer_head API, but no corresponding select has been added. Select BUFFER_HEAD in the romfs Kconfig to resolve the problem. Fixes: 925c86a19bac ("fs: add CONFIG_BUFFER_HEAD") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202308031810.pQzGmR1v-lkp@intel.com/ Reviewed-by: Luis Chamberlain Tested-by: Li Zetao Signed-off-by: Li Zetao [axboe: fold in Christoph's incremental] Signed-off-by: Jens Axboe --- fs/romfs/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig index 8eb87008b55ac..f24a96a331af1 100644 --- a/fs/romfs/Kconfig +++ b/fs/romfs/Kconfig @@ -57,6 +57,7 @@ endchoice config ROMFS_ON_BLOCK bool default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH + select BUFFER_HEAD config ROMFS_ON_MTD bool From 2eae9c4912b6cfdfadcd4fa8ac26879e18a504a1 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 4 Aug 2023 14:50:37 +0800 Subject: [PATCH 066/113] iocost_monitor: fix kernel queue kobj changes When I use iocost_monitor on nvme0n1, this error shows up: "Could not find ioc for nvme0n1" There is no kobj in struct request_queue in recent kernels: commit 2bd85221a625 ("block: untangle request_queue refcounting from sysfs") moved the queue kobj to struct gendisk. Fix it by using mq_kobj, which is at the same level as the queue kobj.
Signed-off-by: Chengming Zhou Link: https://lore.kernel.org/r/20230804065039.8885-1-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- tools/cgroup/iocost_monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cgroup/iocost_monitor.py b/tools/cgroup/iocost_monitor.py index 0dbbc67400fcb..7aa076cb559e0 100644 --- a/tools/cgroup/iocost_monitor.py +++ b/tools/cgroup/iocost_monitor.py @@ -221,7 +221,7 @@ def table_row_str(self, path): for i, ptr in radix_tree_for_each(blkcg_root.blkg_tree.address_of_()): blkg = drgn.Object(prog, 'struct blkcg_gq', address=ptr) try: - if devname == blkg.q.kobj.parent.name.string_().decode('utf-8'): + if devname == blkg.q.mq_kobj.parent.name.string_().decode('utf-8'): q_id = blkg.q.id.value_() if blkg.pd[plid]: root_iocg = container_of(blkg.pd[plid], 'struct ioc_gq', 'pd') From 8e93c1acd15e6a754c19ef12f6e69641f37e267a Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 4 Aug 2023 14:50:38 +0800 Subject: [PATCH 067/113] iocost_monitor: print vrate inuse along with base_vrate The real vrate in use by iocost is not base_vrate but the atomic vtime_rate. We need the iocost_monitor tool to display the real vrate that iocost uses, to check whether the boosted, compensated vrate is normal. Effect after change:

nvme0n1 RUN per=50.0ms cur_per=172116.580:v1040587.433 busy= +0 \
        vrate=135.00%:270.00% params=ssd_dfl(CQ)
                 ^
                 |
        this is real vrate inuse

Signed-off-by: Chengming Zhou Link: https://lore.kernel.org/r/20230804065039.8885-2-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- tools/cgroup/iocost_monitor.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/cgroup/iocost_monitor.py b/tools/cgroup/iocost_monitor.py index 7aa076cb559e0..52ae9d1595b21 100644 --- a/tools/cgroup/iocost_monitor.py +++ b/tools/cgroup/iocost_monitor.py @@ -100,6 +100,7 @@ def __init__(self, ioc): self.period_at = ioc.period_at.value_() / 1_000_000 self.vperiod_at = ioc.period_at_vtime.value_() / VTIME_PER_SEC self.vrate_pct = ioc.vtime_base_rate.value_() * 100 / VTIME_PER_USEC + self.ivrate_pct = ioc.vtime_rate.counter.value_() * 100 / VTIME_PER_USEC self.busy_level = ioc.busy_level.value_() self.autop_idx = ioc.autop_idx.value_() self.user_cost_model = ioc.user_cost_model.value_() @@ -119,7 +120,9 @@ def dict(self, now): 'period_at' : self.period_at, 'period_vtime_at' : self.vperiod_at, 'busy_level' : self.busy_level, - 'vrate_pct' : self.vrate_pct, } + 'vrate_pct' : self.vrate_pct, + 'ivrate_pct' : self.ivrate_pct, + } def table_preamble_str(self): state = ('RUN' if self.running else 'IDLE') if self.enabled else 'OFF' @@ -127,7 +130,7 @@ def table_preamble_str(self): f'per={self.period_ms}ms ' \ f'cur_per={self.period_at:.3f}:v{self.vperiod_at:.3f} ' \ f'busy={self.busy_level:+3} ' \ - f'vrate={self.vrate_pct:6.2f}% ' \ + f'vrate={self.vrate_pct:6.2f}%:{self.ivrate_pct:6.2f}% ' \ f'params={self.autop_name}' if self.user_cost_model or self.user_qos_params: output += f'({"C" if self.user_cost_model else ""}{"Q" if self.user_qos_params else ""})' From 68392b002023cb6dadd3d5044268470a7201b313 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 4 Aug 2023 14:50:39 +0800 Subject: [PATCH 068/113] iocost_monitor: improve it by adding iocg wait_ms The iocg can have three throttled metrics: wait, debt, delay. This patch adds the missing wait_ms to IocgStat to show the latest wait_ms of an iocg. While at it, group the iocg usage percents "inflt%" and "usage%" together, and group the iocg throttled metrics "wait", "debt" and "delay" together.
Effect after changes:

nvme0n1 RUN per=50.0ms cur_per=177105.713:v1053528.587 busy= +0 vrate=135.00%:270.00% params=ssd_dfl(CQ)
                   active weight    hweight%  inflt% usage%   wait    debt   delay
InterfererGroup0     *   100/ 100  54.28/ 9.09  0.34  24.07   0.00    0.00    0.00
interfered           *    84/ 1000 45.72/ 90.91 0.48  41.09   0.00    0.00    0.00

Signed-off-by: Chengming Zhou Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20230804065039.8885-3-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- tools/cgroup/iocost_monitor.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/cgroup/iocost_monitor.py b/tools/cgroup/iocost_monitor.py index 52ae9d1595b21..933c750b319b7 100644 --- a/tools/cgroup/iocost_monitor.py +++ b/tools/cgroup/iocost_monitor.py @@ -138,7 +138,7 @@ def table_preamble_str(self): def table_header_str(self): return f'{"":25} active {"weight":>9} {"hweight%":>13} {"inflt%":>6} ' \ - f'{"debt":>7} {"delay":>7} {"usage%"}' + f'{"usage%":>6} {"wait":>7} {"debt":>7} {"delay":>7}' class IocgStat: def __init__(self, iocg): @@ -164,6 +164,8 @@ def __init__(self, iocg): self.usage = (100 * iocg.usage_delta_us.value_() / ioc.period_us.value_()) if self.active else 0 + self.wait_ms = (iocg.stat.wait_us.value_() - + iocg.last_stat.wait_us.value_()) / 1000 self.debt_ms = iocg.abs_vdebt.value_() / VTIME_PER_USEC / 1000 if blkg.use_delay.counter.value_() != 0: self.delay_ms = blkg.delay_nsec.counter.value_() / 1_000_000 @@ -180,9 +182,10 @@ def dict(self, now, path): 'hweight_active_pct' : self.hwa_pct, 'hweight_inuse_pct' : self.hwi_pct, 'inflight_pct' : self.inflight_pct, + 'usage_pct' : self.usage, + 'wait_ms' : self.wait_ms, 'debt_ms' : self.debt_ms, 'delay_ms' : self.delay_ms, - 'usage_pct' : self.usage, 'address' : self.address } return out @@ -192,9 +195,10 @@ def table_row_str(self, path): f'{round(self.inuse):5}/{round(self.active):5} ' \ f'{self.hwi_pct:6.2f}/{self.hwa_pct:6.2f} ' \ f'{self.inflight_pct:6.2f} ' \ + f'{min(self.usage, 999):6.2f} ' \ + f'{self.wait_ms:7.2f} ' \ f'{self.debt_ms:7.2f} ' \ - f'{self.delay_ms:7.2f} '\ - f'{min(self.usage, 999):6.2f}' + f'{self.delay_ms:7.2f}' out = out.rstrip(':') return out From 9d4ed6d46272bb60036a6539aae74eafcbbb3d2d Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Fri, 4 Aug 2023 13:46:08 +0200 Subject: [PATCH 069/113] ublk: add helper to check if device supports user copy This will be used by ublk zoned storage support.
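As an aside for readers following along, the helper pattern this patch introduces can be sketched in a few lines of self-contained userspace C: wrap the flags test in a named inline helper so call sites read as intent rather than bit arithmetic. The UBLK_F_USER_COPY value matches the UAPI excerpt quoted later in this series; the UBLK_F_NEED_GET_DATA bit position and all demo_* names are assumptions made for this illustration, not kernel definitions.

/*
 * Minimal sketch of a feature-flag helper (userspace, illustrative only).
 * UBLK_F_NEED_GET_DATA's bit is assumed here for the demo.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define UBLK_F_NEED_GET_DATA (1UL << 2) /* assumed bit, demo only */
#define UBLK_F_USER_COPY     (1UL << 7) /* matches the UAPI excerpt below */

struct demo_dev_info { uint64_t flags; };
struct demo_dev { struct demo_dev_info dev_info; };

static inline bool demo_dev_is_user_copy(const struct demo_dev *ub)
{
	return ub->dev_info.flags & UBLK_F_USER_COPY;
}

int main(void)
{
	struct demo_dev ub = { .dev_info.flags =
		UBLK_F_USER_COPY | UBLK_F_NEED_GET_DATA };

	/* GET_DATA isn't needed any more with USER_COPY */
	if (demo_dev_is_user_copy(&ub))
		ub.dev_info.flags &= ~UBLK_F_NEED_GET_DATA;

	printf("flags after setup: %#llx\n",
	       (unsigned long long)ub.dev_info.flags);
	return 0;
}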
Signed-off-by: Andreas Hindborg Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230804114610.179530-2-nmi@metaspace.dk Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 1c823750c95af..8d271901efac6 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -185,6 +185,11 @@ struct ublk_params_header { __u32 types; }; +static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_USER_COPY; +} + static inline void __ublk_complete_rq(struct request *req); static void ublk_complete_rq(struct kref *ref); @@ -2037,7 +2042,7 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) UBLK_F_URING_CMD_COMP_IN_TASK; /* GET_DATA isn't needed any more with USER_COPY */ - if (ub->dev_info.flags & UBLK_F_USER_COPY) + if (ublk_dev_is_user_copy(ub)) ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; /* We are not ready to support zero copy */ From 1a6e88b9593b63ccdfe1d84e3f99dd91e4f8d490 Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Fri, 4 Aug 2023 13:46:09 +0200 Subject: [PATCH 070/113] ublk: move check for empty address field on command submission In preparation for zoned storage support, move the check for an empty `addr` field into the command handler case statement. Note that the check makes no sense for `UBLK_IO_NEED_GET_DATA` because the `addr` field must always be set for this command. Signed-off-by: Andreas Hindborg Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20230804114610.179530-3-nmi@metaspace.dk Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 8d271901efac6..0e38475796106 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1419,11 +1419,6 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA)) goto out; - if (ublk_support_user_copy(ubq) && ub_cmd->addr) { - ret = -EINVAL; - goto out; - } - ret = ublk_check_cmd_op(cmd_op); if (ret) goto out; @@ -1450,6 +1445,10 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, */ if (!ub_cmd->addr && !ublk_need_get_data(ubq)) goto out; + } else if (ub_cmd->addr) { + /* User copy requires addr to be unset */ + ret = -EINVAL; + goto out; } ublk_fill_io_cmd(io, cmd, ub_cmd->addr); @@ -1469,7 +1468,12 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, if (!ub_cmd->addr && (!ublk_need_get_data(ubq) || req_op(req) == REQ_OP_READ)) goto out; + } else if (ub_cmd->addr) { + /* User copy requires addr to be unset */ + ret = -EINVAL; + goto out; } + ublk_fill_io_cmd(io, cmd, ub_cmd->addr); ublk_commit_completion(ub, ub_cmd); break; From 29802d7ca33bc0a75c9da2a143eeed4f9e99fca4 Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Fri, 4 Aug 2023 13:46:10 +0200 Subject: [PATCH 071/113] ublk: enable zoned storage support Add zoned storage support to ublk: report_zones and the following operations:
- REQ_OP_ZONE_OPEN
- REQ_OP_ZONE_CLOSE
- REQ_OP_ZONE_FINISH
- REQ_OP_ZONE_RESET
- REQ_OP_ZONE_APPEND
The zone append feature uses the `addr` field of `struct ublksrv_io_cmd` to communicate the allocated LBA (ALBA) back to the kernel. Therefore ublk must be used with the user copy feature (UBLK_F_USER_COPY) for zoned storage support to be available. Without this feature, ublk will not allow zoned storage support.
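Before the implementation below, here is a minimal sketch of how that reused `addr` union looks from the ublk server's side when completing a zone append. The struct is a trimmed, hypothetical stand-in for the real `struct ublksrv_io_cmd` quoted in this patch's UAPI diff; field widths and the sample tag/result values are simplified for the demo.

/*
 * Illustrative-only model of the addr/zone_append_lba union
 * (userspace demo, not the real UAPI type).
 */
#include <stdint.h>
#include <stdio.h>

struct demo_io_cmd {
	uint16_t q_id;
	uint16_t tag;
	int32_t  result;              /* bytes transferred, or -errno */
	union {
		uint64_t addr;            /* FETCH*: daemon buffer address */
		uint64_t zone_append_lba; /* COMMIT of a zone append */
	};
};

int main(void)
{
	/* A ublk server completing a zone append written at sector 4096: */
	struct demo_io_cmd cmd = {
		.q_id = 0,
		.tag = 7,
		.result = 4096 * 512,     /* whole request written */
		.zone_append_lba = 4096,
	};

	printf("tag %u: appended at LBA %llu\n", (unsigned)cmd.tag,
	       (unsigned long long)cmd.zone_append_lba);
	return 0;
}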
Signed-off-by: Andreas Hindborg Reviewed-by: Ming Lei Tested-by: Ming Lei Link: https://lore.kernel.org/r/20230804114610.179530-4-nmi@metaspace.dk Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 331 ++++++++++++++++++++++++++++++++-- include/uapi/linux/ublk_cmd.h | 63 ++++++- 2 files changed, 370 insertions(+), 24 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 0e38475796106..b60394fe7be69 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -56,16 +56,21 @@ | UBLK_F_USER_RECOVERY_REISSUE \ | UBLK_F_UNPRIVILEGED_DEV \ | UBLK_F_CMD_IOCTL_ENCODE \ - | UBLK_F_USER_COPY) + | UBLK_F_USER_COPY \ + | UBLK_F_ZONED) /* All UBLK_PARAM_TYPE_* should be included here */ -#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \ - UBLK_PARAM_TYPE_DISCARD | UBLK_PARAM_TYPE_DEVT) +#define UBLK_PARAM_TYPE_ALL \ + (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \ + UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED) struct ublk_rq_data { struct llist_node node; struct kref ref; + __u64 sector; + __u32 operation; + __u32 nr_zones; }; struct ublk_uring_cmd_pdu { @@ -185,11 +190,263 @@ struct ublk_params_header { __u32 types; }; +static inline unsigned int ublk_req_build_flags(struct request *req); +static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, + int tag); + static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub) { return ub->dev_info.flags & UBLK_F_USER_COPY; } +static inline bool ublk_dev_is_zoned(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_ZONED; +} + +static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_ZONED; +} + +#ifdef CONFIG_BLK_DEV_ZONED + +static int ublk_get_nr_zones(const struct ublk_device *ub) +{ + const struct ublk_param_basic *p = &ub->params.basic; + + /* Zone size is a power of 2 */ + return p->dev_sectors >> ilog2(p->chunk_sectors); +} + +static int ublk_revalidate_disk_zones(struct ublk_device *ub) +{ + return blk_revalidate_disk_zones(ub->ub_disk, NULL); +} + +static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) +{ + const struct ublk_param_zoned *p = &ub->params.zoned; + int nr_zones; + + if (!ublk_dev_is_zoned(ub)) + return -EINVAL; + + if (!p->max_zone_append_sectors) + return -EINVAL; + + nr_zones = ublk_get_nr_zones(ub); + + if (p->max_active_zones > nr_zones) + return -EINVAL; + + if (p->max_open_zones > nr_zones) + return -EINVAL; + + return 0; +} + +static int ublk_dev_param_zoned_apply(struct ublk_device *ub) +{ + const struct ublk_param_zoned *p = &ub->params.zoned; + + disk_set_zoned(ub->ub_disk, BLK_ZONED_HM); + blk_queue_required_elevator_features(ub->ub_disk->queue, + ELEVATOR_F_ZBD_SEQ_WRITE); + disk_set_max_active_zones(ub->ub_disk, p->max_active_zones); + disk_set_max_open_zones(ub->ub_disk, p->max_open_zones); + blk_queue_max_zone_append_sectors(ub->ub_disk->queue, p->max_zone_append_sectors); + + ub->ub_disk->nr_zones = ublk_get_nr_zones(ub); + + return 0; +} + +/* Based on virtblk_alloc_report_buffer */ +static void *ublk_alloc_report_buffer(struct ublk_device *ublk, + unsigned int nr_zones, size_t *buflen) +{ + struct request_queue *q = ublk->ub_disk->queue; + size_t bufsize; + void *buf; + + nr_zones = min_t(unsigned int, nr_zones, + ublk->ub_disk->nr_zones); + + bufsize = nr_zones * sizeof(struct blk_zone); + bufsize = + min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT); + + while (bufsize >= sizeof(struct blk_zone)) { + buf = kvmalloc(bufsize, GFP_KERNEL | 
__GFP_NORETRY); + if (buf) { + *buflen = bufsize; + return buf; + } + bufsize >>= 1; + } + + *buflen = 0; + return NULL; +} + +static int ublk_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct ublk_device *ub = disk->private_data; + unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors; + unsigned int first_zone = sector >> ilog2(zone_size_sectors); + unsigned int done_zones = 0; + unsigned int max_zones_per_request; + int ret; + struct blk_zone *buffer; + size_t buffer_length; + + nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone, + nr_zones); + + buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length); + if (!buffer) + return -ENOMEM; + + max_zones_per_request = buffer_length / sizeof(struct blk_zone); + + while (done_zones < nr_zones) { + unsigned int remaining_zones = nr_zones - done_zones; + unsigned int zones_in_request = + min_t(unsigned int, remaining_zones, max_zones_per_request); + struct request *req; + struct ublk_rq_data *pdu; + blk_status_t status; + + memset(buffer, 0, buffer_length); + + req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } + + pdu = blk_mq_rq_to_pdu(req); + pdu->operation = UBLK_IO_OP_REPORT_ZONES; + pdu->sector = sector; + pdu->nr_zones = zones_in_request; + + ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length, + GFP_KERNEL); + if (ret) { + blk_mq_free_request(req); + goto out; + } + + status = blk_execute_rq(req, 0); + ret = blk_status_to_errno(status); + blk_mq_free_request(req); + if (ret) + goto out; + + for (unsigned int i = 0; i < zones_in_request; i++) { + struct blk_zone *zone = buffer + i; + + /* A zero length zone means no more zones in this response */ + if (!zone->len) + break; + + ret = cb(zone, i, data); + if (ret) + goto out; + + done_zones++; + sector += zone_size_sectors; + + } + } + + ret = done_zones; + +out: + kvfree(buffer); + return ret; +} + +static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, + struct request *req) +{ + struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); + struct ublk_io *io = &ubq->ios[req->tag]; + struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req); + u32 ublk_op; + + switch (req_op(req)) { + case REQ_OP_ZONE_OPEN: + ublk_op = UBLK_IO_OP_ZONE_OPEN; + break; + case REQ_OP_ZONE_CLOSE: + ublk_op = UBLK_IO_OP_ZONE_CLOSE; + break; + case REQ_OP_ZONE_FINISH: + ublk_op = UBLK_IO_OP_ZONE_FINISH; + break; + case REQ_OP_ZONE_RESET: + ublk_op = UBLK_IO_OP_ZONE_RESET; + break; + case REQ_OP_ZONE_APPEND: + ublk_op = UBLK_IO_OP_ZONE_APPEND; + break; + case REQ_OP_DRV_IN: + ublk_op = pdu->operation; + switch (ublk_op) { + case UBLK_IO_OP_REPORT_ZONES: + iod->op_flags = ublk_op | ublk_req_build_flags(req); + iod->nr_zones = pdu->nr_zones; + iod->start_sector = pdu->sector; + return BLK_STS_OK; + default: + return BLK_STS_IOERR; + } + case REQ_OP_ZONE_RESET_ALL: + case REQ_OP_DRV_OUT: + /* We do not support reset_all and drv_out */ + return BLK_STS_NOTSUPP; + default: + return BLK_STS_IOERR; + } + + iod->op_flags = ublk_op | ublk_req_build_flags(req); + iod->nr_sectors = blk_rq_sectors(req); + iod->start_sector = blk_rq_pos(req); + iod->addr = io->addr; + + return BLK_STS_OK; +} + +#else + +#define ublk_report_zones (NULL) + +static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) +{ + return -EOPNOTSUPP; +} + +static int ublk_dev_param_zoned_apply(struct ublk_device *ub) +{ + return -EOPNOTSUPP; +} + +static int 
ublk_revalidate_disk_zones(struct ublk_device *ub) +{ + return 0; +} + +static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, + struct request *req) +{ + return -EOPNOTSUPP; +} + +#endif + static inline void __ublk_complete_rq(struct request *req); static void ublk_complete_rq(struct kref *ref); @@ -286,6 +543,9 @@ static int ublk_validate_params(const struct ublk_device *ub) if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9)) return -EINVAL; + + if (ublk_dev_is_zoned(ub) && !p->chunk_sectors) + return -EINVAL; } else return -EINVAL; @@ -304,6 +564,11 @@ static int ublk_validate_params(const struct ublk_device *ub) if (ub->params.types & UBLK_PARAM_TYPE_DEVT) return -EINVAL; + if (ub->params.types & UBLK_PARAM_TYPE_ZONED) + return ublk_dev_param_zoned_validate(ub); + else if (ublk_dev_is_zoned(ub)) + return -EINVAL; + return 0; } @@ -317,6 +582,9 @@ static int ublk_apply_params(struct ublk_device *ub) if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) ublk_dev_param_discard_apply(ub); + if (ub->params.types & UBLK_PARAM_TYPE_ZONED) + return ublk_dev_param_zoned_apply(ub); + return 0; } @@ -487,6 +755,7 @@ static const struct block_device_operations ub_fops = { .owner = THIS_MODULE, .open = ublk_open, .free_disk = ublk_free_disk, + .report_zones = ublk_report_zones, }; #define UBLK_MAX_PIN_PAGES 32 @@ -601,7 +870,8 @@ static inline bool ublk_need_map_req(const struct request *req) static inline bool ublk_need_unmap_req(const struct request *req) { - return ublk_rq_has_data(req) && req_op(req) == REQ_OP_READ; + return ublk_rq_has_data(req) && + (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN); } static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, @@ -685,8 +955,13 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) { struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); struct ublk_io *io = &ubq->ios[req->tag]; + enum req_op op = req_op(req); u32 ublk_op; + if (!ublk_queue_is_zoned(ubq) && + (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND)) + return -EIO; + switch (req_op(req)) { case REQ_OP_READ: ublk_op = UBLK_IO_OP_READ; @@ -704,6 +979,8 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) ublk_op = UBLK_IO_OP_WRITE_ZEROES; break; default: + if (ublk_queue_is_zoned(ubq)) + return ublk_setup_iod_zoned(ubq, req); return BLK_STS_IOERR; } @@ -756,7 +1033,8 @@ static inline void __ublk_complete_rq(struct request *req) * * Both the two needn't unmap. 
*/ - if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) + if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE && + req_op(req) != REQ_OP_DRV_IN) goto exit; /* for READ request, writing data in iod->addr to rq buffers */ @@ -1120,6 +1398,9 @@ static void ublk_commit_completion(struct ublk_device *ub, /* find the io request and complete */ req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag); + if (req_op(req) == REQ_OP_ZONE_APPEND) + req->__sector = ub_cmd->zone_append_lba; + if (req && likely(!blk_should_fake_timeout(req->q))) ublk_put_req_ref(ubq, req); } @@ -1468,8 +1749,11 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, if (!ub_cmd->addr && (!ublk_need_get_data(ubq) || req_op(req) == REQ_OP_READ)) goto out; - } else if (ub_cmd->addr) { - /* User copy requires addr to be unset */ + } else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) { + /* + * User copy requires addr to be unset when command is + * not zone append + */ ret = -EINVAL; goto out; } @@ -1546,11 +1830,14 @@ static inline bool ublk_check_ubuf_dir(const struct request *req, int ubuf_dir) { /* copy ubuf to request pages */ - if (req_op(req) == REQ_OP_READ && ubuf_dir == ITER_SOURCE) + if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) && + ubuf_dir == ITER_SOURCE) return true; /* copy request pages to ubuf */ - if (req_op(req) == REQ_OP_WRITE && ubuf_dir == ITER_DEST) + if ((req_op(req) == REQ_OP_WRITE || + req_op(req) == REQ_OP_ZONE_APPEND) && + ubuf_dir == ITER_DEST) return true; return false; @@ -1889,17 +2176,24 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) get_device(&ub->cdev_dev); ub->dev_info.state = UBLK_S_DEV_LIVE; + + if (ublk_dev_is_zoned(ub)) { + ret = ublk_revalidate_disk_zones(ub); + if (ret) + goto out_put_cdev; + } + ret = add_disk(disk); + if (ret) + goto out_put_cdev; + + set_bit(UB_STATE_USED, &ub->state); + +out_put_cdev: if (ret) { - /* - * Has to drop the reference since ->free_disk won't be - * called in case of add_disk failure. - */ ub->dev_info.state = UBLK_S_DEV_DEAD; ublk_put_device(ub); - goto out_put_disk; } - set_bit(UB_STATE_USED, &ub->state); out_put_disk: if (ret) put_disk(disk); @@ -2049,6 +2343,13 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) if (ublk_dev_is_user_copy(ub)) ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; + /* Zoned storage support requires user copy feature */ + if (ublk_dev_is_zoned(ub) && + (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !ublk_dev_is_user_copy(ub))) { + ret = -EINVAL; + goto out_free_dev_number; + } + /* We are not ready to support zero copy */ ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 4b8558db90e1e..2685e53e47521 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -176,6 +176,12 @@ /* Copy between request and user buffer by pread()/pwrite() */ #define UBLK_F_USER_COPY (1UL << 7) +/* + * User space sets this flag when setting up the device to request zoned storage support. Kernel may + * deny the request by returning an error. 
+ */ +#define UBLK_F_ZONED (1ULL << 8) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 @@ -232,9 +238,26 @@ struct ublksrv_ctrl_dev_info { #define UBLK_IO_OP_READ 0 #define UBLK_IO_OP_WRITE 1 #define UBLK_IO_OP_FLUSH 2 -#define UBLK_IO_OP_DISCARD 3 -#define UBLK_IO_OP_WRITE_SAME 4 -#define UBLK_IO_OP_WRITE_ZEROES 5 +#define UBLK_IO_OP_DISCARD 3 +#define UBLK_IO_OP_WRITE_SAME 4 +#define UBLK_IO_OP_WRITE_ZEROES 5 +#define UBLK_IO_OP_ZONE_OPEN 10 +#define UBLK_IO_OP_ZONE_CLOSE 11 +#define UBLK_IO_OP_ZONE_FINISH 12 +#define UBLK_IO_OP_ZONE_APPEND 13 +#define UBLK_IO_OP_ZONE_RESET 15 +/* + * Construct a zone report. The report request is carried in `struct + * ublksrv_io_desc`. The `start_sector` field must be the first sector of a zone + * and shall indicate the first zone of the report. The `nr_zones` shall + * indicate how many zones should be reported at most. The report shall be + * delivered as a `struct blk_zone` array. To report fewer zones than requested, + * zero the last entry of the returned array. + * + * Related definitions(blk_zone, blk_zone_cond, blk_zone_type, ...) in + * include/uapi/linux/blkzoned.h are part of ublk UAPI. + */ +#define UBLK_IO_OP_REPORT_ZONES 18 #define UBLK_IO_F_FAILFAST_DEV (1U << 8) #define UBLK_IO_F_FAILFAST_TRANSPORT (1U << 9) @@ -255,7 +278,10 @@ struct ublksrv_io_desc { /* op: bit 0-7, flags: bit 8-31 */ __u32 op_flags; - __u32 nr_sectors; + union { + __u32 nr_sectors; + __u32 nr_zones; /* for UBLK_IO_OP_REPORT_ZONES */ + }; /* start sector for this io */ __u64 start_sector; @@ -284,11 +310,21 @@ struct ublksrv_io_cmd { /* io result, it is valid for COMMIT* command only */ __s32 result; - /* - * userspace buffer address in ublksrv daemon process, valid for - * FETCH* command only - */ - __u64 addr; + union { + /* + * userspace buffer address in ublksrv daemon process, valid for + * FETCH* command only + * + * `addr` should not be used when UBLK_F_USER_COPY is enabled, + * because userspace handles data copy by pread()/pwrite() over + * /dev/ublkcN. 
But in case of UBLK_F_ZONED, this union is + * re-used to pass back the allocated LBA for + * UBLK_IO_OP_ZONE_APPEND which actually depends on + * UBLK_F_USER_COPY + */ + __u64 addr; + __u64 zone_append_lba; + }; }; struct ublk_param_basic { @@ -331,6 +367,13 @@ struct ublk_param_devt { __u32 disk_minor; }; +struct ublk_param_zoned { + __u32 max_open_zones; + __u32 max_active_zones; + __u32 max_zone_append_sectors; + __u8 reserved[20]; +}; + struct ublk_params { /* * Total length of parameters, userspace has to set 'len' for both @@ -342,11 +385,13 @@ struct ublk_params { #define UBLK_PARAM_TYPE_BASIC (1 << 0) #define UBLK_PARAM_TYPE_DISCARD (1 << 1) #define UBLK_PARAM_TYPE_DEVT (1 << 2) +#define UBLK_PARAM_TYPE_ZONED (1 << 3) __u32 types; /* types of parameter included */ struct ublk_param_basic basic; struct ublk_param_discard discard; struct ublk_param_devt devt; + struct ublk_param_zoned zoned; }; #endif From d47f9717e5cfd0dd8c0ba2ecfa47c38d140f1bb6 Mon Sep 17 00:00:00 2001 From: Zhiguo Niu Date: Thu, 3 Aug 2023 19:12:42 +0800 Subject: [PATCH 072/113] block/mq-deadline: use correct way to throttle write requests The original formula was inaccurate: dd->async_depth = max(1UL, 3 * q->nr_requests / 4); For write requests, when we assign a tag from sched_tags, data->shallow_depth will be passed to sbitmap_find_bit, see the following code: nr = sbitmap_find_bit_in_word(&sb->map[index], min_t(unsigned int, __map_depth(sb, index), depth), alloc_hint, wrap); The smaller of data->shallow_depth and __map_depth(sb, index) will be used as the maximum range when allocating bits. For an mmc device (one hw queue, deadline I/O scheduler): q->nr_requests = sched_tags = 128, so according to the previous calculation method, dd->async_depth = data->shallow_depth = 96. On a 64-bit platform with 8 CPUs, sched_tags.bitmap_tags.sb.shift = 5 and sb.maps[] = 32/32/32/32; since 32 is smaller than 96, whether it is a read or a write I/O, tags can be allocated over the full word each time, which has no throttling effect. In addition, referring to the methods of the bfq/kyber I/O schedulers, limit ratios are calculated based on sched_tags.bitmap_tags.sb.shift: with shift = 5 the new formula gives dd->async_depth = 3 * (1 << 5) / 4 = 24, which is below the 32-bit word depth and therefore actually limits allocation. This patch makes write-request throttling really take effect. Fixes: 07757588e507 ("block/mq-deadline: Reserve 25% of scheduler tags for synchronous requests") Signed-off-by: Zhiguo Niu Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/1691061162-22898-1-git-send-email-zhiguo.niu@unisoc.com Signed-off-by: Jens Axboe --- block/mq-deadline.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 02a916ba62ee7..f958e79277b8b 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -646,8 +646,9 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; + unsigned int shift = tags->bitmap_tags.sb.shift; - dd->async_depth = max(1UL, 3 * q->nr_requests / 4); + dd->async_depth = max(1U, 3 * (1U << shift) / 4); sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth); } From 7c8998f75d2d42ddefb172239b0f689392958309 Mon Sep 17 00:00:00 2001 From: Jinyoung Choi Date: Thu, 3 Aug 2023 11:48:27 +0900 Subject: [PATCH 073/113] block: make bvec_try_merge_hw_page() non-static This will be used for multi-page configuration for integrity payload. Cc: Christoph Hellwig Cc: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Jinyoung Choi Tested-by: "Martin K.
Petersen" Reviewed-by: "Martin K. Petersen" Link: https://lore.kernel.org/r/20230803024827epcms2p838d9e9131492c86a159fff25d195658f@epcms2p8 Signed-off-by: Jens Axboe --- block/bio.c | 2 +- block/blk.h | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index c92dda962449b..8d1533af7c609 100644 --- a/block/bio.c +++ b/block/bio.c @@ -934,7 +934,7 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, * size limit. This is not for normal read/write bios, but for passthrough * or Zone Append operations that we can't split. */ -static bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, +bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, struct page *page, unsigned len, unsigned offset, bool *same_page) { diff --git a/block/blk.h b/block/blk.h index 686712e138352..9d22ec3a53bcf 100644 --- a/block/blk.h +++ b/block/blk.h @@ -75,6 +75,10 @@ struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, gfp_t gfp_mask); void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs); +bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, + struct page *page, unsigned len, unsigned offset, + bool *same_page); + static inline bool biovec_phys_mergeable(struct request_queue *q, struct bio_vec *vec1, struct bio_vec *vec2) { From 80814b8e359f7207595f52702aea432a7bd61200 Mon Sep 17 00:00:00 2001 From: Jinyoung Choi Date: Thu, 3 Aug 2023 11:49:56 +0900 Subject: [PATCH 074/113] bio-integrity: update the payload size in bio_integrity_add_page() Previously, the bip's bi_size has been set before an integrity pages were added. If a problem occurs in the process of adding pages for bip, the bi_size mismatch problem must be dealt with. When the page is successfully added to bvec, the bi_size is updated. The parts affected by the change were also contained in this commit. Cc: Christoph Hellwig Cc: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Jinyoung Choi Tested-by: "Martin K. Petersen" Reviewed-by: "Martin K. 
Petersen" Link: https://lore.kernel.org/r/20230803024956epcms2p38186a17392706650c582d38ef3dbcd32@epcms2p3 Signed-off-by: Jens Axboe --- block/bio-integrity.c | 2 +- drivers/md/dm-crypt.c | 1 - drivers/nvme/host/ioctl.c | 1 - drivers/nvme/target/io-cmd-bdev.c | 3 +-- drivers/target/target_core_iblock.c | 3 +-- 5 files changed, 3 insertions(+), 7 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 045553a164e0c..6220a99977a42 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -137,6 +137,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, bvec_set_page(&bip->bip_vec[bip->bip_vcnt], page, len, offset); bip->bip_vcnt++; + bip->bip_iter.bi_size += len; return len; } @@ -244,7 +245,6 @@ bool bio_integrity_prep(struct bio *bio) } bip->bip_flags |= BIP_BLOCK_INTEGRITY; - bip->bip_iter.bi_size = len; bip_set_seed(bip, bio->bi_iter.bi_sector); if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 1dc6227d353ec..f2662c21a6dfe 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1160,7 +1160,6 @@ static int dm_crypt_integrity_io_alloc(struct dm_crypt_io *io, struct bio *bio) tag_len = io->cc->on_disk_tag_size * (bio_sectors(bio) >> io->cc->sector_shift); - bip->bip_iter.bi_size = tag_len; bip->bip_iter.bi_sector = io->cc->start + io->sector; ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata), diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 5c3250f36ce77..19a5177bc3604 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -118,7 +118,6 @@ static void *nvme_add_user_metadata(struct request *req, void __user *ubuf, goto out_free_meta; } - bip->bip_iter.bi_size = len; bip->bip_iter.bi_sector = seed; ret = bio_integrity_add_page(bio, virt_to_page(buf), len, offset_in_page(buf)); diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 2733e01585854..468833675cc94 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -206,12 +206,11 @@ static int nvmet_bdev_alloc_bip(struct nvmet_req *req, struct bio *bio, return PTR_ERR(bip); } - bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio)); /* virtual start sector must be in integrity interval units */ bip_set_seed(bip, bio->bi_iter.bi_sector >> (bi->interval_exp - SECTOR_SHIFT)); - resid = bip->bip_iter.bi_size; + resid = bio_integrity_bytes(bi, bio_sectors(bio)); while (resid > 0 && sg_miter_next(miter)) { len = min_t(size_t, miter->length, resid); rc = bio_integrity_add_page(bio, miter->page, len, diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 3d1b511ea284b..a7050f63b7cc1 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -689,7 +689,6 @@ iblock_alloc_bip(struct se_cmd *cmd, struct bio *bio, return PTR_ERR(bip); } - bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio)); /* virtual start sector must be in integrity interval units */ bip_set_seed(bip, bio->bi_iter.bi_sector >> (bi->interval_exp - SECTOR_SHIFT)); @@ -697,7 +696,7 @@ iblock_alloc_bip(struct se_cmd *cmd, struct bio *bio, pr_debug("IBLOCK BIP Size: %u Sector: %llu\n", bip->bip_iter.bi_size, (unsigned long long)bip->bip_iter.bi_sector); - resid = bip->bip_iter.bi_size; + resid = bio_integrity_bytes(bi, bio_sectors(bio)); while (resid > 0 && sg_miter_next(miter)) { len = min_t(size_t, miter->length, resid); From 
d1f04c2e23c99258049c6081c3147bae69e5bcb8 Mon Sep 17 00:00:00 2001 From: Jinyoung Choi Date: Thu, 3 Aug 2023 11:50:58 +0900 Subject: [PATCH 075/113] bio-integrity: cleanup adding integrity pages to bip's bvec. bio_integrity_add_page() returns the added length on success, else 0, just like bio_add_page(). Simplify the return value check in bio_integrity_prep() so it no longer deals with a "> 0 but < len" case that cannot happen. Cc: Christoph Hellwig Cc: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Jinyoung Choi Tested-by: "Martin K. Petersen" Reviewed-by: "Martin K. Petersen" Link: https://lore.kernel.org/r/20230803025058epcms2p5a4d0db5da2ad967668932d463661c633@epcms2p5 Signed-off-by: Jens Axboe --- block/bio-integrity.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 6220a99977a42..c6b3bc86e1f9e 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -252,27 +252,18 @@ bool bio_integrity_prep(struct bio *bio) /* Map it */ offset = offset_in_page(buf); - for (i = 0 ; i < nr_pages ; i++) { - int ret; + for (i = 0; i < nr_pages && len > 0; i++) { bytes = PAGE_SIZE - offset; - if (len <= 0) - break; - if (bytes > len) bytes = len; - ret = bio_integrity_add_page(bio, virt_to_page(buf), - bytes, offset); - - if (ret == 0) { + if (bio_integrity_add_page(bio, virt_to_page(buf), + bytes, offset) < bytes) { printk(KERN_ERR "could not attach integrity payload\n"); goto err_end_io; } - if (ret < bytes) - break; - buf += bytes; len -= bytes; offset = 0; @@ -291,7 +282,6 @@ bool bio_integrity_prep(struct bio *bio) bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); return false; - } EXPORT_SYMBOL(bio_integrity_prep); From 0ece1d649b6dd615925a72bc1824d6b9fa5b998a Mon Sep 17 00:00:00 2001 From: Jinyoung Choi Date: Thu, 3 Aug 2023 11:52:02 +0900 Subject: [PATCH 076/113] bio-integrity: create multi-page bvecs in bio_integrity_add_page() In general, one bvec describes a run of physically contiguous pages. In the bvec configuration for the bip, however, each physically contiguous run of integrity pages was split into single-page bvecs. Allow bio_integrity_add_page() to create multi-page bvecs, just like the bio payloads. This simplifies adding larger payloads, and fixes support for non-tiny workloads with nvme, which stopped using scatterlist for metadata a while ago. Cc: Christoph Hellwig Cc: Martin K. Petersen Fixes: 783b94bd9250 ("nvme-pci: do not build a scatterlist to map metadata") Reviewed-by: Christoph Hellwig Signed-off-by: Jinyoung Choi Tested-by: "Martin K.
Petersen" Link: https://lore.kernel.org/r/20230803025202epcms2p82f57cbfe32195da38c776377b55aed59@epcms2p8 Signed-off-by: Jens Axboe --- block/bio-integrity.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index c6b3bc86e1f9e..ec8ac8cf6e1b9 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -123,17 +123,34 @@ void bio_integrity_free(struct bio *bio) int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { + struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct bio_integrity_payload *bip = bio_integrity(bio); - if (bip->bip_vcnt >= bip->bip_max_vcnt) { - printk(KERN_ERR "%s: bip_vec full\n", __func__); + if (((bip->bip_iter.bi_size + len) >> SECTOR_SHIFT) > + queue_max_hw_sectors(q)) return 0; - } - if (bip->bip_vcnt && - bvec_gap_to_prev(&bdev_get_queue(bio->bi_bdev)->limits, - &bip->bip_vec[bip->bip_vcnt - 1], offset)) - return 0; + if (bip->bip_vcnt > 0) { + struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1]; + bool same_page = false; + + if (bvec_try_merge_hw_page(q, bv, page, len, offset, + &same_page)) { + bip->bip_iter.bi_size += len; + return len; + } + + if (bip->bip_vcnt >= + min(bip->bip_max_vcnt, queue_max_integrity_segments(q))) + return 0; + + /* + * If the queue doesn't support SG gaps and adding this segment + * would create a gap, disallow it. + */ + if (bvec_gap_to_prev(&q->limits, bv, offset)) + return 0; + } bvec_set_page(&bip->bip_vec[bip->bip_vcnt], page, len, offset); bip->bip_vcnt++; From c8659bbb15cd42577a9b16a23b527436b028c8b2 Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 10 Aug 2023 16:48:36 +0800 Subject: [PATCH 077/113] ublk: Fix signedness bug returning warning There are two warnings reported by smatch: drivers/block/ublk_drv.c:445 ublk_setup_iod_zoned() warn: signedness bug returning '(-95)' drivers/block/ublk_drv.c:963 ublk_setup_iod() warn: signedness bug returning '(-5)' The type of "blk_status_t" is either be a u32 or u8, but this two functions return a negative value when not supported or failed. Use the error code of the blk module to fix these warnings. 
Fixes: 29802d7ca33b ("ublk: enable zoned storage support") Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202308100201.TCRhgdvN-lkp@intel.com/ Signed-off-by: Li Zetao Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20230810084836.3535322-1-lizetao1@huawei.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index b60394fe7be69..109a5b17537d2 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -442,7 +442,7 @@ static int ublk_revalidate_disk_zones(struct ublk_device *ub) static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, struct request *req) { - return -EOPNOTSUPP; + return BLK_STS_NOTSUPP; } #endif @@ -960,7 +960,7 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) if (!ublk_queue_is_zoned(ubq) && (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND)) - return -EIO; + return BLK_STS_IOERR; switch (req_op(req)) { case REQ_OP_READ: From 4eb44d10766ac0fae5973998fd2a0103df1d3fe1 Mon Sep 17 00:00:00 2001 From: Li Lingfeng Date: Thu, 10 Aug 2023 11:51:11 +0800 Subject: [PATCH 078/113] block: remove init_mutex and open-code blk_iolatency_try_init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit a13696b83da4 ("blk-iolatency: Make initialization lazy") added a mutex named "init_mutex" in blk_iolatency_try_init to guard against racing initialization of RQ_QOS_LATENCY. Now a new lock has been added to struct request_queue by commit a13bd91be223 ("block/rq_qos: protect rq_qos apis with a new lock"), and it is already held in blkg_conf_open_bdev before calling blk_iolatency_init. So it's no longer necessary to keep init_mutex in blk_iolatency_try_init; just remove it. Since init_mutex has been removed, blk_iolatency_try_init can be open-coded back into iolatency_set_limit() like ioc_qos_write(). Signed-off-by: Li Lingfeng Reviewed-by: Michal Koutný Link: https://lore.kernel.org/r/20230810035111.2236335-1-lilingfeng@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-iolatency.c | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index fd5fec989e390..c16aef4be0363 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -824,29 +824,6 @@ static void iolatency_clear_scaling(struct blkcg_gq *blkg) } } -static int blk_iolatency_try_init(struct blkg_conf_ctx *ctx) -{ - static DEFINE_MUTEX(init_mutex); - int ret; - - ret = blkg_conf_open_bdev(ctx); - if (ret) - return ret; - - /* - * blk_iolatency_init() may fail after rq_qos_add() succeeds which can - * confuse iolat_rq_qos() test. Make the test and init atomic. - */ - mutex_lock(&init_mutex); - - if (!iolat_rq_qos(ctx->bdev->bd_queue)) - ret = blk_iolatency_init(ctx->bdev->bd_disk); - - mutex_unlock(&init_mutex); - - return ret; -} - static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -861,7 +838,17 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, blkg_conf_init(&ctx, buf); - ret = blk_iolatency_try_init(&ctx); + ret = blkg_conf_open_bdev(&ctx); + if (ret) + goto out; + + /* + * blk_iolatency_init() may fail after rq_qos_add() succeeds which can + * confuse iolat_rq_qos() test. Make the test and init atomic.
+ */ + lockdep_assert_held(ctx.bdev->bd_queue->rq_qos_mutex); + if (!iolat_rq_qos(ctx.bdev->bd_queue)) + ret = blk_iolatency_init(ctx.bdev->bd_disk); if (ret) goto out; From d21fed50c523d87af6456697ad09378060c4f09a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 10 Aug 2023 16:19:23 +0200 Subject: [PATCH 079/113] swim3: mark swim3_init() static This is the module init function, which by definition is used only locally, so mark it static to avoid a warning: drivers/block/swim3.c:1280:5: error: no previous prototype for 'swim3_init' [-Werror=missing-prototypes] Reviewed-by: Jack Wang Signed-off-by: Arnd Bergmann Signed-off-by: Jens Axboe --- drivers/block/swim3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index dc43a63b34694..c2bc85826358e 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -1277,7 +1277,7 @@ static struct macio_driver swim3_driver = }; -int swim3_init(void) +static int swim3_init(void) { macio_register_driver(&swim3_driver); return 0; From 18267a0365d6ec8bbe85ba8cbea5af12d9e59610 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Aug 2023 17:24:53 -0600 Subject: [PATCH 080/113] block: fix bad lockdep annotation in blk-iolatency A previous commit added a lockdep annotation, but botched it. Use the right type. Fixes: 4eb44d10766a ("block: remove init_mutex and open-code blk_iolatency_try_init") Signed-off-by: Jens Axboe --- block/blk-iolatency.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index c16aef4be0363..c1a6aba1d59e4 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -846,7 +846,7 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, * blk_iolatency_init() may fail after rq_qos_add() succeeds which can * confuse iolat_rq_qos() test. Make the test and init atomic. */ - lockdep_assert_held(ctx.bdev->bd_queue->rq_qos_mutex); + lockdep_assert_held(&ctx.bdev->bd_queue->rq_qos_mutex); if (!iolat_rq_qos(ctx.bdev->bd_queue)) ret = blk_iolatency_init(ctx.bdev->bd_disk); if (ret) From e24721e441a7c640e4e7b2b63c23c06d9a750880 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 11 Aug 2023 21:52:16 +0800 Subject: [PATCH 081/113] ublk: fix 'warn: variable dereferenced before check 'req'' from Smatch The added check of 'req_op(req) == REQ_OP_ZONE_APPEND' should have been done after the request is confirmed as valid. Actually, the request should always be valid here, so add a WARN_ON_ONCE(!req) and meanwhile move the zone_append check after the request check.
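Reduced to a self-contained userspace sketch, the fix is an ordering rule: validate the looked-up request before touching any of its fields, and make the should-never-happen case loud. The demo_* names and the one-shot warning helper below are illustrative substitutes for blk_mq_tag_to_rq() results and WARN_ON_ONCE(); none of them are kernel APIs.

/*
 * Demo of "check before dereference": a NULL lookup result warns
 * once and bails out instead of crashing.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct demo_req { int op; long long sector; };

#define DEMO_OP_ZONE_APPEND 13

static bool demo_warn_once(bool cond, const char *what)
{
	static bool warned;

	if (cond && !warned) {
		warned = true;
		fprintf(stderr, "WARNING (once): %s\n", what);
	}
	return cond;
}

static void demo_commit_completion(struct demo_req *req, long long alba)
{
	/* Check validity first ... */
	if (demo_warn_once(!req, "completion for missing request"))
		return;

	/* ... and only then dereference. */
	if (req->op == DEMO_OP_ZONE_APPEND)
		req->sector = alba;
}

int main(void)
{
	struct demo_req req = { .op = DEMO_OP_ZONE_APPEND };

	demo_commit_completion(&req, 2048);
	demo_commit_completion(NULL, 4096);	/* warns instead of crashing */
	printf("req.sector = %lld\n", req.sector);
	return 0;
}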
Cc: Andreas Hindborg Reported-by: Dan Carpenter Fixes: 29802d7ca33b ("ublk: enable zoned storage support") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230811135216.420404-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 109a5b17537d2..e85e075b5bce1 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1397,11 +1397,13 @@ static void ublk_commit_completion(struct ublk_device *ub, /* find the io request and complete */ req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag); + if (WARN_ON_ONCE(unlikely(!req))) + return; if (req_op(req) == REQ_OP_ZONE_APPEND) req->__sector = ub_cmd->zone_append_lba; - if (req && likely(!blk_should_fake_timeout(req->q))) + if (likely(!blk_should_fake_timeout(req->q))) ublk_put_req_ref(ubq, req); } From 7ba3792718709d410be5d971732b9251cbda67b6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Aug 2023 14:26:34 -0400 Subject: [PATCH 082/113] block: Add some exports for bcachefs - bio_set_pages_dirty(), bio_check_pages_dirty() - dio path - blk_status_to_str() - error messages - bio_add_folio() - this should definitely be exported for everyone, it's the modern version of bio_add_page() Signed-off-by: Kent Overstreet Cc: linux-block@vger.kernel.org Cc: Jens Axboe Signed-off-by: Kent Overstreet Link: https://lore.kernel.org/r/20230813182636.2966159-2-kent.overstreet@linux.dev Signed-off-by: Jens Axboe --- block/bio.c | 2 ++ block/blk-core.c | 1 + block/blk.h | 1 - include/linux/blkdev.h | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index 8d1533af7c609..bb3ea4e05d4ca 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1472,6 +1472,7 @@ void bio_set_pages_dirty(struct bio *bio) set_page_dirty_lock(bvec->bv_page); } } +EXPORT_SYMBOL_GPL(bio_set_pages_dirty); /* * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. 
@@ -1531,6 +1532,7 @@ void bio_check_pages_dirty(struct bio *bio) spin_unlock_irqrestore(&bio_dirty_lock, flags); schedule_work(&bio_dirty_work); } +EXPORT_SYMBOL_GPL(bio_check_pages_dirty); static inline bool bio_remaining_done(struct bio *bio) { diff --git a/block/blk-core.c b/block/blk-core.c index 99d8b9812b18f..141e54545cc1d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -208,6 +208,7 @@ const char *blk_status_to_str(blk_status_t status) return ""; return blk_errors[idx].name; } +EXPORT_SYMBOL_GPL(blk_status_to_str); /** * blk_sync_queue - cancel any pending callbacks on a queue diff --git a/block/blk.h b/block/blk.h index 9d22ec3a53bcf..08a358bc0919e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -254,7 +254,6 @@ static inline void bio_integrity_free(struct bio *bio) unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); -const char *blk_status_to_str(blk_status_t status); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2f5371b8482c0..4feed1fc141f6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -847,6 +847,7 @@ extern const char *blk_op_str(enum req_op op); int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); +const char *blk_status_to_str(blk_status_t status); /* only poll the hardware once, don't continue until a completion was found */ #define BLK_POLL_ONESHOT (1 << 0) From 168145f617d57bf4e474901b7ffa869337a802e6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Aug 2023 14:26:35 -0400 Subject: [PATCH 083/113] block: Allow bio_iov_iter_get_pages() with bio->bi_bdev unset bio_iov_iter_get_pages() trims the IO based on the block size of the block device the IO will be issued to. However, bcachefs is a multi device filesystem; when we're creating the bio we don't yet know which block device the bio will be submitted to - we have to handle the alignment checks elsewhere. Thus this is needed to avoid a null ptr deref. Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: linux-block@vger.kernel.org Link: https://lore.kernel.org/r/20230813182636.2966159-3-kent.overstreet@linux.dev Signed-off-by: Jens Axboe --- block/bio.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/block/bio.c b/block/bio.c index bb3ea4e05d4ca..be484d87142b8 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1231,7 +1231,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) struct page **pages = (struct page **)bv; ssize_t size, left; unsigned len, i = 0; - size_t offset, trim; + size_t offset; int ret = 0; /* @@ -1260,10 +1260,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); - trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1); - iov_iter_revert(iter, trim); + if (bio->bi_bdev) { + size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1); + iov_iter_revert(iter, trim); + size -= trim; + } - size -= trim; if (unlikely(!size)) { ret = -EFAULT; goto out; From 649f070e69739d22c57c22dbce0788b72cd93fac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Aug 2023 14:26:36 -0400 Subject: [PATCH 084/113] block: Bring back zero_fill_bio_iter This reverts 6f822e1b5d9dda3d20e87365de138046e3baa03a - this helper is used by bcachefs. 
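For illustration only (not part of this change), a hypothetical caller that has already transferred part of a bio could use the restored helper to zero just the unprocessed tail; 'done' is an assumed byte count of completed work:

	struct bvec_iter iter = bio->bi_iter;

	/* skip over the bytes that were actually filled */
	bio_advance_iter(bio, &iter, done);
	/* zero everything from that point to the end of the bio */
	zero_fill_bio_iter(bio, iter);

This is exactly the case that a plain zero_fill_bio(), which always starts from bio->bi_iter, cannot express.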
Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: linux-block@vger.kernel.org Link: https://lore.kernel.org/r/20230813182636.2966159-4-kent.overstreet@linux.dev Signed-off-by: Jens Axboe --- block/bio.c | 6 +++--- include/linux/bio.h | 7 ++++++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/block/bio.c b/block/bio.c index be484d87142b8..816d412c06e9b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -606,15 +606,15 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) } EXPORT_SYMBOL(bio_kmalloc); -void zero_fill_bio(struct bio *bio) +void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) { struct bio_vec bv; struct bvec_iter iter; - bio_for_each_segment(bv, bio, iter) + __bio_for_each_segment(bv, bio, iter, start) memzero_bvec(&bv); } -EXPORT_SYMBOL(zero_fill_bio); +EXPORT_SYMBOL(zero_fill_bio_iter); /** * bio_truncate - truncate the bio to small size of @new_size diff --git a/include/linux/bio.h b/include/linux/bio.h index c4f5b5228105f..8b99210eb7fb0 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -488,7 +488,12 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, extern void bio_copy_data(struct bio *dst, struct bio *src); extern void bio_free_pages(struct bio *bio); void guard_bio_eod(struct bio *bio); -void zero_fill_bio(struct bio *bio); +void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter); + +static inline void zero_fill_bio(struct bio *bio) +{ + zero_fill_bio_iter(bio, bio->bi_iter); +} static inline void bio_release_pages(struct bio *bio, bool mark_dirty) { From c7b4b23b36edf32239e7fc3b922797ff1d32b072 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 15 Aug 2023 06:58:32 +0900 Subject: [PATCH 085/113] block: uapi: Fix compilation errors using ioprio.h with C++ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The use of the "class" argument name in the ioprio_value() inline function in include/uapi/linux/ioprio.h confuses C++ compilers resulting in compilation errors such as: /usr/include/linux/ioprio.h:110:43: error: expected primary-expression before ‘int’ 110 | static __always_inline __u16 ioprio_value(int class, int level, int hint) | ^~~ for user C++ programs including linux/ioprio.h. Avoid these errors by renaming the arguments of the ioprio_value() function to prioclass, priolevel and priohint. For consistency, the arguments of the IOPRIO_PRIO_VALUE() and IOPRIO_PRIO_VALUE_HINT() macros are also renamed in the same manner. Reported-by: Igor Pylypiv Fixes: 01584c1e2337 ("scsi: block: Improve ioprio value validity checks") Signed-off-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Tested-by: Igor Pylypiv Link: https://lore.kernel.org/r/20230814215833.259286-1-dlemoal@kernel.org Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/uapi/linux/ioprio.h | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h index 99440b2e8c352..bee2bdb0eedbc 100644 --- a/include/uapi/linux/ioprio.h +++ b/include/uapi/linux/ioprio.h @@ -107,20 +107,21 @@ enum { /* * Return an I/O priority value based on a class, a level and a hint. 
 */
-static __always_inline __u16 ioprio_value(int class, int level, int hint)
+static __always_inline __u16 ioprio_value(int prioclass, int priolevel,
+					  int priohint)
 {
-	if (IOPRIO_BAD_VALUE(class, IOPRIO_NR_CLASSES) ||
-	    IOPRIO_BAD_VALUE(level, IOPRIO_NR_LEVELS) ||
-	    IOPRIO_BAD_VALUE(hint, IOPRIO_NR_HINTS))
+	if (IOPRIO_BAD_VALUE(prioclass, IOPRIO_NR_CLASSES) ||
+	    IOPRIO_BAD_VALUE(priolevel, IOPRIO_NR_LEVELS) ||
+	    IOPRIO_BAD_VALUE(priohint, IOPRIO_NR_HINTS))
 		return IOPRIO_CLASS_INVALID << IOPRIO_CLASS_SHIFT;
 
-	return (class << IOPRIO_CLASS_SHIFT) |
-		(hint << IOPRIO_HINT_SHIFT) | level;
+	return (prioclass << IOPRIO_CLASS_SHIFT) |
+		(priohint << IOPRIO_HINT_SHIFT) | priolevel;
 }
 
-#define IOPRIO_PRIO_VALUE(class, level) \
-	ioprio_value(class, level, IOPRIO_HINT_NONE)
-#define IOPRIO_PRIO_VALUE_HINT(class, level, hint) \
-	ioprio_value(class, level, hint)
+#define IOPRIO_PRIO_VALUE(prioclass, priolevel) \
+	ioprio_value(prioclass, priolevel, IOPRIO_HINT_NONE)
+#define IOPRIO_PRIO_VALUE_HINT(prioclass, priolevel, priohint) \
+	ioprio_value(prioclass, priolevel, priohint)
 
 #endif /* _UAPI_LINUX_IOPRIO_H */

From 66a6a5d0ec852eaced589da066376e69397cd71e Mon Sep 17 00:00:00 2001
From: Ruan Jinjie
Date: Tue, 15 Aug 2023 19:48:14 +0800
Subject: [PATCH 086/113] ublk: Switch to memdup_user_nul() helper

Use the memdup_user_nul() helper instead of open-coding it to simplify the code.

Signed-off-by: Ruan Jinjie
Reviewed-by: Ming Lei
Link: https://lore.kernel.org/r/20230815114815.1551171-1-ruanjinjie@huawei.com
Signed-off-by: Jens Axboe
---
 drivers/block/ublk_drv.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index e85e075b5bce1..6ecd728ee5cea 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -2742,14 +2742,9 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
 	if (header->len < header->dev_path_len)
 		return -EINVAL;
 
-	dev_path = kmalloc(header->dev_path_len + 1, GFP_KERNEL);
-	if (!dev_path)
-		return -ENOMEM;
-
-	ret = -EFAULT;
-	if (copy_from_user(dev_path, argp, header->dev_path_len))
-		goto exit;
-	dev_path[header->dev_path_len] = 0;
+	dev_path = memdup_user_nul(argp, header->dev_path_len);
+	if (IS_ERR(dev_path))
+		return PTR_ERR(dev_path);
 
 	ret = -EINVAL;
 	switch (_IOC_NR(cmd->cmd_op)) {

From a705b11b358dee677aad80630e7608b2d5f56691 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Sat, 8 Jul 2023 17:17:27 +0800
Subject: [PATCH 087/113] md/raid5-cache: fix a deadlock in r5l_exit_log()

Commit b13015af94cf ("md/raid5-cache: Clear conf->log after finishing work") introduced a new problem:

// caller holds reconfig_mutex
r5l_exit_log
 flush_work(&log->disable_writeback_work)
		r5c_disable_writeback_async
		 wait_event
		 /*
		  * conf->log is not NULL, and mddev_trylock()
		  * will fail, wait_event() can never pass.
		  */
 conf->log = NULL

Fix this problem by setting 'conf->log' to NULL before wake_up() as it used to be, so that wait_event() from r5c_disable_writeback_async() can exit. In the meantime, move md_unregister_thread() forward so that the null-ptr-deref fixed by that commit stays fixed.
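The resulting teardown order in r5l_exit_log() is, condensed from the hunk below:

	md_unregister_thread(&log->reclaim_thread);
	conf->log = NULL;	/* lets the wait_event() above make progress */
	wake_up(&conf->mddev->sb_wait);
	flush_work(&log->disable_writeback_work);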
Fixes: b13015af94cf ("md/raid5-cache: Clear conf->log after finishing work")
Signed-off-by: Yu Kuai
Link: https://lore.kernel.org/r/20230708091727.1417894-1-yukuai1@huaweicloud.com
Signed-off-by: Song Liu
---
 drivers/md/raid5-cache.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 47ba7d9e81e18..2eac4a50d99bd 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -3168,12 +3168,15 @@ void r5l_exit_log(struct r5conf *conf)
 {
 	struct r5l_log *log = conf->log;
 
-	/* Ensure disable_writeback_work wakes up and exits */
-	wake_up(&conf->mddev->sb_wait);
-	flush_work(&log->disable_writeback_work);
 	md_unregister_thread(&log->reclaim_thread);
+
+	/*
+	 * 'reconfig_mutex' is held by caller, set 'conf->log' to NULL to
+	 * ensure disable_writeback_work wakes up and exits.
+	 */
 	conf->log = NULL;
+	wake_up(&conf->mddev->sb_wait);
+	flush_work(&log->disable_writeback_work);
 
 	mempool_exit(&log->meta_pool);
 	bioset_exit(&log->bs);

From 8b0472b50bcf0f19a5119b00a53b63579c8e1e4d Mon Sep 17 00:00:00 2001
From: Zhang Shurong
Date: Sat, 22 Jul 2023 15:53:53 +0800
Subject: [PATCH 088/113] md: raid1: fix potential OOB in raid1_remove_disk()

If rdev->raid_disk is greater than mddev->raid_disks, there will be an out-of-bounds access in raid1_remove_disk(). We have already found similar reports, as follows:

1) commit d17f744e883b ("md-raid10: fix KASAN warning")
2) commit 1ebc2cec0b7d ("dm raid: fix KASAN warning in raid5_remove_disk")

Fix this bug by checking whether the "number" variable is valid.

Signed-off-by: Zhang Shurong
Reviewed-by: Yu Kuai
Link: https://lore.kernel.org/r/tencent_0D24426FAC6A21B69AC0C03CE4143A508F09@qq.com
Signed-off-by: Song Liu
---
 drivers/md/raid1.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 23d211969565e..b920f92780139 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1837,6 +1837,10 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	struct r1conf *conf = mddev->private;
 	int err = 0;
 	int number = rdev->raid_disk;
+
+	if (unlikely(number >= conf->raid_disks))
+		goto abort;
+
 	struct raid1_info *p = conf->mirrors + number;
 
 	if (rdev != p->rdev)

From 892da88d1cd93426e9c6d7717876ca705fe2b9fa Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 31 Jul 2023 10:28:00 +0800
Subject: [PATCH 089/113] md/raid10: fix a 'conf->barrier' leakage in raid10_takeover()

After commit b39f35ebe86d ("md: don't quiesce in mddev_suspend()"), 'conf->barrier' will be leaked when raid10 takes over raid0:

level_store
 pers->takeover -> raid10_takeover
  raid10_takeover_raid0
   WRITE_ONCE(conf->barrier, 1)
 mddev_suspend
 // still raid0
 mddev->pers = pers
 // switch to raid10
 mddev_resume
 // resume without suspend

After the above commit, mddev_resume() will not decrease 'conf->barrier' that is set in raid10_takeover_raid0(). Fix this problem by not setting 'conf->barrier' in raid10_takeover_raid0().

By the way, this problem was found while trying to make mddev_suspend/resume() independent of raid personalities. raid10 is the only personality to use a reference count in the quiesce() callback, and this problem is only related to raid10.
Fixes: b39f35ebe86d ("md: don't quiesce in mddev_suspend()")
Signed-off-by: Yu Kuai
Reviewed-by: Paul Menzel
Link: https://lore.kernel.org/r/20230731022800.1424902-1-yukuai1@huaweicloud.com
Signed-off-by: Song Liu
---
 drivers/md/raid10.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 16aa9d735880a..7704a4c7f4695 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4417,7 +4417,6 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
 			rdev->new_raid_disk = rdev->raid_disk * 2;
 			rdev->sectors = size;
 		}
-		WRITE_ONCE(conf->barrier, 1);
 	}
 
 	return conf;

From 7eb8ff02c1df279bf7f7f29b866beb655a9eebe9 Mon Sep 17 00:00:00 2001
From: Li Lingfeng
Date: Thu, 3 Aug 2023 15:17:11 +0800
Subject: [PATCH 090/113] md: Hold mddev->reconfig_mutex when trying to get mddev->sync_thread

Commit ba9d9f1a707f ("Revert "md: unlock mddev before reap sync_thread in action_store"") removed the scenario of calling md_unregister_thread() without holding mddev->reconfig_mutex, so add a check that the lock is held before acquiring mddev->sync_thread, by passing mddev to md_unregister_thread().

Signed-off-by: Li Lingfeng
Reviewed-by: Yu Kuai
Link: https://lore.kernel.org/r/20230803071711.2546560-1-lilingfeng@huaweicloud.com
Signed-off-by: Song Liu
---
 drivers/md/md-cluster.c  | 8 ++++----
 drivers/md/md.c          | 9 +++++----
 drivers/md/md.h          | 2 +-
 drivers/md/raid1.c       | 4 ++--
 drivers/md/raid10.c      | 2 +-
 drivers/md/raid5-cache.c | 2 +-
 drivers/md/raid5.c       | 2 +-
 7 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 3d9fd74233dfd..1e26eb2233495 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -952,8 +952,8 @@ static int join(struct mddev *mddev, int nodes)
 	return 0;
 err:
 	set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
-	md_unregister_thread(&cinfo->recovery_thread);
-	md_unregister_thread(&cinfo->recv_thread);
+	md_unregister_thread(mddev, &cinfo->recovery_thread);
+	md_unregister_thread(mddev, &cinfo->recv_thread);
 	lockres_free(cinfo->message_lockres);
 	lockres_free(cinfo->token_lockres);
 	lockres_free(cinfo->ack_lockres);
@@ -1015,8 +1015,8 @@ static int leave(struct mddev *mddev)
 		resync_bitmap(mddev);
 
 	set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
-	md_unregister_thread(&cinfo->recovery_thread);
-	md_unregister_thread(&cinfo->recv_thread);
+	md_unregister_thread(mddev, &cinfo->recovery_thread);
+	md_unregister_thread(mddev, &cinfo->recv_thread);
 	lockres_free(cinfo->message_lockres);
 	lockres_free(cinfo->token_lockres);
 	lockres_free(cinfo->ack_lockres);

diff --git a/drivers/md/md.c b/drivers/md/md.c
index a3d98273b295c..5c3c19b8d5099 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6258,7 +6258,7 @@ static void mddev_detach(struct mddev *mddev)
 		mddev->pers->quiesce(mddev, 1);
 		mddev->pers->quiesce(mddev, 0);
 	}
-	md_unregister_thread(&mddev->thread);
+	md_unregister_thread(mddev, &mddev->thread);
 	if (mddev->queue)
 		blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 }
@@ -7990,9 +7990,10 @@ struct md_thread *md_register_thread(void (*run) (struct md_thread *),
 }
 EXPORT_SYMBOL(md_register_thread);
 
-void md_unregister_thread(struct md_thread __rcu **threadp)
+void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp)
 {
-	struct md_thread *thread = rcu_dereference_protected(*threadp, true);
+	struct md_thread *thread = rcu_dereference_protected(*threadp,
+					lockdep_is_held(&mddev->reconfig_mutex));
 
 	if (!thread)
 		return;
@@ -9484,7 +9485,7 @@ void md_reap_sync_thread(struct mddev *mddev)
 	bool is_reshaped = false;
 
 	/* resync has finished, collect result */
-	md_unregister_thread(&mddev->sync_thread);
+	md_unregister_thread(mddev, &mddev->sync_thread);
 	atomic_inc(&mddev->sync_seq);
 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&

diff --git a/drivers/md/md.h b/drivers/md/md.h
index 8ae9574809763..9bcb77bca9639 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -761,7 +761,7 @@ extern struct md_thread *md_register_thread(
 	void (*run)(struct md_thread *thread),
 	struct mddev *mddev,
 	const char *name);
-extern void md_unregister_thread(struct md_thread __rcu **threadp);
+extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp);
 extern void md_wakeup_thread(struct md_thread __rcu *thread);
 extern void md_check_recovery(struct mddev *mddev);
 extern void md_reap_sync_thread(struct mddev *mddev);

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index b920f92780139..c18b7c096c8df 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -3156,7 +3156,7 @@ static int raid1_run(struct mddev *mddev)
 	 * RAID1 needs at least one disk in active
 	 */
 	if (conf->raid_disks - mddev->degraded < 1) {
-		md_unregister_thread(&conf->thread);
+		md_unregister_thread(mddev, &conf->thread);
 		ret = -EINVAL;
 		goto abort;
 	}
@@ -3183,7 +3183,7 @@ static int raid1_run(struct mddev *mddev)
 
 	ret = md_integrity_register(mddev);
 	if (ret) {
-		md_unregister_thread(&mddev->thread);
+		md_unregister_thread(mddev, &mddev->thread);
 		goto abort;
 	}
 	return 0;

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7704a4c7f4695..0234131208516 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4320,7 +4320,7 @@ static int raid10_run(struct mddev *mddev)
 	return 0;
 
 out_free_conf:
-	md_unregister_thread(&mddev->thread);
+	md_unregister_thread(mddev, &mddev->thread);
 	raid10_free_conf(conf);
 	mddev->private = NULL;
 out:

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 2eac4a50d99bd..a29b9650260a0 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -3168,7 +3168,7 @@ void r5l_exit_log(struct r5conf *conf)
 {
 	struct r5l_log *log = conf->log;
 
-	md_unregister_thread(&log->reclaim_thread);
+	md_unregister_thread(conf->mddev, &log->reclaim_thread);
 
 	/*
 	 * 'reconfig_mutex' is held by caller, set 'conf->log' to NULL to

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 32a87193bad73..4cb9c608ee191 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -8107,7 +8107,7 @@ static int raid5_run(struct mddev *mddev)
 	return 0;
 abort:
-	md_unregister_thread(&mddev->thread);
+	md_unregister_thread(mddev, &mddev->thread);
 	print_raid5_conf(conf);
 	free_conf(conf);
 	mddev->private = NULL;

From 5afcf28d07dee91e48d1c809ebd19c3bfc403765 Mon Sep 17 00:00:00 2001
From: WANG Xuerui
Date: Mon, 31 Jul 2023 18:49:07 +0800
Subject: [PATCH 091/113] raid6: remove the <linux/export.h> include from recov.c

There is no exported symbol left in recov.c, so the <linux/export.h> include is now unnecessary, and it breaks the raid6test build. Remove it.

Signed-off-by: WANG Xuerui
Link: https://lore.kernel.org/r/20230731104911.411964-2-kernel@xen0n.name
Signed-off-by: Song Liu
---
 lib/raid6/recov.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c
index e49d519de6cbe..a7c1b2bbe40d8 100644
--- a/lib/raid6/recov.c
+++ b/lib/raid6/recov.c
@@ -13,7 +13,6 @@
  * the syndrome.)
  */
 
-#include <linux/export.h>
 #include <linux/raid/pq.h>
 
 /* Recover two failed data blocks.
 */

From 9dd6e1da811ffad95a79b2690110ef1bbaf4dda4 Mon Sep 17 00:00:00 2001
From: WANG Xuerui
Date: Mon, 31 Jul 2023 18:49:08 +0800
Subject: [PATCH 092/113] raid6: guard the tables.c include of <linux/export.h> with __KERNEL__

The export directives for the tables are already emitted with __KERNEL__ guards, but the <linux/export.h> include is not, causing errors when building the raid6test program. Guard this include too, to fix the raid6test build.

Signed-off-by: WANG Xuerui
Link: https://lore.kernel.org/r/20230731104911.411964-3-kernel@xen0n.name
Signed-off-by: Song Liu
---
 lib/raid6/mktables.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/raid6/mktables.c b/lib/raid6/mktables.c
index f02e10fa62381..3be03793237c2 100644
--- a/lib/raid6/mktables.c
+++ b/lib/raid6/mktables.c
@@ -56,7 +56,9 @@ int main(int argc, char *argv[])
 	uint8_t v;
 	uint8_t exptbl[256], invtbl[256];
 
+	printf("#ifdef __KERNEL__\n");
 	printf("#include <linux/export.h>\n");
+	printf("#endif\n");
 	printf("#include <linux/raid/pq.h>\n");
 
 	/* Compute multiplication table */

From 2008d89fb6435a3a900b72b856a18e0cc0d2c057 Mon Sep 17 00:00:00 2001
From: WANG Xuerui
Date: Mon, 31 Jul 2023 18:49:09 +0800
Subject: [PATCH 093/113] raid6: test: cosmetic cleanups for the test Makefile

Use tabs/spaces consistently: hard tabs for marking recipe lines only, spaces for everything else. Also, the OPTFLAGS declaration actually included the tabs preceding the line comment, making compiler invocation lines unnecessarily long. As the entire block of declarations is meant for ad-hoc customization (otherwise they would probably make use of `?=` instead of `=`), move the "Adjust as desired" comment above the block too, to fix the long invocation lines.

Signed-off-by: WANG Xuerui
Link: https://lore.kernel.org/r/20230731104911.411964-4-kernel@xen0n.name
Signed-off-by: Song Liu
---
 lib/raid6/test/Makefile | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile
index 4fb7700a741bd..143cda60faa12 100644
--- a/lib/raid6/test/Makefile
+++ b/lib/raid6/test/Makefile
@@ -6,14 +6,15 @@
 
 pound := \#
 
-CC	 = gcc
-OPTFLAGS = -O2		# Adjust as desired
-CFLAGS	 = -I.. -I ../../../include -g $(OPTFLAGS)
-LD	 = ld
-AWK	 = awk -f
-AR	 = ar
-RANLIB	 = ranlib
-OBJS	 = int1.o int2.o int4.o int8.o int16.o int32.o recov.o algos.o tables.o
+# Adjust as desired
+CC       = gcc
+OPTFLAGS = -O2
+CFLAGS   = -I.. -I ../../../include -g $(OPTFLAGS)
+LD       = ld
+AWK      = awk -f
+AR       = ar
+RANLIB   = ranlib
+OBJS     = int1.o int2.o int4.o int8.o int16.o int32.o recov.o algos.o tables.o
 
 ARCH := $(shell uname -m 2>/dev/null | sed -e /s/i.86/i386/)
 ifeq ($(ARCH),i386)
@@ -37,9 +38,9 @@ endif
 ifeq ($(IS_X86),yes)
         OBJS   += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o
         CFLAGS += -DCONFIG_X86
-        CFLAGS += $(shell echo "vpmovm2b %k1, %zmm5" |	\
-                    gcc -c -x assembler - >/dev/null 2>&1 &&	\
-                    rm ./-.o && echo -DCONFIG_AS_AVX512=1)
+        CFLAGS += $(shell echo "vpmovm2b %k1, %zmm5" | \
+                    gcc -c -x assembler - >/dev/null 2>&1 && \
+                    rm ./-.o && echo -DCONFIG_AS_AVX512=1)
 else ifeq ($(HAS_NEON),yes)
         OBJS   += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
         CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1
@@ -63,12 +64,12 @@ endif
 %.uc: ../%.uc
 	cp -f $< $@
 
-all:	raid6.a raid6test
+all: raid6.a raid6test
 
 raid6.a: $(OBJS)
-	 rm -f $@
-	 $(AR) cq $@ $^
-	 $(RANLIB) $@
+	rm -f $@
+	$(AR) cq $@ $^
+	$(RANLIB) $@
 
 raid6test: test.c raid6.a
 	$(CC) $(CFLAGS) -o raid6test $^

From 6601f5e122e5fdcea0fa5eaa54b88b02dbc9ec07 Mon Sep 17 00:00:00 2001
From: WANG Xuerui
Date: Mon, 31 Jul 2023 18:49:10 +0800
Subject: [PATCH 094/113] raid6: test: make sure all intermediate and artifact files are .gitignored

Currently, when the raid6test utility is built, the resulting binary and an int.uc file are not being ignored, so they can get inadvertently committed when one works on the raid6 code. Ignore them to keep `git status` clean at all times.

Signed-off-by: WANG Xuerui
Link: https://lore.kernel.org/r/20230731104911.411964-5-kernel@xen0n.name
Signed-off-by: Song Liu
---
 lib/raid6/test/.gitignore | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 lib/raid6/test/.gitignore

diff --git a/lib/raid6/test/.gitignore b/lib/raid6/test/.gitignore
new file mode 100644
index 0000000000000..1b68a77f348f6
--- /dev/null
+++ b/lib/raid6/test/.gitignore
@@ -0,0 +1,3 @@
+/int.uc
+/neon.uc
+/raid6test

From 7b3c70c43c13ad8e59f5561b154663d6bdb77021 Mon Sep 17 00:00:00 2001
From: WANG Xuerui
Date: Mon, 31 Jul 2023 18:49:11 +0800
Subject: [PATCH 095/113] raid6: test: only check for Altivec if building on powerpc hosts

Altivec is only available on powerpc hosts, so only check for its availability when the host is powerpc, to avoid error messages being shown on architectures other than x86, arm or powerpc.
Signed-off-by: WANG Xuerui
Link: https://lore.kernel.org/r/20230731104911.411964-6-kernel@xen0n.name
Signed-off-by: Song Liu
---
 lib/raid6/test/Makefile | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile
index 143cda60faa12..1f693ea3b980c 100644
--- a/lib/raid6/test/Makefile
+++ b/lib/raid6/test/Makefile
@@ -35,6 +35,12 @@ ifeq ($(ARCH),aarch64)
         HAS_NEON = yes
 endif
 
+ifeq ($(findstring ppc,$(ARCH)),ppc)
+        CFLAGS += -I../../../arch/powerpc/include
+        HAS_ALTIVEC := $(shell printf '$(pound)include <altivec.h>\nvector int a;\n' |\
+                         gcc -c -x c - >/dev/null && rm ./-.o && echo yes)
+endif
+
 ifeq ($(IS_X86),yes)
         OBJS   += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o
         CFLAGS += -DCONFIG_X86
@@ -44,15 +50,10 @@ ifeq ($(IS_X86),yes)
 else ifeq ($(HAS_NEON),yes)
         OBJS   += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
         CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1
-else
-        HAS_ALTIVEC := $(shell printf '$(pound)include <altivec.h>\nvector int a;\n' |\
-                         gcc -c -x c - >/dev/null && rm ./-.o && echo yes)
-        ifeq ($(HAS_ALTIVEC),yes)
-                CFLAGS += -I../../../arch/powerpc/include
-                CFLAGS += -DCONFIG_ALTIVEC
-                OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \
-                        vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o
-        endif
+else ifeq ($(HAS_ALTIVEC),yes)
+        CFLAGS += -DCONFIG_ALTIVEC
+        OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \
+                vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o
 endif
 
 .c.o:

From 0d0bd28c500173bfca78aa840f8f36d261ef1765 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Tue, 8 Aug 2023 18:49:12 +0800
Subject: [PATCH 096/113] md/raid5-cache: fix null-ptr-deref for r5l_flush_stripe_to_raid()

r5l_flush_stripe_to_raid() will check if the list 'flushing_ios' is empty and then submit 'flush_bio'. However, r5l_log_flush_endio() clears the list first and only then clears the bio, which can cause a null-ptr-deref:

T1: submit flush io
raid5d
 handle_active_stripes
  r5l_flush_stripe_to_raid
   // list is empty
   // add 'io_end_ios' to the list
   bio_init
   submit_bio
   // io1

T2: io1 is done
r5l_log_flush_endio
 list_splice_tail_init
 // clear the list

T3: submit new flush io
...
r5l_flush_stripe_to_raid
 // list is empty
 // add 'io_end_ios' to the list
 bio_init
 bio_uninit
 // clear bio->bi_blkg
 submit_bio
 // null-ptr-deref

Fix this problem by clearing the bio before clearing the list in r5l_log_flush_endio().
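Condensed, the fixed endio path is (see the hunk below):

	bio_uninit(bio);	/* done before the list is emptied */
	spin_lock_irqsave(&log->io_list_lock, flags);
	list_for_each_entry(io, &log->flushing_ios, log_sibling)
		r5l_io_run_stripes(io);
	list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
	spin_unlock_irqrestore(&log->io_list_lock, flags);

so by the time 'flushing_ios' can be observed empty and the flush bio reused, bio_uninit() has already completed.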
Fixes: 0dd00cba99c3 ("raid5-cache: fully initialize flush_bio when needed") Reported-and-tested-by: Corey Hickey Closes: https://lore.kernel.org/all/cddd7213-3dfd-4ab7-a3ac-edd54d74a626@fatooh.org/ Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5-cache.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index a29b9650260a0..518b7cfa78b9d 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1260,14 +1260,13 @@ static void r5l_log_flush_endio(struct bio *bio) if (bio->bi_status) md_error(log->rdev->mddev, log->rdev); + bio_uninit(bio); spin_lock_irqsave(&log->io_list_lock, flags); list_for_each_entry(io, &log->flushing_ios, log_sibling) r5l_io_run_stripes(io); list_splice_tail_init(&log->flushing_ios, &log->finished_ios); spin_unlock_irqrestore(&log->io_list_lock, flags); - - bio_uninit(bio); } /* From f4283bc7e38ac89d0c6c0ae188464d3769bec098 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Wed, 16 Aug 2023 10:22:10 +0800 Subject: [PATCH 097/113] drivers/rnbd: restore sysfs interface to rnbd-client Commit 137380c0ec40 renamed 'rnbd-client' to 'rnbd_client', this changed sysfs interface to /sys/devices/virtual/rnbd_client/ctl/map_device from /sys/devices/virtual/rnbd-client/ctl/map_device. CC: Ivan Orlov CC: "Md. Haris Iqbal" CC: Jack Wang Fixes: 137380c0ec40 ("block/rnbd: make all 'class' structures const") Signed-off-by: Li Zhijian Acked-by: Jack Wang Link: https://lore.kernel.org/r/20230816022210.2501228-1-lizhijian@fujitsu.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index c36d8b1ceeed7..39887556cf959 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -25,7 +25,7 @@ static struct device *rnbd_dev; static const struct class rnbd_dev_class = { - .name = "rnbd_client", + .name = "rnbd-client", }; static struct kobject *rnbd_devs_kobj; From ec14a87ee1999b19d8b7ed0fa95fea80644624ae Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Aug 2023 09:56:23 -1000 Subject: [PATCH 098/113] blk-cgroup: Fix NULL deref caused by blkg_policy_data being installed before init blk-iocost sometimes causes the following crash: BUG: kernel NULL pointer dereference, address: 00000000000000e0 ... RIP: 0010:_raw_spin_lock+0x17/0x30 Code: be 01 02 00 00 e8 79 38 39 ff 31 d2 89 d0 5d c3 0f 1f 00 0f 1f 44 00 00 55 48 89 e5 65 ff 05 48 d0 34 7e b9 01 00 00 00 31 c0 0f b1 0f 75 02 5d c3 89 c6 e8 ea 04 00 00 5d c3 0f 1f 84 00 00 RSP: 0018:ffffc900023b3d40 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 00000000000000e0 RCX: 0000000000000001 RDX: ffffc900023b3d20 RSI: ffffc900023b3cf0 RDI: 00000000000000e0 RBP: ffffc900023b3d40 R08: ffffc900023b3c10 R09: 0000000000000003 R10: 0000000000000064 R11: 000000000000000a R12: ffff888102337000 R13: fffffffffffffff2 R14: ffff88810af408c8 R15: ffff8881070c3600 FS: 00007faaaf364fc0(0000) GS:ffff88842fdc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000e0 CR3: 00000001097b1000 CR4: 0000000000350ea0 Call Trace: ioc_weight_write+0x13d/0x410 cgroup_file_write+0x7a/0x130 kernfs_fop_write_iter+0xf5/0x170 vfs_write+0x298/0x370 ksys_write+0x5f/0xb0 __x64_sys_write+0x1b/0x20 do_syscall_64+0x3d/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 This happens because iocg->ioc is NULL. 
The field is initialized by ioc_pd_init() and never cleared. The NULL deref is caused by blkcg_activate_policy() installing blkg_policy_data before initializing it. blkcg_activate_policy() was doing the following:

1. Allocate pd's for all existing blkg's and install them in blkg->pd[].
2. Initialize all pd's.
3. Online all pd's.

blkcg_activate_policy() only grabs the queue_lock and may release and re-acquire the lock as allocation may need to sleep. ioc_weight_write() grabs blkcg->lock and iterates all its blkg's. The two can race, and if ioc_weight_write() runs during #1 or between #1 and #2, it can encounter a pd which is not initialized yet, leading to a crash.

The crash can be reproduced with the following script:

  #!/bin/bash

  echo +io > /sys/fs/cgroup/cgroup.subtree_control
  systemd-run --unit touch-sda --scope dd if=/dev/sda of=/dev/null bs=1M count=1 iflag=direct
  echo 100 > /sys/fs/cgroup/system.slice/io.weight
  bash -c "echo '8:0 enable=1' > /sys/fs/cgroup/io.cost.qos" &
  sleep .2
  echo 100 > /sys/fs/cgroup/system.slice/io.weight

with the following patch applied:

> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index fc49be622e05..38d671d5e10c 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -1553,6 +1553,12 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
> 		pd->online = false;
> 	}
>
> +	if (system_state == SYSTEM_RUNNING) {
> +		spin_unlock_irq(&q->queue_lock);
> +		ssleep(1);
> +		spin_lock_irq(&q->queue_lock);
> +	}
> +
> 	/* all allocated, init in the same order */
> 	if (pol->pd_init_fn)
> 		list_for_each_entry_reverse(blkg, &q->blkg_list, q_node)

I don't see a reason why all pd's should be allocated, initialized and onlined together. The only ordering requirement is that parent blkgs be initialized and onlined before children, which is guaranteed by the walking order.

Let's fix the bug by allocating, initializing and onlining the pd for each blkg while holding blkcg->lock over the initialization and onlining. This ensures that an installed pd is always fully initialized and onlined, removing the race window.
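Condensed, the per-blkg path after this change is (see the diff below; the allocation details are elided):

	list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
		struct blkg_policy_data *pd;

		/* ... allocate pd, possibly dropping queue_lock ... */

		spin_lock(&blkg->blkcg->lock);
		blkg->pd[pol->plid] = pd;	/* install */
		if (pol->pd_init_fn)
			pol->pd_init_fn(pd);	/* init */
		if (pol->pd_online_fn)
			pol->pd_online_fn(pd);	/* online */
		pd->online = true;
		spin_unlock(&blkg->blkcg->lock);
	}

so a reader that takes blkcg->lock, such as ioc_weight_write(), can never observe a half-constructed pd.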
Signed-off-by: Tejun Heo Reported-by: Breno Leitao Fixes: 9d179b865449 ("blkcg: Fix multiple bugs in blkcg_activate_policy()") Link: https://lore.kernel.org/r/ZN0p5_W-Q9mAHBVY@slm.duckdns.org Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index fc49be622e05b..638400bf049f2 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1509,7 +1509,7 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) retry: spin_lock_irq(&q->queue_lock); - /* blkg_list is pushed at the head, reverse walk to allocate parents first */ + /* blkg_list is pushed at the head, reverse walk to initialize parents first */ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { struct blkg_policy_data *pd; @@ -1547,21 +1547,20 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) goto enomem; } - blkg->pd[pol->plid] = pd; + spin_lock(&blkg->blkcg->lock); + pd->blkg = blkg; pd->plid = pol->plid; - pd->online = false; - } + blkg->pd[pol->plid] = pd; - /* all allocated, init in the same order */ - if (pol->pd_init_fn) - list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) - pol->pd_init_fn(blkg->pd[pol->plid]); + if (pol->pd_init_fn) + pol->pd_init_fn(pd); - list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { if (pol->pd_online_fn) - pol->pd_online_fn(blkg->pd[pol->plid]); - blkg->pd[pol->plid]->online = true; + pol->pd_online_fn(pd); + pd->online = true; + + spin_unlock(&blkg->blkcg->lock); } __set_bit(pol->plid, q->blkcg_pols); @@ -1578,14 +1577,19 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) return ret; enomem: - /* alloc failed, nothing's initialized yet, free everything */ + /* alloc failed, take down everything */ spin_lock_irq(&q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; + struct blkg_policy_data *pd; spin_lock(&blkcg->lock); - if (blkg->pd[pol->plid]) { - pol->pd_free_fn(blkg->pd[pol->plid]); + pd = blkg->pd[pol->plid]; + if (pd) { + if (pd->online && pol->pd_offline_fn) + pol->pd_offline_fn(pd); + pd->online = false; + pol->pd_free_fn(pd); blkg->pd[pol->plid] = NULL; } spin_unlock(&blkcg->lock); From c5d736f548ec5aab7e877872417ac23a5c42f1fd Mon Sep 17 00:00:00 2001 From: Xueshi Hu Date: Mon, 14 Aug 2023 21:53:54 +0800 Subject: [PATCH 099/113] md/raid1: call free_r1bio() before allow_barrier() in raid_end_bio_io() After allow_barrier, a concurrent raid1_reshape() will replace old mempool and r1conf::raid_disks. Move allow_barrier() to the end of raid_end_bio_io(), so that r1bio can be freed safely. 
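Condensed (see the diff below), the sector is now sampled before the r1bio is freed, so allow_barrier() never touches freed memory:

	sector_t sector = r1_bio->sector;
	/* ... */
	free_r1bio(r1_bio);
	allow_barrier(conf, sector);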
Reviewed-by: Yu Kuai
Signed-off-by: Xueshi Hu
Link: https://lore.kernel.org/r/20230814135356.1113639-2-xueshi.hu@smartx.com
Signed-off-by: Song Liu
---
 drivers/md/raid1.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c18b7c096c8df..642c8bae0df0c 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -311,6 +311,7 @@ static void raid_end_bio_io(struct r1bio *r1_bio)
 {
 	struct bio *bio = r1_bio->master_bio;
 	struct r1conf *conf = r1_bio->mddev->private;
+	sector_t sector = r1_bio->sector;
 
 	/* if nobody has done the final endio yet, do it now */
 	if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
@@ -321,13 +322,13 @@ static void raid_end_bio_io(struct r1bio *r1_bio)
 
 		call_bio_endio(r1_bio);
 	}
+
+	free_r1bio(r1_bio);
 	/*
 	 * Wake up any possible resync thread that waits for the device
 	 * to go idle. All I/Os, even write-behind writes, are done.
 	 */
-	allow_barrier(conf, r1_bio->sector);
-
-	free_r1bio(r1_bio);
+	allow_barrier(conf, sector);
 }
 
 /*

From 992db13a4aee766c8bfbf046ad15c2db5fa7cab8 Mon Sep 17 00:00:00 2001
From: Xueshi Hu
Date: Mon, 14 Aug 2023 21:53:55 +0800
Subject: [PATCH 100/113] md/raid1: free the r1bio before waiting for blocked rdev

Raid1 reshape will change the mempool and r1conf::raid_disks, both of which are needed to free an r1bio. allow_barrier() makes a concurrent raid1_reshape() possible. So, free the in-flight r1bio before waiting for a blocked rdev.

Fixes: 6bfe0b499082 ("md: support blocking writes to an array on device failure")
Reviewed-by: Yu Kuai
Signed-off-by: Xueshi Hu
Link: https://lore.kernel.org/r/20230814135356.1113639-3-xueshi.hu@smartx.com
Signed-off-by: Song Liu
---
 drivers/md/raid1.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 642c8bae0df0c..b3fc44157cfcc 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1378,6 +1378,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 		return;
 	}
 
+ retry_write:
 	r1_bio = alloc_r1bio(mddev, bio);
 	r1_bio->sectors = max_write_sectors;
 
@@ -1393,7 +1394,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	 */
 	disks = conf->raid_disks * 2;
- retry_write:
 	blocked_rdev = NULL;
 	rcu_read_lock();
 	max_sectors = r1_bio->sectors;
@@ -1473,7 +1473,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 		for (j = 0; j < i; j++)
 			if (r1_bio->bios[j])
 				rdev_dec_pending(conf->mirrors[j].rdev, mddev);
-		r1_bio->state = 0;
+		free_r1bio(r1_bio);
 		allow_barrier(conf, bio->bi_iter.bi_sector);
 
 		if (bio->bi_opf & REQ_NOWAIT) {

From c069da449a13669ffa754fd971747e7e17e7d691 Mon Sep 17 00:00:00 2001
From: Xueshi Hu
Date: Mon, 14 Aug 2023 21:53:56 +0800
Subject: [PATCH 101/113] md/raid1: hold the barrier until handle_read_error() finishes

handle_read_error() will call allow_barrier() to match the earlier barrier raising. However, the allow_barrier() should come at the very end to avoid a concurrent raid reshape.
Fixes: 689389a06ce7 ("md/raid1: simplify handle_read_error().") Reviewed-by: Yu Kuai Signed-off-by: Xueshi Hu Link: https://lore.kernel.org/r/20230814135356.1113639-4-xueshi.hu@smartx.com Signed-off-by: Song Liu --- drivers/md/raid1.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index b3fc44157cfcc..56f2725a996ff 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2511,6 +2511,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) struct mddev *mddev = conf->mddev; struct bio *bio; struct md_rdev *rdev; + sector_t sector; clear_bit(R1BIO_ReadError, &r1_bio->state); /* we got a read error. Maybe the drive is bad. Maybe just @@ -2540,12 +2541,13 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) } rdev_dec_pending(rdev, conf->mddev); - allow_barrier(conf, r1_bio->sector); + sector = r1_bio->sector; bio = r1_bio->master_bio; /* Reuse the old r1_bio so that the IO_BLOCKED settings are preserved */ r1_bio->state = 0; raid1_read_request(mddev, bio, r1_bio->sectors, r1_bio); + allow_barrier(conf, sector); } static void raid1d(struct md_thread *thread) From 6b2460e66ce6d483b5ff77227ac799d6e8a9ebd6 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Mon, 14 Aug 2023 14:01:15 +0200 Subject: [PATCH 102/113] md raid1: allow writebehind to work on any leg device set WriteMostly As the WriteMostly flag can be set on any component device of a RAID1 array, remove the constraint that it only works if set on the first one. Signed-off-by: Heinz Mauelshagen Tested-by: Xiao Ni Link: https://lore.kernel.org/r/2a9592bf3340f34bf588eec984b23ee219f3985e.1692013451.git.heinzm@redhat.com Signed-off-by: Song Liu --- drivers/md/raid1.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 56f2725a996ff..4b30a17421623 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1523,8 +1523,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, * Not if there are too many, or cannot * allocate memory, or a reader on WriteMostly * is waiting for behind writes to flush */ - if (bitmap && - test_bit(WriteMostly, &rdev->flags) && + if (bitmap && write_behind && (atomic_read(&bitmap->behind_writes) < mddev->bitmap_info.max_write_behind) && !waitqueue_active(&bitmap->behind_wait)) { From af50e20afb401cc203bd2a9ff62ece0ae4976103 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 14 Aug 2023 11:27:07 +0200 Subject: [PATCH 103/113] md/raid0: Factor out helper for mapping and submitting a bio Factor out helper function for mapping and submitting a bio out of raid0_make_request(). We will use it later for submitting both parts of a split bio. 
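After this refactoring, raid0_make_request() reduces to roughly the following (a sketch; flush, discard and error handling elided):

	if (sectors < bio_sectors(bio)) {
		struct bio *split = bio_split(bio, sectors, GFP_NOIO,
					      &mddev->bio_set);
		bio_chain(split, bio);
		submit_bio_noacct(bio);		/* remainder, still unmapped here */
		bio = split;
	}
	raid0_map_submit_bio(mddev, bio);

as shown in the diff below; a later patch in this series maps the remainder as well.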
Signed-off-by: Jan Kara
Reviewed-by: Yu Kuai
Link: https://lore.kernel.org/r/20230814092720.3931-1-jack@suse.cz
Signed-off-by: Song Liu
---
 drivers/md/raid0.c | 79 +++++++++++++++++++++++-----------------------
 1 file changed, 40 insertions(+), 39 deletions(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 4106d943aae75..91b24c510b7f6 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -545,54 +545,21 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
 	bio_endio(bio);
 }
 
-static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
+static void raid0_map_submit_bio(struct mddev *mddev, struct bio *bio)
 {
 	struct r0conf *conf = mddev->private;
 	struct strip_zone *zone;
 	struct md_rdev *tmp_dev;
-	sector_t bio_sector;
-	sector_t sector;
-	sector_t orig_sector;
-	unsigned chunk_sects;
-	unsigned sectors;
-
-	if (unlikely(bio->bi_opf & REQ_PREFLUSH)
-	    && md_flush_request(mddev, bio))
-		return true;
-
-	if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) {
-		raid0_handle_discard(mddev, bio);
-		return true;
-	}
-
-	bio_sector = bio->bi_iter.bi_sector;
-	sector = bio_sector;
-	chunk_sects = mddev->chunk_sectors;
-
-	sectors = chunk_sects -
-		(likely(is_power_of_2(chunk_sects))
-		 ? (sector & (chunk_sects-1))
-		 : sector_div(sector, chunk_sects));
-
-	/* Restore due to sector_div */
-	sector = bio_sector;
-
-	if (sectors < bio_sectors(bio)) {
-		struct bio *split = bio_split(bio, sectors, GFP_NOIO,
-					      &mddev->bio_set);
-		bio_chain(split, bio);
-		submit_bio_noacct(bio);
-		bio = split;
-	}
+	sector_t bio_sector = bio->bi_iter.bi_sector;
+	sector_t sector = bio_sector;
 
 	if (bio->bi_pool != &mddev->bio_set)
 		md_account_bio(mddev, &bio);
 
-	orig_sector = sector;
 	zone = find_zone(mddev->private, &sector);
 	switch (conf->layout) {
 	case RAID0_ORIG_LAYOUT:
-		tmp_dev = map_sector(mddev, zone, orig_sector, &sector);
+		tmp_dev = map_sector(mddev, zone, bio_sector, &sector);
 		break;
 	case RAID0_ALT_MULTIZONE_LAYOUT:
 		tmp_dev = map_sector(mddev, zone, sector, &sector);
@@ -600,13 +567,13 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
 	default:
 		WARN(1, "md/raid0:%s: Invalid layout\n", mdname(mddev));
 		bio_io_error(bio);
-		return true;
+		return;
 	}
 
 	if (unlikely(is_rdev_broken(tmp_dev))) {
 		bio_io_error(bio);
 		md_error(mddev, tmp_dev);
-		return true;
+		return;
 	}
 
 	bio_set_dev(bio, tmp_dev->bdev);
@@ -618,6 +585,40 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
 			      bio_sector);
 	mddev_check_write_zeroes(mddev, bio);
 	submit_bio_noacct(bio);
+}
+
+static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
+{
+	sector_t sector;
+	unsigned chunk_sects;
+	unsigned sectors;
+
+	if (unlikely(bio->bi_opf & REQ_PREFLUSH)
+	    && md_flush_request(mddev, bio))
+		return true;
+
+	if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) {
+		raid0_handle_discard(mddev, bio);
+		return true;
+	}
+
+	sector = bio->bi_iter.bi_sector;
+	chunk_sects = mddev->chunk_sectors;
+
+	sectors = chunk_sects -
+		(likely(is_power_of_2(chunk_sects))
+		 ? (sector & (chunk_sects-1))
+		 : sector_div(sector, chunk_sects));
+
+	if (sectors < bio_sectors(bio)) {
+		struct bio *split = bio_split(bio, sectors, GFP_NOIO,
+					      &mddev->bio_set);
+		bio_chain(split, bio);
+		submit_bio_noacct(bio);
+		bio = split;
+	}
+
+	raid0_map_submit_bio(mddev, bio);
 	return true;
 }

From 319ff40a542736d67e5bce18635de35d0e7a0bff Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Mon, 14 Aug 2023 11:27:08 +0200
Subject: [PATCH 104/113] md/raid0: Fix performance regression for large sequential writes

Commit f00d7c85be9e ("md/raid0: fix up bio splitting.") among other things changed how a bio that needs to be split is submitted. Before this commit, we split the bio, then mapped and submitted each part. After this commit, we map only the first part of the split bio and submit the second part unmapped. Due to bio sorting in __submit_bio_noacct() this results in the following request ordering:

  9,0   18     1181     0.525037895 15995  Q  WS 1479315464 + 63392

  Split off chunk-sized (1024 sectors) request:

  9,0   18     1182     0.629019647 15995  X  WS 1479315464 / 1479316488

  Request is unaligned to the chunk so it's split in raid0_make_request(). This is the first part mapped and punted to bio_list:

  8,0   18     7053     0.629020455 15995  A  WS 739921928 + 1016 <- (9,0) 1479315464

  Now raid0_make_request() returns, the second part is postponed on bio_list. __submit_bio_noacct() resorts the bio_list, the mapped request is submitted to the underlying device:

  8,0   18     7054     0.629022782 15995  G  WS 739921928 + 1016

  Now we take another request from the bio_list, which is the remainder of the original huge request. Split off another chunk-sized bit from it and the situation repeats:

  9,0   18     1183     0.629024499 15995  X  WS 1479316488 / 1479317512
  8,16  18     6998     0.629025110 15995  A  WS 739921928 + 1016 <- (9,0) 1479316488
  8,16  18     6999     0.629026728 15995  G  WS 739921928 + 1016
  ...
  9,0   18     1184     0.629032940 15995  X  WS 1479317512 / 1479318536 [libnetacq-write]
  8,0   18     7059     0.629033294 15995  A  WS 739922952 + 1016 <- (9,0) 1479317512
  8,0   18     7060     0.629033902 15995  G  WS 739922952 + 1016
  ...

  This repeats until we consume the whole original huge request. Now we finally get to processing the second parts of the split off requests (in reverse order):

  8,16  18     7181     0.629161384 15995  A  WS 739952640 + 8 <- (9,0) 1479377920
  8,0   18     7239     0.629162140 15995  A  WS 739952640 + 8 <- (9,0) 1479376896
  8,16  18     7186     0.629163881 15995  A  WS 739951616 + 8 <- (9,0) 1479375872
  8,0   18     7242     0.629164421 15995  A  WS 739951616 + 8 <- (9,0) 1479374848
  ...

I guess it is obvious that this IO pattern is an extremely inefficient way to perform sequential IO. It also makes the bio_list grow to rather long lengths. Change raid0_make_request() to map both parts of the split bio. Since we know we are provided with at most chunk-sized bios, we will always need to split the incoming bio at most once.
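With the one-line change below, the split path maps both halves immediately, condensed:

	if (sectors < bio_sectors(bio)) {
		struct bio *split = bio_split(bio, sectors, GFP_NOIO,
					      &mddev->bio_set);
		bio_chain(split, bio);
		raid0_map_submit_bio(mddev, bio);	/* map the remainder now */
		bio = split;
	}
	raid0_map_submit_bio(mddev, bio);

so the unmapped remainder no longer bounces through the bio_list.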
Fixes: f00d7c85be9e ("md/raid0: fix up bio splitting.")
Signed-off-by: Jan Kara
Reviewed-by: Yu Kuai
Link: https://lore.kernel.org/r/20230814092720.3931-2-jack@suse.cz
Signed-off-by: Song Liu
---
 drivers/md/raid0.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 91b24c510b7f6..abbd77977f984 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -614,7 +614,7 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
 		struct bio *split = bio_split(bio, sectors, GFP_NOIO,
 					      &mddev->bio_set);
 		bio_chain(split, bio);
-		submit_bio_noacct(bio);
+		raid0_map_submit_bio(mddev, bio);
 		bio = split;
 	}

From cc22b5407e9ca76adb7efeed843146510b1b72a5 Mon Sep 17 00:00:00 2001
From: David Jeffery
Date: Wed, 16 Aug 2023 14:13:55 -0400
Subject: [PATCH 105/113] md: raid0: account for split bio in iostat accounting

When a bio is split by md raid0, the newly created bio will not be tracked by md for I/O accounting. Only the portion of I/O still assigned to the original bio, which was reduced by the split, will be accounted for. This results in md iostat data sometimes showing I/O values far below the actual amount of data being sent through md. md_account_bio() needs to be called for all bios generated by the bio split.

A simple example of the issue was generated using a raid0 device on partitions of the same device. Since all raid0 I/O then goes to one device, it is easy to see a gap between the md device and its sd storage. Reading an lvm device on top of the md device, the iostat output (some 0 columns and extra devices removed to make the data more compact) was:

Device             tps    kB_read/s    kB_wrtn/s    kB_dscd/s    kB_read
md2               0.00         0.00         0.00         0.00          0
sde               0.00         0.00         0.00         0.00          0
md2            1364.00    411496.00         0.00         0.00     411496
sde            1734.00    646144.00         0.00         0.00     646144
md2            1699.00    510680.00         0.00         0.00     510680
sde            2155.00    802784.00         0.00         0.00     802784
md2             803.00    241480.00         0.00         0.00     241480
sde            1016.00    377888.00         0.00         0.00     377888
md2               0.00         0.00         0.00         0.00          0
sde               0.00         0.00         0.00         0.00          0

I/O was generated doing large direct I/O reads (12M) with dd to a linear lvm volume on top of the 4-leg raid0 device. The md2 reads were showing as roughly 2/3 of the reads to the sde device containing all of md2's raid partitions. The sum of reads to sde was 1826816 kB, which was the expected amount as it was the amount read by dd. With the patch, the total reads from md will match the reads from sde and be consistent with the amount of I/O generated.
Fixes: 10764815ff47 ("md: add io accounting for raid0 and raid5")
Signed-off-by: David Jeffery
Tested-by: Laurence Oberman
Reviewed-by: Laurence Oberman
Reviewed-by: Yu Kuai
Signed-off-by: Song Liu
Link: https://lore.kernel.org/r/20230816181433.13289-1-djeffery@redhat.com
---
 drivers/md/raid0.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index abbd77977f984..c50a7abda744a 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -553,8 +553,7 @@ static void raid0_map_submit_bio(struct mddev *mddev, struct bio *bio)
 	sector_t bio_sector = bio->bi_iter.bi_sector;
 	sector_t sector = bio_sector;
 
-	if (bio->bi_pool != &mddev->bio_set)
-		md_account_bio(mddev, &bio);
+	md_account_bio(mddev, &bio);
 
 	zone = find_zone(mddev->private, &sector);
 	switch (conf->layout) {

From 851e06297f20bbd85c93bbf09469f2150d1db218 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Thu, 10 Aug 2023 20:43:26 +0800
Subject: [PATCH 106/113] ublk: zoned: support REQ_OP_ZONE_RESET_ALL

There isn't any reason not to support REQ_OP_ZONE_RESET_ALL given that everything is actually handled in userspace, not to mention it is pretty easy to support RESET_ALL.

So enable REQ_OP_ZONE_RESET_ALL and let userspace handle it.

Verified by 'tools/zbc_reset_zone -all /dev/ublkb0' in libzbc[1] with a libublk-rs based ublk-zoned target prototype[2]. The following command line creates the ublk-zoned device:

cargo run --example zoned -- add -1 1024   # add $dev_id $DEV_SIZE

[1] https://github.com/westerndigitalcorporation/libzbc
[2] https://github.com/ming1/libublk-rs/tree/zoned.v2

Cc: Niklas Cassel
Cc: Damien Le Moal
Cc: Andreas Hindborg
Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20230810124326.321472-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
 drivers/block/ublk_drv.c      | 7 +++++--
 include/uapi/linux/ublk_cmd.h | 1 +
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 6ecd728ee5cea..2d9cb59f8027f 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -251,6 +251,7 @@ static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
 	const struct ublk_param_zoned *p = &ub->params.zoned;
 
 	disk_set_zoned(ub->ub_disk, BLK_ZONED_HM);
+	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue);
 	blk_queue_required_elevator_features(ub->ub_disk->queue,
 					     ELEVATOR_F_ZBD_SEQ_WRITE);
 	disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
@@ -393,6 +394,9 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
 	case REQ_OP_ZONE_APPEND:
 		ublk_op = UBLK_IO_OP_ZONE_APPEND;
 		break;
+	case REQ_OP_ZONE_RESET_ALL:
+		ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
+		break;
 	case REQ_OP_DRV_IN:
 		ublk_op = pdu->operation;
 		switch (ublk_op) {
@@ -404,9 +408,8 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
 		default:
 			return BLK_STS_IOERR;
 		}
-	case REQ_OP_ZONE_RESET_ALL:
 	case REQ_OP_DRV_OUT:
-		/* We do not support reset_all and drv_out */
+		/* We do not support drv_out */
 		return BLK_STS_NOTSUPP;
 	default:
 		return BLK_STS_IOERR;

diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 2685e53e47521..b9cfc5c962682 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -245,6 +245,7 @@ struct ublksrv_ctrl_dev_info {
 #define		UBLK_IO_OP_ZONE_CLOSE		11
 #define		UBLK_IO_OP_ZONE_FINISH		12
 #define		UBLK_IO_OP_ZONE_APPEND		13
+#define		UBLK_IO_OP_ZONE_RESET_ALL	14
 #define		UBLK_IO_OP_ZONE_RESET		15
 /*
  * Construct a zone report. The report request is carried in `struct
From e1dd7bc93029024af5688253b0c05181d6e01f8e Mon Sep 17 00:00:00 2001
From: Chengming Zhou
Date: Mon, 21 Aug 2023 17:56:00 +0800
Subject: [PATCH 107/113] blk-mq: fix tags leak when shrink nr_hw_queues

Although we don't need to realloc set->tags[] when shrinking nr_hw_queues, we do need to free them. Otherwise, those tags are leaked.

How to reproduce:
1. mount -t configfs configfs /mnt
2. modprobe null_blk nr_devices=0 submit_queues=8
3. mkdir /mnt/nullb/nullb0
4. echo 1 > /mnt/nullb/nullb0/power
5. echo 4 > /mnt/nullb/nullb0/submit_queues
6. rmdir /mnt/nullb/nullb0

In step 4, 9 tags are allocated (8 submit queues and 1 poll queue); then in step 5, new_nr_hw_queues = 5 (4 submit queues and 1 poll queue). At last, in step 6, only these 5 tags are freed; the other 4 tags are leaked.

Signed-off-by: Chengming Zhou
Reviewed-by: Ming Lei
Link: https://lore.kernel.org/r/20230821095602.70742-1-chengming.zhou@linux.dev
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 687ec3f4f10d2..afad6d06eaf7e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4379,9 +4379,13 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
 				       int new_nr_hw_queues)
 {
 	struct blk_mq_tags **new_tags;
+	int i;
 
-	if (set->nr_hw_queues >= new_nr_hw_queues)
+	if (set->nr_hw_queues >= new_nr_hw_queues) {
+		for (i = new_nr_hw_queues; i < set->nr_hw_queues; i++)
+			__blk_mq_free_map_and_rqs(set, i);
 		goto done;
+	}
 
 	new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
 				GFP_KERNEL, set->numa_node);

From 2bc4d7a355a4d617452eaf1b21d6d261194b3667 Mon Sep 17 00:00:00 2001
From: Chengming Zhou
Date: Mon, 21 Aug 2023 17:56:01 +0800
Subject: [PATCH 108/113] blk-mq: delete redundant tagset map update when fallback

When increasing nr_hw_queues fails, the fallback path will use blk_mq_update_queue_map() to clear and update all maps. The extra update of the HCTX_TYPE_DEFAULT map alone is therefore not needed, so delete it.

Signed-off-by: Chengming Zhou
Reviewed-by: Ming Lei
Link: https://lore.kernel.org/r/20230821095602.70742-2-chengming.zhou@linux.dev
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index afad6d06eaf7e..22397ba815ca9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4730,7 +4730,6 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 			__blk_mq_free_map_and_rqs(set, i);
 
 		set->nr_hw_queues = prev_nr_hw_queues;
-		blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
 		goto fallback;
 	}
 	blk_mq_map_swqueue(q);

From 7222657e51b5626d10154b3e48ad441c33b5da96 Mon Sep 17 00:00:00 2001
From: Chengming Zhou
Date: Mon, 21 Aug 2023 17:56:02 +0800
Subject: [PATCH 109/113] blk-mq: prealloc tags when increase tagset nr_hw_queues

Just like blk_mq_alloc_tag_set(), it's better to prepare all tags before they are used to map to queue ctxs in blk_mq_map_swqueue(), which otherwise has to consider empty entries in set->tags[].

The good point is that we can fall back easily if increasing nr_hw_queues fails, instead of just mapping to hctx[0] on failure in blk_mq_map_swqueue(). And the fallback path already has tags free & clean handling, so all is good.
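Condensed, the grow path in blk_mq_realloc_tag_set_tags() becomes (see the diff below):

	for (i = set->nr_hw_queues; i < new_nr_hw_queues; i++) {
		if (!__blk_mq_alloc_map_and_rqs(set, i)) {
			/* unwind only the queues added by this resize */
			while (--i >= set->nr_hw_queues)
				__blk_mq_free_map_and_rqs(set, i);
			return -ENOMEM;
		}
		cond_resched();
	}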
Signed-off-by: Chengming Zhou Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20230821095602.70742-3-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/blk-mq.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index 22397ba815ca9..84400157c5f4e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4397,6 +4397,16 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, sizeof(*set->tags)); kfree(set->tags); set->tags = new_tags; + + for (i = set->nr_hw_queues; i < new_nr_hw_queues; i++) { + if (!__blk_mq_alloc_map_and_rqs(set, i)) { + while (--i >= set->nr_hw_queues) + __blk_mq_free_map_and_rqs(set, i); + return -ENOMEM; + } + cond_resched(); + } + done: set->nr_hw_queues = new_nr_hw_queues; return 0; From 9fb10726ecc5145550180aec4fd0adf0a7b1d634 Mon Sep 17 00:00:00 2001 From: Greg Joyce Date: Fri, 21 Jul 2023 16:15:32 -0500 Subject: [PATCH 110/113] block: sed-opal: Implement IOC_OPAL_DISCOVERY Add IOC_OPAL_DISCOVERY ioctl to return raw discovery data to a SED Opal application. This allows the application to display drive capabilities and state. Signed-off-by: Greg Joyce Reviewed-by: Christoph Hellwig Reviewed-by: Jonathan Derrick Acked-by: Jarkko Sakkinen Link: https://lore.kernel.org/r/20230721211534.3437070-2-gjoyce@linux.vnet.ibm.com Signed-off-by: Jens Axboe --- block/sed-opal.c | 38 ++++++++++++++++++++++++++++++++--- include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 6 ++++++ 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index c18339446ef37..67c6c4f2b4b0f 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -463,8 +463,11 @@ static int execute_steps(struct opal_dev *dev, return error; } -static int opal_discovery0_end(struct opal_dev *dev) +static int opal_discovery0_end(struct opal_dev *dev, void *data) { + struct opal_discovery *discv_out = data; /* may be NULL */ + u8 __user *buf_out; + u64 len_out; bool found_com_id = false, supported = true, single_user = false; const struct d0_header *hdr = (struct d0_header *)dev->resp; const u8 *epos = dev->resp, *cpos = dev->resp; @@ -480,6 +483,15 @@ static int opal_discovery0_end(struct opal_dev *dev) return -EFAULT; } + if (discv_out) { + buf_out = (u8 __user *)(uintptr_t)discv_out->data; + len_out = min_t(u64, discv_out->size, hlen); + if (buf_out && copy_to_user(buf_out, dev->resp, len_out)) + return -EFAULT; + + discv_out->size = hlen; /* actual size of data */ + } + epos += hlen; /* end of buffer */ cpos += sizeof(*hdr); /* current position on buffer */ @@ -565,13 +577,13 @@ static int opal_discovery0(struct opal_dev *dev, void *data) if (ret) return ret; - return opal_discovery0_end(dev); + return opal_discovery0_end(dev, data); } static int opal_discovery0_step(struct opal_dev *dev) { const struct opal_step discovery0_step = { - opal_discovery0, + opal_discovery0, NULL }; return execute_step(dev, &discovery0_step, 0); @@ -2435,6 +2447,22 @@ static int opal_secure_erase_locking_range(struct opal_dev *dev, return ret; } +static int opal_get_discv(struct opal_dev *dev, struct opal_discovery *discv) +{ + const struct opal_step discovery0_step = { + opal_discovery0, discv + }; + int ret = 0; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_step(dev, &discovery0_step, 0); + mutex_unlock(&dev->dev_lock); + if (ret) + return ret; + return discv->size; /* modified to actual length of data */ +} + static int opal_erase_locking_range(struct opal_dev *dev, struct opal_session_info 
*opal_session) { @@ -3056,6 +3084,10 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_GET_GEOMETRY: ret = opal_get_geometry(dev, arg); break; + case IOC_OPAL_DISCOVERY: + ret = opal_get_discv(dev, p); + break; + default: break; } diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index bbae1e52ab4f5..ef65f589fbebb 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -47,6 +47,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_GET_STATUS: case IOC_OPAL_GET_LR_STATUS: case IOC_OPAL_GET_GEOMETRY: + case IOC_OPAL_DISCOVERY: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index dc2efd345133f..7f5732c5bdc50 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -173,6 +173,11 @@ struct opal_geometry { __u8 __align[3]; }; +struct opal_discovery { + __u64 data; + __u64 size; +}; + #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) #define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) #define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) @@ -192,5 +197,6 @@ struct opal_geometry { #define IOC_OPAL_GET_STATUS _IOR('p', 236, struct opal_status) #define IOC_OPAL_GET_LR_STATUS _IOW('p', 237, struct opal_lr_status) #define IOC_OPAL_GET_GEOMETRY _IOR('p', 238, struct opal_geometry) +#define IOC_OPAL_DISCOVERY _IOW('p', 239, struct opal_discovery) #endif /* _UAPI_SED_OPAL_H */ From 5c82efc1aee8eb0919aa67a0d2559de5a326bd7c Mon Sep 17 00:00:00 2001 From: Greg Joyce Date: Fri, 21 Jul 2023 16:15:33 -0500 Subject: [PATCH 111/113] block: sed-opal: Implement IOC_OPAL_REVERT_LSP This is used in conjunction with IOC_OPAL_REVERT_TPR to return a drive to Original Factory State without erasing the data. If IOC_OPAL_REVERT_LSP is called with opal_revert_lsp.options bit OPAL_PRESERVE set prior to calling IOC_OPAL_REVERT_TPR, the drive global locking range will not be erased. Signed-off-by: Greg Joyce Reviewed-by: Christoph Hellwig Reviewed-by: Jonathan Derrick Acked-by: Jarkko Sakkinen Link: https://lore.kernel.org/r/20230721211534.3437070-3-gjoyce@linux.vnet.ibm.com Signed-off-by: Jens Axboe --- block/opal_proto.h | 4 ++++ block/sed-opal.c | 40 +++++++++++++++++++++++++++++++++++ include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 11 ++++++++++ 4 files changed, 56 insertions(+) diff --git a/block/opal_proto.h b/block/opal_proto.h index a4e56845dd826..dec7ce3a3edb7 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -225,6 +225,10 @@ enum opal_parameter { OPAL_SUM_SET_LIST = 0x060000, }; +enum opal_revertlsp { + OPAL_KEEP_GLOBAL_RANGE_KEY = 0x060000, +}; + /* Packets derived from: * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 * Secion: 3.2.3 ComPackets, Packets & Subpackets diff --git a/block/sed-opal.c b/block/sed-opal.c index 67c6c4f2b4b0f..e2aed7f4ebdf6 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1769,6 +1769,26 @@ static int internal_activate_user(struct opal_dev *dev, void *data) return finalize_and_send(dev, parse_and_check_status); } +static int revert_lsp(struct opal_dev *dev, void *data) +{ + struct opal_revert_lsp *rev = data; + int err; + + err = cmd_start(dev, opaluid[OPAL_THISSP_UID], + opalmethod[OPAL_REVERTSP]); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u64(&err, dev, OPAL_KEEP_GLOBAL_RANGE_KEY); + add_token_u8(&err, dev, (rev->options & OPAL_PRESERVE) ? 
+ OPAL_TRUE : OPAL_FALSE); + add_token_u8(&err, dev, OPAL_ENDNAME); + if (err) { + pr_debug("Error building REVERT SP command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + static int erase_locking_range(struct opal_dev *dev, void *data) { struct opal_session_info *session = data; @@ -2463,6 +2483,23 @@ static int opal_get_discv(struct opal_dev *dev, struct opal_discovery *discv) return discv->size; /* modified to actual length of data */ } +static int opal_revertlsp(struct opal_dev *dev, struct opal_revert_lsp *rev) +{ + /* controller will terminate session */ + const struct opal_step steps[] = { + { start_admin1LSP_opal_session, &rev->key }, + { revert_lsp, rev } + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, steps, ARRAY_SIZE(steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + static int opal_erase_locking_range(struct opal_dev *dev, struct opal_session_info *opal_session) { @@ -3084,6 +3121,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_GET_GEOMETRY: ret = opal_get_geometry(dev, arg); break; + case IOC_OPAL_REVERT_LSP: + ret = opal_revertlsp(dev, p); + break; case IOC_OPAL_DISCOVERY: ret = opal_get_discv(dev, p); break; diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index ef65f589fbebb..2f189546e1338 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -48,6 +48,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_GET_LR_STATUS: case IOC_OPAL_GET_GEOMETRY: case IOC_OPAL_DISCOVERY: + case IOC_OPAL_REVERT_LSP: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 7f5732c5bdc50..4e10675751b48 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -56,6 +56,10 @@ struct opal_key { __u8 key[OPAL_KEY_MAX]; }; +enum opal_revert_lsp_opts { + OPAL_PRESERVE = 0x01, +}; + struct opal_lr_act { struct opal_key key; __u32 sum; @@ -178,6 +182,12 @@ struct opal_discovery { __u64 size; }; +struct opal_revert_lsp { + struct opal_key key; + __u32 options; + __u32 __pad; +}; + #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) #define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) #define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) @@ -198,5 +208,6 @@ struct opal_discovery { #define IOC_OPAL_GET_LR_STATUS _IOW('p', 237, struct opal_lr_status) #define IOC_OPAL_GET_GEOMETRY _IOR('p', 238, struct opal_geometry) #define IOC_OPAL_DISCOVERY _IOW('p', 239, struct opal_discovery) +#define IOC_OPAL_REVERT_LSP _IOW('p', 240, struct opal_revert_lsp) #endif /* _UAPI_SED_OPAL_H */ From 3bfeb61256643281ac4be5b8a57e9d9da3db4335 Mon Sep 17 00:00:00 2001 From: Greg Joyce Date: Fri, 21 Jul 2023 16:15:34 -0500 Subject: [PATCH 112/113] block: sed-opal: keyring support for SED keys Extend the SED block driver so it can alternatively obtain a key from a sed-opal kernel keyring. The SED ioctls will indicate the source of the key, either directly in the ioctl data or from the keyring. This allows the use of SED commands in scripts such as udev scripts so that drives may be automatically unlocked as they become available. 
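As an illustrative sketch only (unlock_from_keyring() is a hypothetical helper; error handling is trimmed, and the key_type field and OPAL_KEYRING value follow the uapi changes in this patch), a userspace unlock that takes its key from the kernel ".sed_opal" keyring instead of the ioctl payload might look like:

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/sed-opal.h>

    /* sketch: unlock the global locking range with the keyring key */
    static int unlock_from_keyring(const char *path)
    {
        struct opal_lock_unlock lk = { 0 };
        int fd, ret;

        fd = open(path, O_RDWR);
        if (fd < 0)
            return -1;
        lk.session.who = OPAL_ADMIN1;
        lk.session.opal_key.lr = 0;                  /* global locking range */
        lk.session.opal_key.key_type = OPAL_KEYRING; /* key comes from keyring */
        lk.l_state = OPAL_RW;
        ret = ioctl(fd, IOC_OPAL_LOCK_UNLOCK, &lk);
        close(fd);
        return ret;
    }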
Signed-off-by: Greg Joyce Reviewed-by: Jonathan Derrick Acked-by: Jarkko Sakkinen Link: https://lore.kernel.org/r/20230721211534.3437070-4-gjoyce@linux.vnet.ibm.com Signed-off-by: Jens Axboe --- block/Kconfig | 2 + block/sed-opal.c | 174 +++++++++++++++++++++++++++++++++- include/linux/sed-opal.h | 3 + include/uapi/linux/sed-opal.h | 8 +- 4 files changed, 184 insertions(+), 3 deletions(-) diff --git a/block/Kconfig b/block/Kconfig index 1a13ef0b1ca10..f1364d1c0d93e 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -184,6 +184,8 @@ config BLK_DEBUG_FS_ZONED config BLK_SED_OPAL bool "Logic for interfacing with Opal enabled SEDs" + depends on KEYS + select PSERIES_PLPKS if PPC_PSERIES help Builds Logic for interfacing with Opal enabled controllers. Enabling this option enables users to setup/unlock/lock diff --git a/block/sed-opal.c b/block/sed-opal.c index e2aed7f4ebdf6..6d7f25d1711ba 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -20,6 +20,9 @@ #include #include #include +#include +#include +#include #include "opal_proto.h" @@ -29,6 +32,8 @@ /* Number of bytes needed by cmd_finalize. */ #define CMD_FINALIZE_BYTES_NEEDED 7 +static struct key *sed_opal_keyring; + struct opal_step { int (*fn)(struct opal_dev *dev, void *data); void *data; @@ -269,6 +274,101 @@ static void print_buffer(const u8 *ptr, u32 length) #endif } +/* + * Allocate/update a SED Opal key and add it to the SED Opal keyring. + */ +static int update_sed_opal_key(const char *desc, u_char *key_data, int keylen) +{ + key_ref_t kr; + + if (!sed_opal_keyring) + return -ENOKEY; + + kr = key_create_or_update(make_key_ref(sed_opal_keyring, true), "user", + desc, (const void *)key_data, keylen, + KEY_USR_VIEW | KEY_USR_SEARCH | KEY_USR_WRITE, + KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_BUILT_IN | + KEY_ALLOC_BYPASS_RESTRICTION); + if (IS_ERR(kr)) { + pr_err("Error adding SED key (%ld)\n", PTR_ERR(kr)); + return PTR_ERR(kr); + } + + return 0; +} + +/* + * Read a SED Opal key from the SED Opal keyring. 
+ */ +static int read_sed_opal_key(const char *key_name, u_char *buffer, int buflen) +{ + int ret; + key_ref_t kref; + struct key *key; + + if (!sed_opal_keyring) + return -ENOKEY; + + kref = keyring_search(make_key_ref(sed_opal_keyring, true), + &key_type_user, key_name, true); + + if (IS_ERR(kref)) + ret = PTR_ERR(kref); + + key = key_ref_to_ptr(kref); + down_read(&key->sem); + ret = key_validate(key); + if (ret == 0) { + if (buflen > key->datalen) + buflen = key->datalen; + + ret = key->type->read(key, (char *)buffer, buflen); + } + up_read(&key->sem); + + key_ref_put(kref); + + return ret; +} + +static int opal_get_key(struct opal_dev *dev, struct opal_key *key) +{ + int ret = 0; + + switch (key->key_type) { + case OPAL_INCLUDED: + /* the key is ready to use */ + break; + case OPAL_KEYRING: + /* the key is in the keyring */ + ret = read_sed_opal_key(OPAL_AUTH_KEY, key->key, OPAL_KEY_MAX); + if (ret > 0) { + if (ret > U8_MAX) { + ret = -ENOSPC; + goto error; + } + key->key_len = ret; + key->key_type = OPAL_INCLUDED; + } + break; + default: + ret = -EINVAL; + break; + } + if (ret < 0) + goto error; + + /* must have a PEK by now or it's an error */ + if (key->key_type != OPAL_INCLUDED || key->key_len == 0) { + ret = -EINVAL; + goto error; + } + return 0; +error: + pr_debug("Error getting password: %d\n", ret); + return ret; +} + static bool check_tper(const void *data) { const struct d0_tper_features *tper = data; @@ -2459,6 +2559,9 @@ static int opal_secure_erase_locking_range(struct opal_dev *dev, }; int ret; + ret = opal_get_key(dev, &opal_session->opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps)); @@ -2492,6 +2595,9 @@ static int opal_revertlsp(struct opal_dev *dev, struct opal_revert_lsp *rev) }; int ret; + ret = opal_get_key(dev, &rev->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, steps, ARRAY_SIZE(steps)); @@ -2510,6 +2616,9 @@ static int opal_erase_locking_range(struct opal_dev *dev, }; int ret; + ret = opal_get_key(dev, &opal_session->opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps)); @@ -2538,6 +2647,9 @@ static int opal_enable_disable_shadow_mbr(struct opal_dev *dev, opal_mbr->enable_disable != OPAL_MBR_DISABLE) return -EINVAL; + ret = opal_get_key(dev, &opal_mbr->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); @@ -2563,6 +2675,9 @@ static int opal_set_mbr_done(struct opal_dev *dev, mbr_done->done_flag != OPAL_MBR_NOT_DONE) return -EINVAL; + ret = opal_get_key(dev, &mbr_done->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); @@ -2584,6 +2699,9 @@ static int opal_write_shadow_mbr(struct opal_dev *dev, if (info->size == 0) return 0; + ret = opal_get_key(dev, &info->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); @@ -2641,6 +2759,9 @@ static int opal_add_user_to_lr(struct opal_dev *dev, return -EINVAL; } + ret = opal_get_key(dev, &lk_unlk->session.opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, steps, ARRAY_SIZE(steps)); @@ -2663,6 +2784,10 @@ static int opal_reverttper(struct opal_dev *dev, struct 
opal_key *opal, bool psi int ret; + ret = opal_get_key(dev, opal); + + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); if (psid) @@ -2763,6 +2888,9 @@ static int opal_lock_unlock(struct opal_dev *dev, if (lk_unlk->session.who > OPAL_USER9) return -EINVAL; + ret = opal_get_key(dev, &lk_unlk->session.opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); opal_lock_check_for_saved_key(dev, lk_unlk); ret = __opal_lock_unlock(dev, lk_unlk); @@ -2786,6 +2914,9 @@ static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal) if (!dev) return -ENODEV; + ret = opal_get_key(dev, opal); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, owner_steps, ARRAY_SIZE(owner_steps)); @@ -2808,6 +2939,9 @@ static int opal_activate_lsp(struct opal_dev *dev, if (!opal_lr_act->num_lrs || opal_lr_act->num_lrs > OPAL_MAX_LRS) return -EINVAL; + ret = opal_get_key(dev, &opal_lr_act->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, active_steps, ARRAY_SIZE(active_steps)); @@ -2826,6 +2960,9 @@ static int opal_setup_locking_range(struct opal_dev *dev, }; int ret; + ret = opal_get_key(dev, &opal_lrs->session.opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps)); @@ -2879,6 +3016,14 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw) ret = execute_steps(dev, pw_steps, ARRAY_SIZE(pw_steps)); mutex_unlock(&dev->dev_lock); + if (ret) + return ret; + + /* update keyring with new password */ + ret = update_sed_opal_key(OPAL_AUTH_KEY, + opal_pw->new_user_pw.opal_key.key, + opal_pw->new_user_pw.opal_key.key_len); + return ret; } @@ -2899,6 +3044,9 @@ static int opal_activate_user(struct opal_dev *dev, return -EINVAL; } + ret = opal_get_key(dev, &opal_session->opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, act_steps, ARRAY_SIZE(act_steps)); @@ -2985,6 +3133,9 @@ static int opal_generic_read_write_table(struct opal_dev *dev, { int ret, bit_set; + ret = opal_get_key(dev, &rw_tbl->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); @@ -3053,9 +3204,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (!dev) - return -ENOTSUPP; + return -EOPNOTSUPP; if (!(dev->flags & OPAL_FL_SUPPORTED)) - return -ENOTSUPP; + return -EOPNOTSUPP; if (cmd & IOC_IN) { p = memdup_user(arg, _IOC_SIZE(cmd)); @@ -3137,3 +3288,22 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) return ret; } EXPORT_SYMBOL_GPL(sed_ioctl); + +static int __init sed_opal_init(void) +{ + struct key *kr; + + kr = keyring_alloc(".sed_opal", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), + (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | + KEY_USR_READ | KEY_USR_SEARCH | KEY_USR_WRITE, + KEY_ALLOC_NOT_IN_QUOTA, + NULL, NULL); + if (IS_ERR(kr)) + return PTR_ERR(kr); + + sed_opal_keyring = kr; + + return 0; +} +late_initcall(sed_opal_init); diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 2f189546e1338..2ac50822554e8 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -25,6 +25,9 @@ bool opal_unlock_from_suspend(struct opal_dev *dev); struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv); int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *ioctl_ptr); 
+#define OPAL_AUTH_KEY "opal-boot-pin" +#define OPAL_AUTH_KEY_PREV "opal-boot-pin-prev" + static inline bool is_sed_ioctl(unsigned int cmd) { switch (cmd) { diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 4e10675751b48..d3994b7716bc6 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -49,10 +49,16 @@ enum opal_lock_flags { OPAL_SAVE_FOR_LOCK = 0x01, }; +enum opal_key_type { + OPAL_INCLUDED = 0, /* key[] is the key */ + OPAL_KEYRING, /* key is in keyring */ +}; + struct opal_key { __u8 lr; __u8 key_len; - __u8 __align[6]; + __u8 key_type; + __u8 __align[5]; __u8 key[OPAL_KEY_MAX]; }; From 146afeb235ccec10c17ad8ea26327c0c79dbd968 Mon Sep 17 00:00:00 2001 From: Xu Panda Date: Sat, 3 Dec 2022 14:22:58 +0800 Subject: [PATCH 113/113] block: use strscpy() instead of strncpy() The implementation of strscpy() is more robust and safer, and it is now the recommended way to copy NUL-terminated strings. Signed-off-by: Xu Panda Signed-off-by: Yang Yang Reviewed-by: Justin Stitt Link: https://lore.kernel.org/r/202212031422587503771@zte.com.cn Signed-off-by: Jens Axboe --- block/partitions/cmdline.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c index 1af610f0ba8c6..c03bc105e5753 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c @@ -81,8 +81,7 @@ static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) length = min_t(int, next - partdef, sizeof(new_subpart->name) - 1); - strncpy(new_subpart->name, partdef, length); - new_subpart->name[length] = '\0'; + strscpy(new_subpart->name, partdef, length); partdef = ++next; } else @@ -140,8 +139,7 @@ static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) } length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); - strncpy(newparts->name, bdevdef, length); - newparts->name[length] = '\0'; + strscpy(newparts->name, bdevdef, length); newparts->nr_subparts = 0; next_subpart = &newparts->subpart; @@ -153,8 +151,7 @@ static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) length = (!next) ? (sizeof(buf) - 1) : min_t(int, next - bdevdef, sizeof(buf) - 1); - strncpy(buf, bdevdef, length); - buf[length] = '\0'; + strscpy(buf, bdevdef, length); ret = parse_subpart(next_subpart, buf); if (ret) @@ -267,8 +264,7 @@ static int add_part(int slot, struct cmdline_subpart *subpart, label_min = min_t(int, sizeof(info->volname) - 1, sizeof(subpart->name)); - strncpy(info->volname, subpart->name, label_min); - info->volname[label_min] = '\0'; + strscpy(info->volname, subpart->name, label_min); snprintf(tmp, sizeof(tmp), "(%s)", info->volname); strlcat(state->pp_buf, tmp, PAGE_SIZE);