From 222f58ac68cd6f98a52faf9f26a219c996a5aa17 Mon Sep 17 00:00:00 2001 From: Azeem Shaikh Date: Mon, 3 Jul 2023 17:21:58 +0000 Subject: [PATCH 001/113] kyber: Replace strlcpy with strscpy strlcpy() reads the entire source buffer first. This read may exceed the destination size limit. This is both inefficient and can lead to linear read overflows if a source string is not NUL-terminated [1]. In an effort to remove strlcpy() completely [2], replace strlcpy() here with strscpy(). No return values were used, so direct replacement is safe. [1] https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy [2] https://github.com/KSPP/linux/issues/89 Signed-off-by: Azeem Shaikh Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20230703172159.3668349-2-azeemshaikh38@gmail.com Signed-off-by: Jens Axboe --- include/trace/events/kyber.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/trace/events/kyber.h b/include/trace/events/kyber.h index bf7533f171ff9..9d44781efc1c4 100644 --- a/include/trace/events/kyber.h +++ b/include/trace/events/kyber.h @@ -31,8 +31,8 @@ TRACE_EVENT(kyber_latency, TP_fast_assign( __entry->dev = dev; - strlcpy(__entry->domain, domain, sizeof(__entry->domain)); - strlcpy(__entry->type, type, sizeof(__entry->type)); + strscpy(__entry->domain, domain, sizeof(__entry->domain)); + strscpy(__entry->type, type, sizeof(__entry->type)); __entry->percentile = percentile; __entry->numerator = numerator; __entry->denominator = denominator; @@ -59,7 +59,7 @@ TRACE_EVENT(kyber_adjust, TP_fast_assign( __entry->dev = dev; - strlcpy(__entry->domain, domain, sizeof(__entry->domain)); + strscpy(__entry->domain, domain, sizeof(__entry->domain)); __entry->depth = depth; ), @@ -81,7 +81,7 @@ TRACE_EVENT(kyber_throttled, TP_fast_assign( __entry->dev = dev; - strlcpy(__entry->domain, domain, sizeof(__entry->domain)); + strscpy(__entry->domain, domain, sizeof(__entry->domain)); ), TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), From 16291561e125ef50cb1875227fce9c1583f0de3c Mon Sep 17 00:00:00 2001 From: Azeem Shaikh Date: Mon, 3 Jul 2023 17:21:59 +0000 Subject: [PATCH 002/113] blk-wbt: Replace strlcpy with strscpy strlcpy() reads the entire source buffer first. This read may exceed the destination size limit. This is both inefficient and can lead to linear read overflows if a source string is not NUL-terminated [1]. In an effort to remove strlcpy() completely [2], replace strlcpy() here with strscpy(). No return values were used, so direct replacement is safe. 
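For reference, a minimal sketch (illustrative only, not part of either patch) of the semantic difference between the two calls:

	char dst[8];

	/*
	 * strlcpy() walks all of src to compute its length and returns
	 * strlen(src), so src must be NUL-terminated or the read runs
	 * past the end of the source buffer.
	 */
	size_t n = strlcpy(dst, "much-longer-source", sizeof(dst)); /* n == 18 */

	/*
	 * strscpy() reads at most sizeof(dst) bytes from src and returns
	 * the number of bytes copied, or -E2BIG if the result was
	 * truncated -- and no caller converted here used the return value.
	 */
	ssize_t m = strscpy(dst, "much-longer-source", sizeof(dst)); /* m == -E2BIG */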
[1] https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy [2] https://github.com/KSPP/linux/issues/89 Signed-off-by: Azeem Shaikh Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20230703172159.3668349-3-azeemshaikh38@gmail.com Signed-off-by: Jens Axboe --- include/trace/events/wbt.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/trace/events/wbt.h b/include/trace/events/wbt.h index 9c66e59d859cb..4661f0d27062d 100644 --- a/include/trace/events/wbt.h +++ b/include/trace/events/wbt.h @@ -33,7 +33,7 @@ TRACE_EVENT(wbt_stat, ), TP_fast_assign( - strlcpy(__entry->name, bdi_dev_name(bdi), + strscpy(__entry->name, bdi_dev_name(bdi), ARRAY_SIZE(__entry->name)); __entry->rmean = stat[0].mean; __entry->rmin = stat[0].min; @@ -68,7 +68,7 @@ TRACE_EVENT(wbt_lat, ), TP_fast_assign( - strlcpy(__entry->name, bdi_dev_name(bdi), + strscpy(__entry->name, bdi_dev_name(bdi), ARRAY_SIZE(__entry->name)); __entry->lat = div_u64(lat, 1000); ), @@ -105,7 +105,7 @@ TRACE_EVENT(wbt_step, ), TP_fast_assign( - strlcpy(__entry->name, bdi_dev_name(bdi), + strscpy(__entry->name, bdi_dev_name(bdi), ARRAY_SIZE(__entry->name)); __entry->msg = msg; __entry->step = step; @@ -141,7 +141,7 @@ TRACE_EVENT(wbt_timer, ), TP_fast_assign( - strlcpy(__entry->name, bdi_dev_name(bdi), + strscpy(__entry->name, bdi_dev_name(bdi), ARRAY_SIZE(__entry->name)); __entry->status = status; __entry->step = step; From ffe357c868e7796f20bc0eac61f5b952731c0fa8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 13 Jul 2023 21:29:35 +0200 Subject: [PATCH 003/113] nbd: automatically load module on genl access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a module alias to nbd.ko that allows the generic netlink core to automatically load the module when netlink messages for nbd are received. This frees the user from manually having to load the module before using nbd functionality via netlink. If the system policy allows it, this can even be used to load the nbd module from containers which would otherwise not have access to the necessary module files to do a normal "modprobe nbd". For example this avoids the following error when using nbd-client: $ nbd-client localhost 10809 /dev/nbd0 ... Error: Couldn't resolve the nbd netlink family, make sure the nbd module is loaded and your nbd driver supports the netlink interface. Signed-off-by: Thomas Weißschuh Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20230713-b4-nbd-genl-v3-1-226cbddba04b@weissschuh.net Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 8576d696c7a22..a346dbd73543f 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -2336,6 +2336,7 @@ static struct genl_family nbd_genl_family __ro_after_init = { .mcgrps = nbd_mcast_grps, .n_mcgrps = ARRAY_SIZE(nbd_mcast_grps), }; +MODULE_ALIAS_GENL_FAMILY(NBD_GENL_FAMILY_NAME); static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply) { From c4e21bcd0f9d01f9c5d6c52007f5541871a5b1de Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 7 Jul 2023 11:42:38 +0200 Subject: [PATCH 004/113] block: cleanup queue_wc_store Get rid of the local variable in queue_wc_store() and set or clear the QUEUE_FLAG_WC flag directly from the if / else if branches. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230707094239.107968-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index afc797fb0dfc4..0cde6598fb2f4 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -449,21 +449,13 @@ static ssize_t queue_wc_show(struct request_queue *q, char *page) static ssize_t queue_wc_store(struct request_queue *q, const char *page, size_t count) { - int set = -1; - if (!strncmp(page, "write back", 10)) - set = 1; + blk_queue_flag_set(QUEUE_FLAG_WC, q); else if (!strncmp(page, "write through", 13) || !strncmp(page, "none", 4)) - set = 0; - - if (set == -1) - return -EINVAL; - - if (set) - blk_queue_flag_set(QUEUE_FLAG_WC, q); - else blk_queue_flag_clear(QUEUE_FLAG_WC, q); + else + return -EINVAL; return count; } From 43c9835b144c7ce29efe142d662529662a9eb376 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 7 Jul 2023 11:42:39 +0200 Subject: [PATCH 005/113] block: don't allow enabling a cache on devices that don't support it Currently the write_cache attribute allows enabling the QUEUE_FLAG_WC flag on devices that never claimed the capability. Fix that by adding a QUEUE_FLAG_HW_WC flag that is set by blk_queue_write_cache and guards re-enabling the cache through sysfs. Note that any rescan that calls blk_queue_write_cache will still re-enable the write cache as in the current code. Fixes: 93e9d8e836cb ("block: add ability to flag write back caching on a device") Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230707094239.107968-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 7 +++++-- block/blk-sysfs.c | 11 +++++++---- include/linux/blkdev.h | 1 + 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 4dd59059b788e..0046b447268f9 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -830,10 +830,13 @@ EXPORT_SYMBOL(blk_set_queue_depth); */ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) { - if (wc) + if (wc) { + blk_queue_flag_set(QUEUE_FLAG_HW_WC, q); blk_queue_flag_set(QUEUE_FLAG_WC, q); - else + } else { + blk_queue_flag_clear(QUEUE_FLAG_HW_WC, q); blk_queue_flag_clear(QUEUE_FLAG_WC, q); + } if (fua) blk_queue_flag_set(QUEUE_FLAG_FUA, q); else diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 0cde6598fb2f4..63e4812623361 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -449,13 +449,16 @@ static ssize_t queue_wc_show(struct request_queue *q, char *page) static ssize_t queue_wc_store(struct request_queue *q, const char *page, size_t count) { - if (!strncmp(page, "write back", 10)) + if (!strncmp(page, "write back", 10)) { + if (!test_bit(QUEUE_FLAG_HW_WC, &q->queue_flags)) + return -EINVAL; blk_queue_flag_set(QUEUE_FLAG_WC, q); - else if (!strncmp(page, "write through", 13) || - !strncmp(page, "none", 4)) + } else if (!strncmp(page, "write through", 13) || + !strncmp(page, "none", 4)) { blk_queue_flag_clear(QUEUE_FLAG_WC, q); - else + } else { return -EINVAL; + } return count; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ed44a997f629f..2f5371b8482c0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -538,6 +538,7 @@ struct request_queue { #define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ #define QUEUE_FLAG_SYNCHRONOUS 11 /* always completes in submit context */ #define QUEUE_FLAG_SAME_FORCE 12 /* 
force complete on same CPU */ +#define QUEUE_FLAG_HW_WC 18 /* Write back caching supported */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ #define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */ #define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ From 660e802c76c89e871c29cd3174c07c8d23e39c35 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 17 Jul 2023 12:00:55 +0800 Subject: [PATCH 006/113] blk-mq: use percpu csd to remote complete instead of per-rq csd If a request needs to be completed remotely, we insert it into a percpu llist and call smp_call_function_single_async() if the llist was empty beforehand. We don't need a per-rq csd; a percpu csd is enough, and dropping the per-rq one shrinks struct request by 24 bytes. This is cleaner and still correct: the block softirq is guaranteed to be scheduled to consume the list whenever a new request is added to the percpu list, whether smp_call_function_single_async() returns -EBUSY or 0. Signed-off-by: Chengming Zhou Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230717040058.3993930-2-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 ++++++------ include/linux/blk-mq.h | 5 +---- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index d50b1d62a3d92..d986548696151 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -43,6 +43,7 @@ #include "blk-ioprio.h" static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); +static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd); static void blk_mq_insert_request(struct request *rq, blk_insert_t flags); static void blk_mq_request_bypass_insert(struct request *rq, @@ -1157,15 +1158,11 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq) static void blk_mq_complete_send_ipi(struct request *rq) { - struct llist_head *list; unsigned int cpu; cpu = rq->mq_ctx->cpu; - list = &per_cpu(blk_cpu_done, cpu); - if (llist_add(&rq->ipi_list, list)) { - INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq); - smp_call_function_single_async(cpu, &rq->csd); - } + if (llist_add(&rq->ipi_list, &per_cpu(blk_cpu_done, cpu))) + smp_call_function_single_async(cpu, &per_cpu(blk_cpu_csd, cpu)); } static void blk_mq_raise_softirq(struct request *rq) @@ -4829,6 +4826,9 @@ static int __init blk_mq_init(void) for_each_possible_cpu(i) init_llist_head(&per_cpu(blk_cpu_done, i)); + for_each_possible_cpu(i) + INIT_CSD(&per_cpu(blk_cpu_csd, i), + __blk_mq_complete_request_remote, NULL); open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b96e00499f9ee..67f8108576340 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -182,10 +182,7 @@ struct request { rq_end_io_fn *saved_end_io; } flush; - union { - struct __call_single_data csd; - u64 fifo_time; - }; + u64 fifo_time; /* * completion callback. From 28b241237470981a96fbd82077c8044466b61e5f Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 17 Jul 2023 12:00:56 +0800 Subject: [PATCH 007/113] blk-flush: fix rq->flush.seq for post-flush requests If the policy == (REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH), it means that the data sequence and post-flush sequence need to be done for this request. The rq->flush.seq should record what sequences have been done (or don't need to be done). So in this case pre-flush doesn't need to be done, and we should init rq->flush.seq to REQ_FSEQ_PREFLUSH, not REQ_FSEQ_POSTFLUSH. 
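For reference, the sequence bits involved are defined in block/blk-flush.c as quoted below; the snippet restates the corrected initialization (illustrative, mirroring the one-line fix in the diff that follows):

	/* bits in rq->flush.seq: a set bit means "done or not needed" */
	REQ_FSEQ_PREFLUSH  = (1 << 0),
	REQ_FSEQ_DATA      = (1 << 1),
	REQ_FSEQ_POSTFLUSH = (1 << 2),

	/*
	 * policy == REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH: the data and the
	 * post-flush steps still have to run, so only the pre-flush step
	 * may be marked as already satisfied:
	 */
	blk_rq_init_flush(rq);
	rq->flush.seq |= REQ_FSEQ_PREFLUSH;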
Fixes: 615939a2ae73 ("blk-mq: defer to the normal submission path for post-flush requests") Signed-off-by: Chengming Zhou Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230717040058.3993930-3-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/blk-flush.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index 8220517c2d67d..fdc489e0ea162 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -443,7 +443,7 @@ bool blk_insert_flush(struct request *rq) * the post flush, and then just pass the command on. */ blk_rq_init_flush(rq); - rq->flush.seq |= REQ_FSEQ_POSTFLUSH; + rq->flush.seq |= REQ_FSEQ_PREFLUSH; spin_lock_irq(&fq->mq_flush_lock); list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); spin_unlock_irq(&fq->mq_flush_lock); From b175c86739d38e41044d3136065f092a6d95aee6 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 17 Jul 2023 12:00:57 +0800 Subject: [PATCH 008/113] blk-flush: count inflight flush_data requests The flush state machine uses a doubly linked list to link all inflight flush_data requests, to avoid issuing separate post-flushes for flush_data requests that shared a PREFLUSH. So we can't reuse rq->queuelist; this is why we need rq->flush.list. In preparation for the next patch, which reuses rq->queuelist for the flush state machine, change the doubly linked list to an unsigned long counter that counts all inflight flush_data requests. This is fine since we only need to know whether any flush_data request is inflight, and a plain counter answers that. Signed-off-by: Chengming Zhou Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230717040058.3993930-4-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/blk-flush.c | 9 +++++---- block/blk.h | 5 ++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index fdc489e0ea162..fedb39031647e 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -187,7 +187,8 @@ static void blk_flush_complete_seq(struct request *rq, break; case REQ_FSEQ_DATA: - list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); + list_del_init(&rq->flush.list); + fq->flush_data_in_flight++; spin_lock(&q->requeue_lock); list_add(&rq->queuelist, &q->requeue_list); spin_unlock(&q->requeue_lock); @@ -299,7 +300,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, return; /* C2 and C3 */ - if (!list_empty(&fq->flush_data_in_flight) && + if (fq->flush_data_in_flight && time_before(jiffies, fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) return; @@ -374,6 +375,7 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, * the comment in flush_end_io(). 
*/ spin_lock_irqsave(&fq->mq_flush_lock, flags); + fq->flush_data_in_flight--; blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error); spin_unlock_irqrestore(&fq->mq_flush_lock, flags); @@ -445,7 +447,7 @@ bool blk_insert_flush(struct request *rq) blk_rq_init_flush(rq); rq->flush.seq |= REQ_FSEQ_PREFLUSH; spin_lock_irq(&fq->mq_flush_lock); - list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); + fq->flush_data_in_flight++; spin_unlock_irq(&fq->mq_flush_lock); return false; default: @@ -496,7 +498,6 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, INIT_LIST_HEAD(&fq->flush_queue[0]); INIT_LIST_HEAD(&fq->flush_queue[1]); - INIT_LIST_HEAD(&fq->flush_data_in_flight); return fq; diff --git a/block/blk.h b/block/blk.h index 608c5dcc516b5..686712e138352 100644 --- a/block/blk.h +++ b/block/blk.h @@ -15,15 +15,14 @@ struct elevator_type; extern struct dentry *blk_debugfs_root; struct blk_flush_queue { + spinlock_t mq_flush_lock; unsigned int flush_pending_idx:1; unsigned int flush_running_idx:1; blk_status_t rq_status; unsigned long flush_pending_since; struct list_head flush_queue[2]; - struct list_head flush_data_in_flight; + unsigned long flush_data_in_flight; struct request *flush_rq; - - spinlock_t mq_flush_lock; }; bool is_flush_rq(struct request *req); From 81ada09cc25e4bf2de7d2951925fb409338a545d Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 17 Jul 2023 12:00:58 +0800 Subject: [PATCH 009/113] blk-flush: reuse rq queuelist in flush state machine Since we don't need to maintain the inflight flush_data request list anymore, we can reuse rq->queuelist for the flush pending list. Note that in mq_flush_data_end_io() we need to re-initialize rq->queuelist before reusing it in the state machine at completion time, since rq->rq_next shares the same storage and the driver may have corrupted rq->queuelist through it. This patch decreases the size of struct request by 16 bytes. Signed-off-by: Chengming Zhou Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20230717040058.3993930-5-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/blk-flush.c | 17 ++++++++++------- include/linux/blk-mq.h | 1 - 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index fedb39031647e..e73dc22d05c1d 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -183,14 +183,13 @@ static void blk_flush_complete_seq(struct request *rq, /* queue for flush */ if (list_empty(pending)) fq->flush_pending_since = jiffies; - list_move_tail(&rq->flush.list, pending); + list_move_tail(&rq->queuelist, pending); break; case REQ_FSEQ_DATA: - list_del_init(&rq->flush.list); fq->flush_data_in_flight++; spin_lock(&q->requeue_lock); - list_add(&rq->queuelist, &q->requeue_list); + list_move(&rq->queuelist, &q->requeue_list); spin_unlock(&q->requeue_lock); blk_mq_kick_requeue_list(q); break; @@ -202,7 +201,7 @@ static void blk_flush_complete_seq(struct request *rq, * flush data request completion path. Restore @rq for * normal completion and end it. 
*/ - list_del_init(&rq->flush.list); + list_del_init(&rq->queuelist); blk_flush_restore_request(rq); blk_mq_end_request(rq, error); break; @@ -258,7 +257,7 @@ static enum rq_end_io_ret flush_end_io(struct request *flush_rq, fq->flush_running_idx ^= 1; /* and push the waiting requests to the next stage */ - list_for_each_entry_safe(rq, n, running, flush.list) { + list_for_each_entry_safe(rq, n, running, queuelist) { unsigned int seq = blk_flush_cur_seq(rq); BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); @@ -292,7 +291,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, { struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; struct request *first_rq = - list_first_entry(pending, struct request, flush.list); + list_first_entry(pending, struct request, queuelist); struct request *flush_rq = fq->flush_rq; /* C1 described at the top of this file */ @@ -376,6 +375,11 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, */ spin_lock_irqsave(&fq->mq_flush_lock, flags); fq->flush_data_in_flight--; + /* + * May have been corrupted by rq->rq_next reuse, we need to + * re-initialize rq->queuelist before reusing it here. + */ + INIT_LIST_HEAD(&rq->queuelist); blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error); spin_unlock_irqrestore(&fq->mq_flush_lock, flags); @@ -386,7 +390,6 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, static void blk_rq_init_flush(struct request *rq) { rq->flush.seq = 0; - INIT_LIST_HEAD(&rq->flush.list); rq->rq_flags |= RQF_FLUSH_SEQ; rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ rq->end_io = mq_flush_data_end_io; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 67f8108576340..01e8c31db6658 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -178,7 +178,6 @@ struct request { struct { unsigned int seq; - struct list_head list; rq_end_io_fn *saved_end_io; } flush; From 8f63fef5867fb5e8c29d9c14b6d739bfc1869d32 Mon Sep 17 00:00:00 2001 From: Nitesh Shetty Date: Wed, 19 Jul 2023 17:46:08 +0530 Subject: [PATCH 010/113] block: refactor to use helper Reduce some code by making use of bio_integrity_bytes(). Signed-off-by: Nitesh Shetty Reviewed-by: "Martin K. Petersen" Link: https://lore.kernel.org/r/20230719121608.32105-1-nj.shetty@samsung.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 4533eb4916610..8f0af7ac8573b 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -199,7 +199,6 @@ bool bio_integrity_prep(struct bio *bio) unsigned long start, end; unsigned int len, nr_pages; unsigned int bytes, offset, i; - unsigned int intervals; blk_status_t status; if (!bi) return true; @@ -224,10 +223,9 @@ bool bio_integrity_prep(struct bio *bio) !(bi->flags & BLK_INTEGRITY_GENERATE)) return true; } - intervals = bio_integrity_intervals(bi, bio_sectors(bio)); /* Allocate kernel buffer for protection data */ - len = intervals * bi->tuple_size; + len = bio_integrity_bytes(bi, bio_sectors(bio)); buf = kmalloc(len, GFP_NOIO); status = BLK_STS_RESOURCE; if (unlikely(buf == NULL)) { From cd1d83e24e689f25de7e34bea697971750138d5f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:26 -0700 Subject: [PATCH 011/113] block: tidy up the bio full checks in bio_add_hw_page bio_add_hw_page already checks if the number of bytes trying to be added even fits into the max_hw_sectors limit of the queue. Remove the call to bio_full and just do a check against the smaller of the number of segments in the bio and the queue max segments limit, and do this cheap check before the more expensive gap-to-previous check. Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Link: https://lore.kernel.org/r/20230724165433.117645-2-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/block/bio.c b/block/bio.c index 8672179213b93..72488ecea47ac 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1014,6 +1014,10 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page)) return len; + if (bio->bi_vcnt >= + min(bio->bi_max_vecs, queue_max_segments(q))) + return 0; + /* * If the queue doesn't support SG gaps and adding this segment * would create a gap, disallow it. @@ -1023,12 +1027,6 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, return 0; } - if (bio_full(bio, len)) - return 0; - - if (bio->bi_vcnt >= queue_max_segments(q)) - return 0; - bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset); bio->bi_vcnt++; bio->bi_iter.bi_size += len; From 6850b2dd5c25f27f7b74414553f047d4c12dd66c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:27 -0700 Subject: [PATCH 012/113] block: use SECTOR_SHIFT in bio_add_hw_page Use the SECTOR_SHIFT constant instead of the magic number 9. Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230724165433.117645-3-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index 72488ecea47ac..445be4bdd99bd 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1007,7 +1007,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; - if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors) + if (((bio->bi_iter.bi_size + len) >> SECTOR_SHIFT) > max_sectors) return 0; if (bio->bi_vcnt > 0) { From 939e1a370330841b2c0292a483d7b38f3ee45f88 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:28 -0700 Subject: [PATCH 013/113] block: move the BIO_CLONED checks out of __bio_try_merge_page __bio_try_merge_page is way too low-level a helper to assert that the bio is not cloned. Move the check into bio_add_page and bio_iov_iter_get_pages instead, which are the high level entry points that should enforce this invariant. bio_add_hw_page already has this check, covering the third (indirect) caller of __bio_try_merge_page. 
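To see what the moved check protects against, an illustrative sketch (not from the patch; bio_alloc_clone() and fs_bio_set are the stock kernel APIs):

	/*
	 * A clone shares the parent's bvec array (clone->bi_io_vec ==
	 * parent->bi_io_vec), so appending a page to the clone would
	 * grow bi_vcnt into vectors the parent still owns.
	 */
	struct bio *clone = bio_alloc_clone(bdev, parent, GFP_NOIO, &fs_bio_set);

	/* With this patch the high-level entry point fails early instead
	 * of corrupting the shared array: */
	if (bio_add_page(clone, page, len, 0) == 0)
		;	/* WARN_ON_ONCE(BIO_CLONED) fired, nothing added */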
Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230724165433.117645-4-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/block/bio.c b/block/bio.c index 445be4bdd99bd..3ac72e60e7f11 100644 --- a/block/bio.c +++ b/block/bio.c @@ -945,9 +945,6 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, static bool __bio_try_merge_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off, bool *same_page) { - if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) - return false; - if (bio->bi_vcnt > 0) { struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; @@ -1127,6 +1124,9 @@ int bio_add_page(struct bio *bio, struct page *page, { bool same_page = false; + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return 0; + if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { if (bio_full(bio, len)) return 0; @@ -1335,6 +1335,9 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { int ret = 0; + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return -EIO; + if (iov_iter_is_bvec(iter)) { bio_iov_bvec_set(bio, iter); iov_iter_advance(iter, bio->bi_iter.bi_size); From 0eca8b6f97ac705c5806f7d062207379094fb114 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:29 -0700 Subject: [PATCH 014/113] block: move the bi_vcnt check out of __bio_try_merge_page Move the bi_vcnt out of __bio_try_merge_page and into the two callers that don't already have it in preparation for additional changes to __bio_try_merge_page. Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230724165433.117645-5-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 45 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/block/bio.c b/block/bio.c index 3ac72e60e7f11..3dcbe98580dce 100644 --- a/block/bio.c +++ b/block/bio.c @@ -945,20 +945,17 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, static bool __bio_try_merge_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off, bool *same_page) { - if (bio->bi_vcnt > 0) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; - - if (page_is_mergeable(bv, page, len, off, same_page)) { - if (bio->bi_iter.bi_size > UINT_MAX - len) { - *same_page = false; - return false; - } - bv->bv_len += len; - bio->bi_iter.bi_size += len; - return true; - } + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (!page_is_mergeable(bv, page, len, off, same_page)) + return false; + if (bio->bi_iter.bi_size > UINT_MAX - len) { + *same_page = false; + return false; } - return false; + bv->bv_len += len; + bio->bi_iter.bi_size += len; + return true; } /* @@ -1127,11 +1124,13 @@ int bio_add_page(struct bio *bio, struct page *page, if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; - if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { - if (bio_full(bio, len)) - return 0; - __bio_add_page(bio, page, len, offset); - } + if (bio->bi_vcnt > 0 && + __bio_try_merge_page(bio, page, len, offset, &same_page)) + return len; + + if (bio_full(bio, len)) + return 0; + __bio_add_page(bio, page, len, offset); return len; } EXPORT_SYMBOL(bio_add_page); @@ -1205,13 +1204,13 @@ static int bio_iov_add_page(struct bio *bio, struct page *page, { bool same_page = false; - if (!__bio_try_merge_page(bio, page, 
len, offset, &same_page)) { - __bio_add_page(bio, page, len, offset); + if (bio->bi_vcnt > 0 && + __bio_try_merge_page(bio, page, len, offset, &same_page)) { + if (same_page) + bio_release_page(bio, page); return 0; } - - if (same_page) - bio_release_page(bio, page); + __bio_add_page(bio, page, len, offset); return 0; } From 613699050a49760f1d70c74f71bd0b013ca3c356 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:30 -0700 Subject: [PATCH 015/113] block: move the bi_size overflow check in __bio_try_merge_page Checking for availability in bi_size in a function that attempts to merge into an existing segment is a bit odd, as the limit also applies when adding a new segment. This code works fine as we always call __bio_try_merge_page, but contributes to sub-optimal calling conventions and doesn't lead to clear code. Move it to two of the callers instead, the third one already has a more strict check that includes max_hw_segments anyway. Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230724165433.117645-6-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/block/bio.c b/block/bio.c index 3dcbe98580dce..17f57fd2cff2f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -949,10 +949,6 @@ static bool __bio_try_merge_page(struct bio *bio, struct page *page, if (!page_is_mergeable(bv, page, len, off, same_page)) return false; - if (bio->bi_iter.bi_size > UINT_MAX - len) { - *same_page = false; - return false; - } bv->bv_len += len; bio->bi_iter.bi_size += len; return true; @@ -1123,6 +1119,8 @@ int bio_add_page(struct bio *bio, struct page *page, if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; + if (bio->bi_iter.bi_size > UINT_MAX - len) + return 0; if (bio->bi_vcnt > 0 && __bio_try_merge_page(bio, page, len, offset, &same_page)) @@ -1204,6 +1202,9 @@ static int bio_iov_add_page(struct bio *bio, struct page *page, { bool same_page = false; + if (WARN_ON_ONCE(bio->bi_iter.bi_size > UINT_MAX - len)) + return -EIO; + if (bio->bi_vcnt > 0 && __bio_try_merge_page(bio, page, len, offset, &same_page)) { if (same_page) From 80232b520314214d846eb0a65faef8b51b702fa7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:31 -0700 Subject: [PATCH 016/113] block: downgrade a bio_full call in bio_add_page bio_add_page already checks that there is space in bi_size a little earlier. So after we failed to add to an existing segment, just check that there is another one available instead of duplicating the bi_size check. 
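For context (not part of the diff), bio_full() as defined in block/bio.c performs exactly two checks, and bio_add_page() has already done the bi_size half a few lines earlier, so only the vector-count half is still needed:

	static inline bool bio_full(struct bio *bio, unsigned len)
	{
		/* no free slot left in the bvec array? */
		if (bio->bi_vcnt >= bio->bi_max_vecs)
			return true;
		/* would adding len bytes overflow bi_size? */
		if (bio->bi_iter.bi_size > UINT_MAX - len)
			return true;
		return false;
	}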
Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Link: https://lore.kernel.org/r/20230724165433.117645-7-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index 17f57fd2cff2f..d8e0e8de8cf4f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1126,7 +1126,7 @@ int bio_add_page(struct bio *bio, struct page *page, __bio_try_merge_page(bio, page, len, offset, &same_page)) return len; - if (bio_full(bio, len)) + if (bio->bi_vcnt >= bio->bi_max_vecs) return 0; __bio_add_page(bio, page, len, offset); return len; From 858c708d9efb7e8e5c6320793b778cc17cf8368a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:32 -0700 Subject: [PATCH 017/113] block: move the bi_size update out of __bio_try_merge_page The update of bi_size is the only thing in __bio_try_merge_page that needs a bio. Move it to the callers, and merge __bio_try_merge_page and page_is_mergeable into a single bvec_try_merge_page that only takes the current bvec instead of a full bio. This will allow reusing this function for supporting multi-page integrity payload bvecs. Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Link: https://lore.kernel.org/r/20230724165433.117645-8-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 57 +++++++++++++++++++---------------------------------- 1 file changed, 20 insertions(+), 37 deletions(-) diff --git a/block/bio.c b/block/bio.c index d8e0e8de8cf4f..23b7a001b5005 100644 --- a/block/bio.c +++ b/block/bio.c @@ -903,9 +903,8 @@ static inline bool bio_full(struct bio *bio, unsigned len) return false; } -static inline bool page_is_mergeable(const struct bio_vec *bv, - struct page *page, unsigned int len, unsigned int off, - bool *same_page) +static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, + unsigned int len, unsigned int off, bool *same_page) { size_t bv_end = bv->bv_offset + bv->bv_len; phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1; @@ -919,38 +918,14 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, return false; *same_page = ((vec_end_addr & PAGE_MASK) == page_addr); - if (*same_page) - return true; - else if (IS_ENABLED(CONFIG_KMSAN)) - return false; - return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE); -} - -/** - * __bio_try_merge_page - try appending data to an existing bvec. - * @bio: destination bio - * @page: start page to add - * @len: length of the data to add - * @off: offset of the data relative to @page - * @same_page: return if the segment has been merged inside the same page - * - * Try to add the data at @page + @off to the last bvec of @bio. This is a - * useful optimisation for file systems with a block size smaller than the - * page size. - * - * Warn if (@len, @off) crosses pages in case that @same_page is true. - * - * Return %true on success or %false on failure. 
- */ -static bool __bio_try_merge_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int off, bool *same_page) -{ - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + if (!*same_page) { + if (IS_ENABLED(CONFIG_KMSAN)) + return false; + if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE) + return false; + } - if (!page_is_mergeable(bv, page, len, off, same_page)) - return false; bv->bv_len += len; - bio->bi_iter.bi_size += len; return true; } @@ -972,7 +947,7 @@ static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio, return false; if (bv->bv_len + len > queue_max_segment_size(q)) return false; - return __bio_try_merge_page(bio, page, len, offset, same_page); + return bvec_try_merge_page(bv, page, len, offset, same_page); } /** @@ -1001,8 +976,11 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, return 0; if (bio->bi_vcnt > 0) { - if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page)) + if (bio_try_merge_hw_seg(q, bio, page, len, offset, + same_page)) { + bio->bi_iter.bi_size += len; return len; + } if (bio->bi_vcnt >= min(bio->bi_max_vecs, queue_max_segments(q))) @@ -1123,8 +1101,11 @@ int bio_add_page(struct bio *bio, struct page *page, return 0; if (bio->bi_vcnt > 0 && - __bio_try_merge_page(bio, page, len, offset, &same_page)) + bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], + page, len, offset, &same_page)) { + bio->bi_iter.bi_size += len; return len; + } if (bio->bi_vcnt >= bio->bi_max_vecs) return 0; @@ -1206,7 +1187,9 @@ static int bio_iov_add_page(struct bio *bio, struct page *page, return -EIO; if (bio->bi_vcnt > 0 && - __bio_try_merge_page(bio, page, len, offset, &same_page)) { + bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], + page, len, offset, &same_page)) { + bio->bi_iter.bi_size += len; if (same_page) bio_release_page(bio, page); return 0; From ae42f0b3bf65912e122fc2e8d5f6d94b51156dba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:33 -0700 Subject: [PATCH 018/113] block: don't pass a bio to bio_try_merge_hw_seg There is no good reason to pass the bio to bio_try_merge_hw_seg. Just pass the current bvec and rename the function to bvec_try_merge_hw_page. This will allow reusing this function for supporting multi-page integrity payload bvecs. Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Link: https://lore.kernel.org/r/20230724165433.117645-9-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/block/bio.c b/block/bio.c index 23b7a001b5005..c92dda962449b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -934,11 +934,10 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, * size limit. This is not for normal read/write bios, but for passthrough * or Zone Append operations that we can't split. 
*/ -static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio, - struct page *page, unsigned len, - unsigned offset, bool *same_page) +static bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, + struct page *page, unsigned len, unsigned offset, + bool *same_page) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; unsigned long mask = queue_segment_boundary(q); phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset; phys_addr_t addr2 = page_to_phys(page) + offset + len - 1; @@ -967,8 +966,6 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page) { - struct bio_vec *bvec; - if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; @@ -976,7 +973,9 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, return 0; if (bio->bi_vcnt > 0) { - if (bio_try_merge_hw_seg(q, bio, page, len, offset, + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (bvec_try_merge_hw_page(q, bv, page, len, offset, same_page)) { bio->bi_iter.bi_size += len; return len; @@ -990,8 +989,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, * If the queue doesn't support SG gaps and adding this segment * would create a gap, disallow it. */ - bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; - if (bvec_gap_to_prev(&q->limits, bvec, offset)) + if (bvec_gap_to_prev(&q->limits, bv, offset)) return 0; } From b5ca9acff553874aaf1faf176e076cbd7cc4aa0e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 21 Jul 2023 10:27:28 -0700 Subject: [PATCH 019/113] scsi: Inline scsi_kick_queue() Inline scsi_kick_queue() to prepare for modifying the second argument passed to blk_mq_run_hw_queues(). Reviewed-by: Christoph Hellwig Cc: "Martin K. Petersen" Signed-off-by: Bart Van Assche Reviewed-by: "Martin K. Petersen" Link: https://lore.kernel.org/r/20230721172731.955724-2-bvanassche@acm.org Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index ad9afae49544a..414d29eef9686 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -300,11 +300,6 @@ void scsi_device_unbusy(struct scsi_device *sdev, struct scsi_cmnd *cmd) cmd->budget_token = -1; } -static void scsi_kick_queue(struct request_queue *q) -{ - blk_mq_run_hw_queues(q, false); -} - /* * Kick the queue of SCSI device @sdev if @sdev != current_sdev. Called with * interrupts disabled. @@ -340,7 +335,7 @@ static void scsi_single_lun_run(struct scsi_device *current_sdev) * but in most cases, we will be first. Ideally, each LU on the * target would get some limited time or requests on the target. */ - scsi_kick_queue(current_sdev->request_queue); + blk_mq_run_hw_queues(current_sdev->request_queue, false); spin_lock_irqsave(shost->host_lock, flags); if (!starget->starget_sdev_user) @@ -427,7 +422,7 @@ static void scsi_starved_list_run(struct Scsi_Host *shost) continue; spin_unlock_irqrestore(shost->host_lock, flags); - scsi_kick_queue(slq); + blk_mq_run_hw_queues(slq, false); blk_put_queue(slq); spin_lock_irqsave(shost->host_lock, flags); From d42e2e3448a99c41c8489766eeb732d8d741d5be Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 21 Jul 2023 10:27:29 -0700 Subject: [PATCH 020/113] scsi: Remove a blk_mq_run_hw_queues() call blk_mq_kick_requeue_list() calls blk_mq_run_hw_queues() asynchronously. Leave out the direct blk_mq_run_hw_queues() call. 
This patch causes scsi_run_queue() to call blk_mq_run_hw_queues() asynchronously instead of synchronously. Since scsi_run_queue() is not called from the hot I/O submission path, this patch does not affect the hot path. This patch prepares for allowing blk_mq_run_hw_queue() to sleep if BLK_MQ_F_BLOCKING has been set. scsi_run_queue() may be called from atomic context and must not sleep. Hence the removal of the blk_mq_run_hw_queues(q, false) call. See also scsi_unblock_requests(). Cc: "Martin K. Petersen" Signed-off-by: Bart Van Assche Reviewed-by: "Martin K. Petersen" Link: https://lore.kernel.org/r/20230721172731.955724-3-bvanassche@acm.org Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 414d29eef9686..d4c514ab9fe87 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -447,8 +447,8 @@ static void scsi_run_queue(struct request_queue *q) if (!list_empty(&sdev->host->starved_list)) scsi_starved_list_run(sdev->host); + /* Note: blk_mq_kick_requeue_list() runs the queue asynchronously. */ blk_mq_kick_requeue_list(q); - blk_mq_run_hw_queues(q, false); } void scsi_requeue_run_queue(struct work_struct *work) From 65a558f66c308251e256317957b75d1e643c33c3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 21 Jul 2023 10:27:30 -0700 Subject: [PATCH 021/113] block: Improve performance for BLK_MQ_F_BLOCKING drivers blk_mq_run_hw_queue() runs the queue asynchronously if BLK_MQ_F_BLOCKING has been set. This is suboptimal since running the queue asynchronously is slower than running the queue synchronously. This patch modifies blk_mq_run_hw_queue() as follows if BLK_MQ_F_BLOCKING has been set: - Run the queue synchronously if it is allowed to sleep. - Run the queue asynchronously if it is not allowed to sleep. Additionally, blk_mq_run_hw_queue(hctx, false) calls are modified into blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING) if the caller may be invoked from atomic context. 
The following caller chains have been reviewed: blk_mq_run_hw_queue(hctx, false) blk_mq_get_tag() /* may sleep, hence the functions it calls may also sleep */ blk_execute_rq() /* may sleep */ blk_mq_run_hw_queues(q, async=false) blk_freeze_queue_start() /* may sleep */ blk_mq_requeue_work() /* may sleep */ scsi_kick_queue() scsi_requeue_run_queue() /* may sleep */ scsi_run_host_queues() scsi_ioctl_reset() /* may sleep */ blk_mq_insert_requests(hctx, ctx, list, run_queue_async=false) blk_mq_dispatch_plug_list(plug, from_sched=false) blk_mq_flush_plug_list(plug, from_schedule=false) __blk_flush_plug(plug, from_schedule=false) blk_add_rq_to_plug() blk_mq_submit_bio() /* may sleep if REQ_NOWAIT has not been set */ blk_mq_plug_issue_direct() blk_mq_flush_plug_list() /* see above */ blk_mq_dispatch_plug_list(plug, from_sched=false) blk_mq_flush_plug_list() /* see above */ blk_mq_try_issue_directly() blk_mq_submit_bio() /* may sleep if REQ_NOWAIT has not been set */ blk_mq_try_issue_list_directly(hctx, list) blk_mq_insert_requests() /* see above */ Cc: Christoph Hellwig Cc: Ming Lei Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20230721172731.955724-4-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 16 ++++++++++------ drivers/scsi/scsi_lib.c | 3 ++- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index d986548696151..687ec3f4f10d2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1323,7 +1323,7 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head) } blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); - blk_mq_run_hw_queue(hctx, false); + blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); @@ -2222,6 +2222,8 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) */ WARN_ON_ONCE(!async && in_interrupt()); + might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING); + /* * When queue is quiesced, we may be switching io scheduler, or * updating nr_hw_queues, or other things, and we can't run queue @@ -2237,8 +2239,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) if (!need_run) return; - if (async || (hctx->flags & BLK_MQ_F_BLOCKING) || - !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { + if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { blk_mq_delay_run_hw_queue(hctx, 0); return; } @@ -2373,7 +2374,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) { clear_bit(BLK_MQ_S_STOPPED, &hctx->state); - blk_mq_run_hw_queue(hctx, false); + blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); } EXPORT_SYMBOL(blk_mq_start_hw_queue); @@ -2403,7 +2404,8 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) unsigned long i; queue_for_each_hw_ctx(q, hctx, i) - blk_mq_start_stopped_hw_queue(hctx, async); + blk_mq_start_stopped_hw_queue(hctx, async || + (hctx->flags & BLK_MQ_F_BLOCKING)); } EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); @@ -2461,6 +2463,8 @@ static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, list_for_each_entry(rq, list, queuelist) { BUG_ON(rq->mq_ctx != ctx); trace_block_rq_insert(rq); + if (rq->cmd_flags & REQ_NOWAIT) + run_queue_async = true; } spin_lock(&ctx->lock); @@ -2621,7 +2625,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) { blk_mq_insert_request(rq, 0); - blk_mq_run_hw_queue(hctx, false); + blk_mq_run_hw_queue(hctx, 
rq->cmd_flags & REQ_NOWAIT); return; } diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index d4c514ab9fe87..59176946ab560 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -335,7 +335,8 @@ static void scsi_single_lun_run(struct scsi_device *current_sdev) * but in most cases, we will be first. Ideally, each LU on the * target would get some limited time or requests on the target. */ - blk_mq_run_hw_queues(current_sdev->request_queue, false); + blk_mq_run_hw_queues(current_sdev->request_queue, + shost->queuecommand_may_block); spin_lock_irqsave(shost->host_lock, flags); if (!starget->starget_sdev_user) From 51d74ec9b62f5813767a60226acaf943e26e7d7a Mon Sep 17 00:00:00 2001 From: Jinyoung Choi Date: Tue, 25 Jul 2023 14:18:39 +0900 Subject: [PATCH 022/113] block: cleanup bio_integrity_prep If a problem occurs in the process of creating an integrity payload, the status of the bio is always BLK_STS_RESOURCE. Reviewed-by: Christoph Hellwig Signed-off-by: Jinyoung Choi Reviewed-by: "Martin K. Petersen" Link: https://lore.kernel.org/r/20230725051839epcms2p8e4d20ad6c51326ad032e8406f59d0aaa@epcms2p8 Signed-off-by: Jens Axboe --- block/bio-integrity.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 8f0af7ac8573b..045553a164e0c 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -199,7 +199,6 @@ bool bio_integrity_prep(struct bio *bio) unsigned long start, end; unsigned int len, nr_pages; unsigned int bytes, offset, i; - blk_status_t status; if (!bi) return true; @@ -227,7 +226,6 @@ bool bio_integrity_prep(struct bio *bio) /* Allocate kernel buffer for protection data */ len = bio_integrity_bytes(bi, bio_sectors(bio)); buf = kmalloc(len, GFP_NOIO); - status = BLK_STS_RESOURCE; if (unlikely(buf == NULL)) { printk(KERN_ERR "could not allocate integrity buffer\n"); goto err_end_io; @@ -242,7 +240,6 @@ bool bio_integrity_prep(struct bio *bio) if (IS_ERR(bip)) { printk(KERN_ERR "could not allocate data integrity bioset\n"); kfree(buf); - status = BLK_STS_RESOURCE; goto err_end_io; } @@ -270,7 +267,6 @@ bool bio_integrity_prep(struct bio *bio) if (ret == 0) { printk(KERN_ERR "could not attach integrity payload\n"); - status = BLK_STS_RESOURCE; goto err_end_io; } @@ -292,7 +288,7 @@ bool bio_integrity_prep(struct bio *bio) return true; err_end_io: - bio->bi_status = status; + bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); return false; From a865b96c513bcaeec49669010d67c40aa8e58619 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 May 2023 21:20:32 +0800 Subject: [PATCH 023/113] Revert "md: unlock mddev before reap sync_thread in action_store" This reverts commit 9dfbdafda3b34e262e43e786077bab8e476a89d1, because it introduces a defect where sync_thread can still be running while MD_RECOVERY_RUNNING is cleared, which causes unexpected problems, for example the following list_add corruption:

list_add corruption. prev->next should be next (ffff0001ac1daba0), but was ffff0000ce1a02a0. (prev=ffff0000ce1a02a0).

Call trace:
 __list_add_valid+0xfc/0x140
 insert_work+0x78/0x1a0
 __queue_work+0x500/0xcf4
 queue_work_on+0xe8/0x12c
 md_check_recovery+0xa34/0xf30
 raid10d+0xb8/0x900 [raid10]
 md_thread+0x16c/0x2cc
 kthread+0x1a4/0x1ec
 ret_from_fork+0x10/0x18

This is because the work is requeued while it is still inside the workqueue:

t1:                              t2:
action_store
 mddev_lock
 if (mddev->sync_thread)
  mddev_unlock
  md_unregister_thread
  // first sync_thread is done
                                 md_check_recovery
                                  mddev_try_lock
                                  /*
                                   * once MD_RECOVERY_DONE is set, new sync_thread
                                   * can start.
                                   */
                                  set_bit(MD_RECOVERY_RUNNING, &mddev->recovery)
                                  INIT_WORK(&mddev->del_work, md_start_sync)
                                  queue_work(md_misc_wq, &mddev->del_work)
                                   test_and_set_bit(WORK_STRUCT_PENDING_BIT, ...)
                                   // set pending bit
                                   insert_work
                                    list_add_tail
  mddev_unlock
 mddev_lock_nointr
 md_reap_sync_thread
 // MD_RECOVERY_RUNNING is cleared
 mddev_unlock

t3:
// before the work queued from t2 has started
md_check_recovery
// MD_RECOVERY_RUNNING is not set, a new sync_thread can be started
INIT_WORK(&mddev->del_work, md_start_sync)
 work->data = 0
 // work pending bit is cleared
 queue_work(md_misc_wq, &mddev->del_work)
  insert_work
   list_add_tail
// list is corrupted

The above commit is reverted to fix the problem; the deadlock it originally addressed will be fixed in the following patches. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230529132037.2124527-2-yukuai1@huaweicloud.com --- drivers/md/dm-raid.c | 1 - drivers/md/md.c | 19 ++----------------- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 8846bf510a35e..1f22bef278418 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3725,7 +3725,6 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); } } else if (decipher_sync_action(mddev, mddev->recovery) != st_idle) diff --git a/drivers/md/md.c b/drivers/md/md.c index 2e38ef421d69f..d445d5fb7a01c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4764,19 +4764,6 @@ action_store(struct mddev *mddev, const char *page, size_t len) if (work_pending(&mddev->del_work)) flush_workqueue(md_misc_wq); if (mddev->sync_thread) { - sector_t save_rp = mddev->reshape_position; - - mddev_unlock(mddev); - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(&mddev->sync_thread); - mddev_lock_nointr(mddev); - /* - * set RECOVERY_INTR again and restore reshape - * position in case others changed them after - * got lock, eg, reshape_position_store and - * md_check_recovery. 
- */ - mddev->reshape_position = save_rp; set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_reap_sync_thread(mddev); } @@ -6176,7 +6163,6 @@ static void __md_stop_writes(struct mddev *mddev) flush_workqueue(md_misc_wq); if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); } @@ -9327,7 +9313,6 @@ void md_check_recovery(struct mddev *mddev) * ->spare_active and clear saved_raid_disk */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -9363,7 +9348,6 @@ void md_check_recovery(struct mddev *mddev) goto unlock; } if (mddev->sync_thread) { - md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); goto unlock; } @@ -9443,7 +9427,8 @@ void md_reap_sync_thread(struct mddev *mddev) sector_t old_dev_sectors = mddev->dev_sectors; bool is_reshaped = false; - /* sync_thread should be unregistered, collect result */ + /* resync has finished, collect result */ + md_unregister_thread(&mddev->sync_thread); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && mddev->degraded != mddev->raid_disks) { From 64e5e09afc14f8cc9058b0ed5c9cc4c8cd126b85 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 May 2023 21:20:33 +0800 Subject: [PATCH 024/113] md: refactor action_store() for 'idle' and 'frozen' Prepare to handle 'idle' and 'frozen' differently to fix a deadlock, there are no functional changes except that MD_RECOVERY_RUNNING is checked again after 'reconfig_mutex' is held. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230529132037.2124527-3-yukuai1@huaweicloud.com --- drivers/md/md.c | 61 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 16 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index d445d5fb7a01c..7fa91f0e56202 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4747,6 +4747,46 @@ action_show(struct mddev *mddev, char *page) return sprintf(page, "%s\n", type); } +static void stop_sync_thread(struct mddev *mddev) +{ + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return; + + if (mddev_lock(mddev)) + return; + + /* + * Check again in case MD_RECOVERY_RUNNING is cleared before lock is + * held. 
+ */ + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { + mddev_unlock(mddev); + return; + } + + if (work_pending(&mddev->del_work)) + flush_workqueue(md_misc_wq); + + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_reap_sync_thread(mddev); + } + + mddev_unlock(mddev); +} + +static void idle_sync_thread(struct mddev *mddev) +{ + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + stop_sync_thread(mddev); +} + +static void frozen_sync_thread(struct mddev *mddev) +{ + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + stop_sync_thread(mddev); +} + static ssize_t action_store(struct mddev *mddev, const char *page, size_t len) { @@ -4754,22 +4794,11 @@ action_store(struct mddev *mddev, const char *page, size_t len) return -EINVAL; - if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { - if (cmd_match(page, "frozen")) - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - else - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && - mddev_lock(mddev) == 0) { - if (work_pending(&mddev->del_work)) - flush_workqueue(md_misc_wq); - if (mddev->sync_thread) { - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_reap_sync_thread(mddev); - } - mddev_unlock(mddev); - } - } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + if (cmd_match(page, "idle")) + idle_sync_thread(mddev); + else if (cmd_match(page, "frozen")) + frozen_sync_thread(mddev); + else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; else if (cmd_match(page, "resync")) clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); From 6f56f0c4f1241f1694a6a9438dd4f78d4513a917 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 May 2023 21:20:34 +0800 Subject: [PATCH 025/113] md: add a mutex to synchronize idle and frozen in action_store() Currently, for idle and frozen, action_store will hold 'reconfig_mutex' and call md_reap_sync_thread() to stop sync thread, however, this will cause deadlock (explained in the next patch). In order to fix the problem, following patch will release 'reconfig_mutex' and wait on 'resync_wait', like md_set_readonly() and do_md_stop() does. Consider that action_store() will set/clear 'MD_RECOVERY_FROZEN' unconditionally, which might cause unexpected problems, for example, frozen just set 'MD_RECOVERY_FROZEN' and is still in progress, while 'idle' clear 'MD_RECOVERY_FROZEN' and new sync thread is started, which might starve in progress frozen. A mutex is added to synchronize idle and frozen from action_store(). 
Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230529132037.2124527-4-yukuai1@huaweicloud.com --- drivers/md/md.c | 5 +++++ drivers/md/md.h | 3 +++ 2 files changed, 8 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 7fa91f0e56202..3d7e87cab8ad6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -643,6 +643,7 @@ void mddev_init(struct mddev *mddev) { mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); + mutex_init(&mddev->sync_mutex); mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->all_mddevs); @@ -4777,14 +4778,18 @@ static void stop_sync_thread(struct mddev *mddev) static void idle_sync_thread(struct mddev *mddev) { + mutex_lock(&mddev->sync_mutex); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); stop_sync_thread(mddev); + mutex_unlock(&mddev->sync_mutex); } static void frozen_sync_thread(struct mddev *mddev) { + mutex_lock(&mddev->sync_mutex); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); stop_sync_thread(mddev); + mutex_unlock(&mddev->sync_mutex); } static ssize_t diff --git a/drivers/md/md.h b/drivers/md/md.h index 1aef86bf3fc31..18c168bf5fabe 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -535,6 +535,9 @@ struct mddev { */ struct list_head deleting; + /* Used to synchronize idle and frozen for action_store() */ + struct mutex sync_mutex; + bool has_superblocks:1; bool fail_last_dev:1; bool serialize_policy:1; From 130443d60b1b8c7a609a2af3384dd8e60df97181 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 May 2023 21:20:35 +0800 Subject: [PATCH 026/113] md: refactor idle/frozen_sync_thread() to fix deadlock Our test found the following deadlock in raid10: 1) Issue a normal write, and such write failed: raid10_end_write_request set_bit(R10BIO_WriteError, &r10_bio->state) one_write_done reschedule_retry // later from md thread raid10d handle_write_completed list_add(&r10_bio->retry_list, &conf->bio_end_io_list) // later from md thread raid10d if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) list_move(conf->bio_end_io_list.prev, &tmp) r10_bio = list_first_entry(&tmp, struct r10bio, retry_list) raid_end_bio_io(r10_bio) Dependency chain 1: normal io is waiting for updating superblock 2) Trigger a recovery: raid10_sync_request raise_barrier Dependency chain 2: sync thread is waiting for normal io 3) echo idle/frozen to sync_action: action_store mddev_lock md_unregister_thread kthread_stop Dependency chain 3: drop 'reconfig_mutex' is waiting for sync thread 4) md thread can't update superblock: raid10d md_check_recovery if (mddev_trylock(mddev)) md_update_sb Dependency chain 4: update superblock is waiting for 'reconfig_mutex' Hence a cyclic dependency exists; in order to fix the problem, we must break one of the chains. Dependencies 1 and 2 can't be broken because they are fundamental to the design. Breaking dependency 4 may be possible if it can be guaranteed that no io is inflight; however, this requires a new mechanism which seems complex. Breaking dependency 3 is a good choice, because idle/frozen only requires the sync thread to finish, which can be done asynchronously and is already implemented, and 'reconfig_mutex' is not needed anymore. This patch switches 'idle' and 'frozen' to waiting for the sync thread to be done asynchronously, and also adds a sequence counter to record how many times the sync thread has been done, so that 'idle' won't keep waiting on a newly started sync thread. Note that raid456 has a similar deadlock ([1]), and it's verified [2] that this deadlock can be fixed by this patch as well.
[1] https://lore.kernel.org/linux-raid/5ed54ffc-ce82-bf66-4eff-390cb23bc1ac@molgen.mpg.de/T/#t [2] https://lore.kernel.org/linux-raid/e9067438-d713-f5f3-0d3d-9e6b0e9efa0e@huaweicloud.com/ Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230529132037.2124527-5-yukuai1@huaweicloud.com --- drivers/md/md.c | 23 +++++++++++++++++++---- drivers/md/md.h | 2 ++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 3d7e87cab8ad6..920701ab95054 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -651,6 +651,7 @@ void mddev_init(struct mddev *mddev) timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); + atomic_set(&mddev->sync_seq, 0); spin_lock_init(&mddev->lock); atomic_set(&mddev->flush_pending, 0); init_waitqueue_head(&mddev->sb_wait); @@ -4768,19 +4769,27 @@ static void stop_sync_thread(struct mddev *mddev) if (work_pending(&mddev->del_work)) flush_workqueue(md_misc_wq); - if (mddev->sync_thread) { - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_reap_sync_thread(mddev); - } + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + /* + * Thread might be blocked waiting for metadata update which will now + * never happen + */ + md_wakeup_thread_directly(mddev->sync_thread); mddev_unlock(mddev); } static void idle_sync_thread(struct mddev *mddev) { + int sync_seq = atomic_read(&mddev->sync_seq); + mutex_lock(&mddev->sync_mutex); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); stop_sync_thread(mddev); + + wait_event(resync_wait, sync_seq != atomic_read(&mddev->sync_seq) || + !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); + mutex_unlock(&mddev->sync_mutex); } @@ -4789,6 +4798,10 @@ static void frozen_sync_thread(struct mddev *mddev) mutex_lock(&mddev->sync_mutex); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); stop_sync_thread(mddev); + + wait_event(resync_wait, mddev->sync_thread == NULL && + !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); + mutex_unlock(&mddev->sync_mutex); } @@ -9463,6 +9476,8 @@ void md_reap_sync_thread(struct mddev *mddev) /* resync has finished, collect result */ md_unregister_thread(&mddev->sync_thread); + atomic_inc(&mddev->sync_seq); + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && mddev->degraded != mddev->raid_disks) { diff --git a/drivers/md/md.h b/drivers/md/md.h index 18c168bf5fabe..914e6ece9af29 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -537,6 +537,8 @@ struct mddev { /* Used to synchronize idle and frozen for action_store() */ struct mutex sync_mutex; + /* The sequence number for sync thread */ + atomic_t sync_seq; bool has_superblocks:1; bool fail_last_dev:1; From 753260ed0b46d2ba0d3d6f68a6a49187bff443e4 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 May 2023 21:20:36 +0800 Subject: [PATCH 027/113] md: wake up 'resync_wait' at last in md_reap_sync_thread() md_reap_sync_thread() has just been replaced with wait_event(resync_wait, ...) in action_store(); make sure action_store() will still wait for everything to be done in md_reap_sync_thread().
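Condensed from the previous patch in this series, the waiter/waker pairing that this ordering protects looks roughly like this:

  /* action_store() side, e.g. frozen_sync_thread() */
  wait_event(resync_wait, mddev->sync_thread == NULL &&
             !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));

  /* md_reap_sync_thread() side */
  md_unregister_thread(&mddev->sync_thread);
  ...
  clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
  ...                        /* sysfs notifications, event work */
  wake_up(&resync_wait);     /* now the very last step */

Since the waiter's condition is already true once MD_RECOVERY_RUNNING is cleared, an earlier wake_up() could let action_store() return before the remaining cleanup and notifications have run.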
Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230529132037.2124527-6-yukuai1@huaweicloud.com --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 920701ab95054..e0d8e751a7828 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9522,7 +9522,6 @@ void md_reap_sync_thread(struct mddev *mddev) if (mddev_is_clustered(mddev) && is_reshaped && !test_bit(MD_CLOSING, &mddev->flags)) md_cluster_ops->update_size(mddev, old_dev_sectors); - wake_up(&resync_wait); /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); sysfs_notify_dirent_safe(mddev->sysfs_completed); @@ -9530,6 +9529,7 @@ void md_reap_sync_thread(struct mddev *mddev) md_new_event(); if (mddev->event_work.func) queue_work(md_misc_wq, &mddev->event_work); + wake_up(&resync_wait); } EXPORT_SYMBOL(md_reap_sync_thread); From f71209b1f21c838a973d858d9f6f76cd39227733 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 May 2023 21:20:37 +0800 Subject: [PATCH 028/113] md: enhance checking in md_check_recovery() For md_check_recovery(): 1) if 'MD_RECOVERY_RUNNING' is not set, register new sync_thread. 2) if 'MD_RECOVERY_RUNNING' is set: a) if 'MD_RECOVERY_DONE' is not set, don't do anything, wait for md_do_sync() to be done. b) if 'MD_RECOVERY_DONE' is set, unregister sync_thread. Current code expects that sync_thread is not NULL, otherwise a new sync_thread will be registered, which will corrupt the array. Make sure md_check_recovery() won't register a new sync_thread if 'MD_RECOVERY_RUNNING' is still set, and a new WARN_ON_ONCE() is added for the above corruption. Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230529132037.2124527-7-yukuai1@huaweicloud.com --- drivers/md/md.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index e0d8e751a7828..320d71537359a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9388,16 +9388,24 @@ void md_check_recovery(struct mddev *mddev) if (mddev->sb_flags) md_update_sb(mddev, 0); - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && - !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { - /* resync/recovery still happening */ - clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - goto unlock; - } - if (mddev->sync_thread) { + /* + * Never start a new sync thread if MD_RECOVERY_RUNNING is + * still set. + */ + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { + if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { + /* resync/recovery still happening */ + clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + goto unlock; + } + + if (WARN_ON_ONCE(!mddev->sync_thread)) + goto unlock; + md_reap_sync_thread(mddev); goto unlock; } + /* Set RUNNING before clearing NEEDED to avoid * any transients in the value of "sync_action". */ From 59cefee75bda5d4cc14f4a1ca861b69091e22c3e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Jun 2023 08:48:30 +0200 Subject: [PATCH 029/113] md-bitmap: set BITMAP_WRITE_ERROR in write_sb_page Set BITMAP_WRITE_ERROR directly in write_sb_page instead of propagating the error to the caller and setting it there.
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Himanshu Madhani Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230615064840.629492-2-hch@lst.de --- drivers/md/md-bitmap.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 1ff712889a3b3..d8469720fac23 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -279,22 +279,20 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, return 0; } -static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) +static void write_sb_page(struct bitmap *bitmap, struct page *page, int wait) { - struct md_rdev *rdev; struct mddev *mddev = bitmap->mddev; - int ret; do { - rdev = NULL; + struct md_rdev *rdev = NULL; + while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { - ret = __write_sb_page(rdev, bitmap, page); - if (ret) - return ret; + if (__write_sb_page(rdev, bitmap, page) < 0) { + set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); + return; + } } } while (wait && md_super_wait(mddev) < 0); - - return 0; } static void md_bitmap_file_kick(struct bitmap *bitmap); @@ -306,10 +304,7 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait) struct buffer_head *bh; if (bitmap->storage.file == NULL) { - switch (write_sb_page(bitmap, page, wait)) { - case -EINVAL: - set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); - } + write_sb_page(bitmap, page, wait); } else { bh = page_buffers(page); From 546ac0b2e2b15d0af7e6d10506558dded1d9d54a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Jun 2023 08:48:31 +0200 Subject: [PATCH 030/113] md-bitmap: initialize variables at declaration time in md_bitmap_file_unmap Just a small tidyup to prepare for bigger changes. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Himanshu Madhani Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230615064840.629492-3-hch@lst.de --- drivers/md/md-bitmap.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index d8469720fac23..0b2d8933cbc75 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -842,14 +842,10 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store, static void md_bitmap_file_unmap(struct bitmap_storage *store) { - struct page **map, *sb_page; - int pages; - struct file *file; - - file = store->file; - map = store->filemap; - pages = store->file_pages; - sb_page = store->sb_page; + struct file *file = store->file; + struct page *sb_page = store->sb_page; + struct page **map = store->filemap; + int pages = store->file_pages; while (pages--) if (map[pages] != sb_page) /* 0 is sb_page, release it below */ From 92348518f23f4fc81caa1a0f7f587566db67b52f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Jun 2023 08:48:32 +0200 Subject: [PATCH 031/113] md-bitmap: use %pD to print the file name in md_bitmap_file_kick Don't bother allocating an extra buffer in the I/O failure handler and instead use the printk built-in format to print the last 4 path name components. 
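For reference, %pD is the printk format for printing the path of a struct file, and a trailing digit limits the output to that many trailing path components. A hypothetical invocation, with an invented file path, to show the effect:

  pr_warn("%s: kicking failed bitmap file %pD4 from array!\n",
          bmname(bitmap), bitmap->storage.file);

  /* could print something like:
   *   md0: kicking failed bitmap file /mnt/disk/md/bitmap from array!
   * where only the last 4 components of a deeper path are shown
   */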
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Himanshu Madhani Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230615064840.629492-4-hch@lst.de --- drivers/md/md-bitmap.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 0b2d8933cbc75..e4b466522d4e7 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -870,21 +870,13 @@ static void md_bitmap_file_unmap(struct bitmap_storage *store) */ static void md_bitmap_file_kick(struct bitmap *bitmap) { - char *path, *ptr = NULL; - if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) { md_bitmap_update_sb(bitmap); if (bitmap->storage.file) { - path = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (path) - ptr = file_path(bitmap->storage.file, - path, PAGE_SIZE); - - pr_warn("%s: kicking failed bitmap file %s from array!\n", - bmname(bitmap), IS_ERR(ptr) ? "" : ptr); + pr_warn("%s: kicking failed bitmap file %pD4 from array!\n", + bmname(bitmap), bitmap->storage.file); - kfree(path); } else pr_warn("%s: disabling internal bitmap due to errors\n", bmname(bitmap)); From 5339178e5303084da7655874d1aa69a0572c9b79 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Jun 2023 08:48:33 +0200 Subject: [PATCH 032/113] md-bitmap: split file writes into a separate helper Split the file write code out of write_page into a separate helper. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Himanshu Madhani Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230615064840.629492-5-hch@lst.de --- drivers/md/md-bitmap.c | 48 +++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index e4b466522d4e7..46fbcfc9d1fca 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -296,33 +296,22 @@ static void write_sb_page(struct bitmap *bitmap, struct page *page, int wait) } static void md_bitmap_file_kick(struct bitmap *bitmap); -/* - * write out a page to a file - */ -static void write_page(struct bitmap *bitmap, struct page *page, int wait) -{ - struct buffer_head *bh; - if (bitmap->storage.file == NULL) { - write_sb_page(bitmap, page, wait); - } else { - - bh = page_buffers(page); - - while (bh && bh->b_blocknr) { - atomic_inc(&bitmap->pending_writes); - set_buffer_locked(bh); - set_buffer_mapped(bh); - submit_bh(REQ_OP_WRITE | REQ_SYNC, bh); - bh = bh->b_this_page; - } +static void write_file_page(struct bitmap *bitmap, struct page *page, int wait) +{ + struct buffer_head *bh = page_buffers(page); - if (wait) - wait_event(bitmap->write_wait, - atomic_read(&bitmap->pending_writes)==0); + while (bh && bh->b_blocknr) { + atomic_inc(&bitmap->pending_writes); + set_buffer_locked(bh); + set_buffer_mapped(bh); + submit_bh(REQ_OP_WRITE | REQ_SYNC, bh); + bh = bh->b_this_page; } - if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) - md_bitmap_file_kick(bitmap); + + if (wait) + wait_event(bitmap->write_wait, + atomic_read(&bitmap->pending_writes) == 0); } static void end_bitmap_write(struct buffer_head *bh, int uptodate) @@ -429,6 +418,17 @@ static int read_page(struct file *file, unsigned long index, * bitmap file superblock operations */ +/* + * write out a page to a file + */ +static void write_page(struct bitmap *bitmap, struct page *page, int wait) +{ + if (bitmap->storage.file) + write_file_page(bitmap, page, wait); + else + 
write_sb_page(bitmap, page, wait); +} + /* * md_bitmap_wait_writes() should be called before writing any bitmap * blocks, to ensure previous writes, particularly from From d681054c2f67cfc45042c2de25845b06bb89c148 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Jun 2023 08:48:34 +0200 Subject: [PATCH 033/113] md-bitmap: rename read_page to read_file_page Make the difference to read_sb_page clear. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Himanshu Madhani Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230615064840.629492-6-hch@lst.de --- drivers/md/md-bitmap.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 46fbcfc9d1fca..fa0f6ca7b61b0 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -348,10 +348,8 @@ static void free_buffers(struct page *page) * This usage is similar to how swap files are handled, and allows us * to write to a file with no concerns of memory allocation failing. */ -static int read_page(struct file *file, unsigned long index, - struct bitmap *bitmap, - unsigned long count, - struct page *page) +static int read_file_page(struct file *file, unsigned long index, + struct bitmap *bitmap, unsigned long count, struct page *page) { int ret = 0; struct inode *inode = file_inode(file); @@ -632,7 +630,7 @@ static int md_bitmap_read_sb(struct bitmap *bitmap) loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host); int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize; - err = read_page(bitmap->storage.file, 0, + err = read_file_page(bitmap->storage.file, 0, bitmap, bytes, sb_page); } else { err = read_sb_page(bitmap->mddev, @@ -1141,7 +1139,7 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) count = PAGE_SIZE; page = store->filemap[index]; if (file) - ret = read_page(file, index, bitmap, + ret = read_file_page(file, index, bitmap, count, page); else ret = read_sb_page( From 844dc6691ad5f53a624f4b07bf84037abbb8fce2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Jun 2023 08:48:35 +0200 Subject: [PATCH 034/113] md-bitmap: refactor md_bitmap_init_from_disk Split the confusing loop in md_bitmap_init_from_disk that iterates over all chunks but also needs to read and map the pages into three separate loops: one that iterates over the pages to read them, a second optional one to iterate over the pages to mark them invalid if the bitmaps are out of date, and a final one that actually iterates over the chunks. 
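The resulting shape of the function can be outlined as follows; this is a sketch only, the complete version is in the diff below:

  static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
  {
          /* loop 1: read every backing page, from the file or the rdevs */
          for (i = 0; i < store->file_pages; i++)
                  ret = file ? read_file_page(...) : read_sb_page(...);

          /* loop 2, only if the bitmap is stale: dirty every page and
           * write it back so that a full recovery is triggered
           */
          if (outofdate)
                  for (i = 0; i < store->file_pages; i++)
                          /* memset() the page to 0xff and write it out */;

          /* loop 3: walk the chunks and set the in-memory bits */
          for (i = 0; i < chunks; i++)
                  if (/* bit i is set on disk */)
                          md_bitmap_set_memory_bits(...);
  }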
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202306160552.smw0qbmb-lkp@intel.com/ Signed-off-by: Christoph Hellwig Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230615064840.629492-7-hch@lst.de --- drivers/md/md-bitmap.c | 141 ++++++++++++++++++++--------------------- 1 file changed, 70 insertions(+), 71 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index fa0f6ca7b61b0..db5725beaefb6 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1065,33 +1065,31 @@ void md_bitmap_unplug_async(struct bitmap *bitmap) EXPORT_SYMBOL(md_bitmap_unplug_async); static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); -/* * bitmap_init_from_disk -- called at bitmap_create time to initialize - * the in-memory bitmap from the on-disk bitmap -- also, sets up the - * memory mapping of the bitmap file - * Special cases: - * if there's no bitmap file, or if the bitmap file had been - * previously kicked from the array, we mark all the bits as - * 1's in order to cause a full resync. + +/* + * Initialize the in-memory bitmap from the on-disk bitmap and set up the memory + * mapping of the bitmap file. + * + * Special case: If there's no bitmap file, or if the bitmap file had been + * previously kicked from the array, we mark all the bits as 1's in order to + * cause a full resync. * * We ignore all bits for sectors that end earlier than 'start'. - * This is used when reading an out-of-date bitmap... + * This is used when reading an out-of-date bitmap. */ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) { - unsigned long i, chunks, index, oldindex, bit, node_offset = 0; - struct page *page = NULL; - unsigned long bit_cnt = 0; - struct file *file; - unsigned long offset; - int outofdate; - int ret = -ENOSPC; - void *paddr; + bool outofdate = test_bit(BITMAP_STALE, &bitmap->flags); + struct mddev *mddev = bitmap->mddev; + unsigned long chunks = bitmap->counts.chunks; struct bitmap_storage *store = &bitmap->storage; + struct file *file = store->file; + unsigned long node_offset = 0; + unsigned long bit_cnt = 0; + unsigned long i; + int ret; - chunks = bitmap->counts.chunks; - file = store->file; - - if (!file && !bitmap->mddev->bitmap_info.offset) { + if (!file && !mddev->bitmap_info.offset) { /* No permanent bitmap - fill with '1s'. 
*/ store->filemap = NULL; store->file_pages = 0; @@ -1106,77 +1104,79 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) return 0; } - outofdate = test_bit(BITMAP_STALE, &bitmap->flags); - if (outofdate) - pr_warn("%s: bitmap file is out of date, doing full recovery\n", bmname(bitmap)); - if (file && i_size_read(file->f_mapping->host) < store->bytes) { pr_warn("%s: bitmap file too short %lu < %lu\n", bmname(bitmap), (unsigned long) i_size_read(file->f_mapping->host), store->bytes); + ret = -ENOSPC; goto err; } - oldindex = ~0L; - offset = 0; - if (!bitmap->mddev->bitmap_info.external) - offset = sizeof(bitmap_super_t); - - if (mddev_is_clustered(bitmap->mddev)) + if (mddev_is_clustered(mddev)) node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE)); - for (i = 0; i < chunks; i++) { - int b; - index = file_page_index(&bitmap->storage, i); - bit = file_page_offset(&bitmap->storage, i); - if (index != oldindex) { /* this is a new page, read it in */ - int count; - /* unmap the old page, we're done with it */ - if (index == store->file_pages-1) - count = store->bytes - index * PAGE_SIZE; - else - count = PAGE_SIZE; - page = store->filemap[index]; - if (file) - ret = read_file_page(file, index, bitmap, - count, page); - else - ret = read_sb_page( - bitmap->mddev, - bitmap->mddev->bitmap_info.offset, - page, - index + node_offset, count); + for (i = 0; i < store->file_pages; i++) { + struct page *page = store->filemap[i]; + int count; - if (ret) - goto err; + /* unmap the old page, we're done with it */ + if (i == store->file_pages - 1) + count = store->bytes - i * PAGE_SIZE; + else + count = PAGE_SIZE; + + if (file) + ret = read_file_page(file, i, bitmap, count, page); + else + ret = read_sb_page(mddev, mddev->bitmap_info.offset, + page, i + node_offset, count); + if (ret) + goto err; + } - oldindex = index; + if (outofdate) { + pr_warn("%s: bitmap file is out of date, doing full recovery\n", + bmname(bitmap)); - if (outofdate) { - /* - * if bitmap is out of date, dirty the - * whole page and write it out - */ - paddr = kmap_atomic(page); - memset(paddr + offset, 0xff, - PAGE_SIZE - offset); - kunmap_atomic(paddr); - write_page(bitmap, page, 1); + for (i = 0; i < store->file_pages; i++) { + struct page *page = store->filemap[i]; + unsigned long offset = 0; + void *paddr; + + if (i == 0 && !mddev->bitmap_info.external) + offset = sizeof(bitmap_super_t); + /* + * If the bitmap is out of date, dirty the whole page + * and write it out + */ + paddr = kmap_atomic(page); + memset(paddr + offset, 0xff, PAGE_SIZE - offset); + kunmap_atomic(paddr); + + write_page(bitmap, page, 1); + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) { ret = -EIO; - if (test_bit(BITMAP_WRITE_ERROR, - &bitmap->flags)) - goto err; + goto err; } } + } + + for (i = 0; i < chunks; i++) { + struct page *page = filemap_get_page(&bitmap->storage, i); + unsigned long bit = file_page_offset(&bitmap->storage, i); + void *paddr; + bool was_set; + paddr = kmap_atomic(page); if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) - b = test_bit(bit, paddr); + was_set = test_bit(bit, paddr); else - b = test_bit_le(bit, paddr); + was_set = test_bit_le(bit, paddr); kunmap_atomic(paddr); - if (b) { + + if (was_set) { /* if the disk bit is set, set the memory bit */ int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift >= start); @@ -1185,7 +1185,6 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) needed); bit_cnt++; } - offset = 0; } pr_debug("%s: bitmap initialized 
from disk: read %lu pages, set %lu of %lu bits\n", From 0c3ea5cc8fbdc3515cfb0c47f5a284882f5e4d80 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Jun 2023 08:48:36 +0200 Subject: [PATCH 035/113] md-bitmap: cleanup read_sb_page Convert read_sb_page to the normal kernel coding style, calculate the target sector only once, and add a local iosize variable to make the call to sync_page_io more readable. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Himanshu Madhani Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230615064840.629492-8-hch@lst.de --- drivers/md/md-bitmap.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index db5725beaefb6..c6dd1fa5a0bee 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -139,26 +139,25 @@ static void md_bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page */ /* IO operations when bitmap is stored near all superblocks */ + +/* choose a good rdev and read the page from there */ static int read_sb_page(struct mddev *mddev, loff_t offset, - struct page *page, - unsigned long index, int size) + struct page *page, unsigned long index, int size) { - /* choose a good rdev and read the page from there */ + sector_t sector = offset + index * (PAGE_SIZE / SECTOR_SIZE); struct md_rdev *rdev; - sector_t target; rdev_for_each(rdev, mddev) { - if (! test_bit(In_sync, &rdev->flags) - || test_bit(Faulty, &rdev->flags) - || test_bit(Bitmap_sync, &rdev->flags)) - continue; + u32 iosize = roundup(size, bdev_logical_block_size(rdev->bdev)); - target = offset + index * (PAGE_SIZE/512); + if (!test_bit(In_sync, &rdev->flags) || + test_bit(Faulty, &rdev->flags) || + test_bit(Bitmap_sync, &rdev->flags)) + continue; - if (sync_page_io(rdev, target, - roundup(size, bdev_logical_block_size(rdev->bdev)), - page, REQ_OP_READ, true)) { + if (sync_page_io(rdev, sector, iosize, page, REQ_OP_READ, + true)) { page->index = index; return 0; } From f5f2d5ac9f6e807e080311ec36bdf3d6c45b40d4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Jun 2023 08:48:37 +0200 Subject: [PATCH 036/113] md-bitmap: account for mddev->bitmap_info.offset in read_sb_page Directly apply mddev->bitmap_info.offset to the sector number to read instead of doing that in both callers.
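As a worked example of the resulting sector calculation, with illustrative numbers that are not from the patch: assuming 4 KiB pages and 512-byte sectors, PAGE_SIZE / SECTOR_SIZE is 8, so for bitmap_info.offset = 8, offset = 0 and index = 2:

  sector = 8 + 0 + 2 * 8 = 24

relative to rdev->sb_start, since sync_page_io() is invoked here as a metadata operation.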
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Himanshu Madhani Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230615064840.629492-9-hch@lst.de --- drivers/md/md-bitmap.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index c6dd1fa5a0bee..ae1c6f47b9650 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -145,7 +145,8 @@ static int read_sb_page(struct mddev *mddev, loff_t offset, struct page *page, unsigned long index, int size) { - sector_t sector = offset + index * (PAGE_SIZE / SECTOR_SIZE); + sector_t sector = mddev->bitmap_info.offset + offset + + index * (PAGE_SIZE / SECTOR_SIZE); struct md_rdev *rdev; rdev_for_each(rdev, mddev) { @@ -593,7 +594,7 @@ static int md_bitmap_read_sb(struct bitmap *bitmap) unsigned long sectors_reserved = 0; int err = -EINVAL; struct page *sb_page; - loff_t offset = bitmap->mddev->bitmap_info.offset; + loff_t offset = 0; if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) { chunksize = 128 * 1024 * 1024; @@ -620,7 +621,7 @@ static int md_bitmap_read_sb(struct bitmap *bitmap) bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t); /* to 4k blocks */ bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096); - offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3)); + offset = bitmap->cluster_slot * (bm_blocks << 3); pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__, bitmap->cluster_slot, offset); } @@ -632,10 +633,8 @@ static int md_bitmap_read_sb(struct bitmap *bitmap) err = read_file_page(bitmap->storage.file, 0, bitmap, bytes, sb_page); } else { - err = read_sb_page(bitmap->mddev, - offset, - sb_page, - 0, sizeof(bitmap_super_t)); + err = read_sb_page(bitmap->mddev, offset, sb_page, 0, + sizeof(bitmap_super_t)); } if (err) return err; @@ -1128,8 +1127,8 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) if (file) ret = read_file_page(file, i, bitmap, count, page); else - ret = read_sb_page(mddev, mddev->bitmap_info.offset, - page, i + node_offset, count); + ret = read_sb_page(mddev, 0, page, i + node_offset, + count); if (ret) goto err; } From d7038f951828da19fa9aafddfa087b69032c9687 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Jun 2023 08:48:38 +0200 Subject: [PATCH 037/113] md-bitmap: don't use ->index for pages backing the bitmap file The md driver allocates pages for storing the bitmap file data, which are not page cache pages, and then stores the page granularity file offset in page->index, which is a field that isn't really valid except for page cache pages. Use a separate index for the superblock, and use the scheme used on the read side to recalculate the index for the bitmap pages instead.
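The core of the change, condensed from the diff below: page->index belongs to the page cache, so for these driver-allocated pages the write path now receives the page's index explicitly instead of trusting a field that was never valid for them:

  /* before: only correct for page cache pages */
  ps = page->index * PAGE_SIZE / SECTOR_SIZE;

  /* after: the caller knows which bitmap page it is writing */
  static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
                             unsigned long pg_index, struct page *page);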
Signed-off-by: Christoph Hellwig Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230615064840.629492-10-hch@lst.de --- drivers/md/md-bitmap.c | 65 ++++++++++++++++++++++++------------------ drivers/md/md-bitmap.h | 1 + 2 files changed, 39 insertions(+), 27 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index ae1c6f47b9650..a280bfd29f650 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -157,11 +157,8 @@ static int read_sb_page(struct mddev *mddev, loff_t offset, test_bit(Bitmap_sync, &rdev->flags)) continue; - if (sync_page_io(rdev, sector, iosize, page, REQ_OP_READ, - true)) { - page->index = index; + if (sync_page_io(rdev, sector, iosize, page, REQ_OP_READ, true)) return 0; - } } return -EIO; } @@ -225,18 +222,19 @@ static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size, } static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, - struct page *page) + unsigned long pg_index, struct page *page) { struct block_device *bdev; struct mddev *mddev = bitmap->mddev; struct bitmap_storage *store = &bitmap->storage; loff_t sboff, offset = mddev->bitmap_info.offset; - sector_t ps, doff; + sector_t ps = pg_index * PAGE_SIZE / SECTOR_SIZE; unsigned int size = PAGE_SIZE; unsigned int opt_size = PAGE_SIZE; + sector_t doff; bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; - if (page->index == store->file_pages - 1) { + if (pg_index == store->file_pages - 1) { unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1); if (last_page_size == 0) @@ -245,7 +243,6 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, opt_size = optimal_io_size(bdev, last_page_size, size); } - ps = page->index * PAGE_SIZE / SECTOR_SIZE; sboff = rdev->sb_start + offset; doff = rdev->data_offset; @@ -279,7 +276,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, return 0; } -static void write_sb_page(struct bitmap *bitmap, struct page *page, int wait) +static void write_sb_page(struct bitmap *bitmap, unsigned long pg_index, + struct page *page, bool wait) { struct mddev *mddev = bitmap->mddev; @@ -287,7 +285,7 @@ static void write_sb_page(struct bitmap *bitmap, struct page *page, int wait) struct md_rdev *rdev = NULL; while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { - if (__write_sb_page(rdev, bitmap, page) < 0) { + if (__write_sb_page(rdev, bitmap, pg_index, page) < 0) { set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); return; } @@ -397,7 +395,6 @@ static int read_file_page(struct file *file, unsigned long index, blk_cur++; bh = bh->b_this_page; } - page->index = index; wait_event(bitmap->write_wait, atomic_read(&bitmap->pending_writes)==0); @@ -419,12 +416,21 @@ static int read_file_page(struct file *file, unsigned long index, /* * write out a page to a file */ -static void write_page(struct bitmap *bitmap, struct page *page, int wait) +static void filemap_write_page(struct bitmap *bitmap, unsigned long pg_index, + bool wait) { - if (bitmap->storage.file) + struct bitmap_storage *store = &bitmap->storage; + struct page *page = store->filemap[pg_index]; + + if (mddev_is_clustered(bitmap->mddev)) { + pg_index += bitmap->cluster_slot * + DIV_ROUND_UP(store->bytes, PAGE_SIZE); + } + + if (store->file) write_file_page(bitmap, page, wait); else - write_sb_page(bitmap, page, wait); + write_sb_page(bitmap, pg_index, page, wait); } /* @@ -481,7 +487,12 @@ void md_bitmap_update_sb(struct bitmap *bitmap) sb->sectors_reserved = cpu_to_le32(bitmap->mddev-> bitmap_info.space); 
kunmap_atomic(sb); - write_page(bitmap, bitmap->storage.sb_page, 1); + + if (bitmap->storage.file) + write_file_page(bitmap, bitmap->storage.sb_page, 1); + else + write_sb_page(bitmap, bitmap->storage.sb_index, + bitmap->storage.sb_page, 1); } EXPORT_SYMBOL(md_bitmap_update_sb); @@ -533,7 +544,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap) bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (bitmap->storage.sb_page == NULL) return -ENOMEM; - bitmap->storage.sb_page->index = 0; + bitmap->storage.sb_index = 0; sb = kmap_atomic(bitmap->storage.sb_page); @@ -810,7 +821,7 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store, if (store->sb_page) { store->filemap[0] = store->sb_page; pnum = 1; - store->sb_page->index = offset; + store->sb_index = offset; } for ( ; pnum < num_pages; pnum++) { @@ -819,7 +830,6 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store, store->file_pages = pnum; return -ENOMEM; } - store->filemap[pnum]->index = pnum + offset; } store->file_pages = pnum; @@ -924,6 +934,7 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) void *kaddr; unsigned long chunk = block >> bitmap->counts.chunkshift; struct bitmap_storage *store = &bitmap->storage; + unsigned long index = file_page_index(store, chunk); unsigned long node_offset = 0; if (mddev_is_clustered(bitmap->mddev)) @@ -941,9 +952,9 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) else set_bit_le(bit, kaddr); kunmap_atomic(kaddr); - pr_debug("set file bit %lu page %lu\n", bit, page->index); + pr_debug("set file bit %lu page %lu\n", bit, index); /* record page number so it gets flushed to disk when unplug occurs */ - set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_DIRTY); + set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_DIRTY); } static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) @@ -953,6 +964,7 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) void *paddr; unsigned long chunk = block >> bitmap->counts.chunkshift; struct bitmap_storage *store = &bitmap->storage; + unsigned long index = file_page_index(store, chunk); unsigned long node_offset = 0; if (mddev_is_clustered(bitmap->mddev)) @@ -968,8 +980,8 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) else clear_bit_le(bit, paddr); kunmap_atomic(paddr); - if (!test_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_NEEDWRITE)) { - set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_PENDING); + if (!test_page_attr(bitmap, index - node_offset, BITMAP_PAGE_NEEDWRITE)) { + set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_PENDING); bitmap->allclean = 0; } } @@ -1021,7 +1033,7 @@ void md_bitmap_unplug(struct bitmap *bitmap) "md bitmap_unplug"); } clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); - write_page(bitmap, bitmap->storage.filemap[i], 0); + filemap_write_page(bitmap, i, false); writing = 1; } } @@ -1153,7 +1165,7 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) memset(paddr + offset, 0xff, PAGE_SIZE - offset); kunmap_atomic(paddr); - write_page(bitmap, page, 1); + filemap_write_page(bitmap, i, true); if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) { ret = -EIO; goto err; @@ -1374,9 +1386,8 @@ void md_bitmap_daemon_work(struct mddev *mddev) break; if (bitmap->storage.filemap && test_and_clear_page_attr(bitmap, j, - BITMAP_PAGE_NEEDWRITE)) { - write_page(bitmap, bitmap->storage.filemap[j], 0); - } + 
BITMAP_PAGE_NEEDWRITE)) + filemap_write_page(bitmap, j, false); } done: diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 8a3788c9bfef8..bb9eb418780a6 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -201,6 +201,7 @@ struct bitmap { struct file *file; /* backing disk file */ struct page *sb_page; /* cached copy of the bitmap * file superblock */ + unsigned long sb_index; struct page **filemap; /* list of cache pages for * the file */ unsigned long *filemap_attr; /* attributes associated From a34d4ef82c3c4bd8bda817e9fb53ef37c5595ddd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Jun 2023 08:48:39 +0200 Subject: [PATCH 038/113] md: make bitmap file support optional The support for write intent bitmaps in files on an external file system in md is a hot mess that abuses ->bmap to map file offsets into physical device objects, and also abuses buffer_heads in a creative way. Make this code optional so that MD can be built into future kernels without buffer_head support, and so that we can eventually deprecate it. Note this does not affect the internal bitmap support, which has none of the problems. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Himanshu Madhani Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230615064840.629492-11-hch@lst.de --- drivers/md/Kconfig | 10 ++++++++++ drivers/md/md-bitmap.c | 15 +++++++++++++++ drivers/md/md.c | 7 +++++++ 3 files changed, 32 insertions(+) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index b0a22e99bade3..9712ab9bcba52 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -50,6 +50,16 @@ config MD_AUTODETECT If unsure, say Y. +config MD_BITMAP_FILE + bool "MD bitmap file support" + default y + help + If you say Y here, support for write intent bitmaps in files on an + external file system is enabled. This is an alternative to the internal + bitmaps near the MD superblock, and very problematic code that abuses + various kernel APIs and can only work with files on a file system not + actually sitting on the MD device.
+ config MD_LINEAR tristate "Linear (append) mode (deprecated)" depends on BLK_DEV_MD diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index a280bfd29f650..a58a4c30265e6 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -295,6 +295,7 @@ static void write_sb_page(struct bitmap *bitmap, unsigned long pg_index, static void md_bitmap_file_kick(struct bitmap *bitmap); +#ifdef CONFIG_MD_BITMAP_FILE static void write_file_page(struct bitmap *bitmap, struct page *page, int wait) { struct buffer_head *bh = page_buffers(page); @@ -408,6 +409,20 @@ static int read_file_page(struct file *file, unsigned long index, ret); return ret; } +#else /* CONFIG_MD_BITMAP_FILE */ +static void write_file_page(struct bitmap *bitmap, struct page *page, int wait) +{ +} +static int read_file_page(struct file *file, unsigned long index, + struct bitmap *bitmap, unsigned long count, struct page *page) +{ + return -EIO; +} +static void free_buffers(struct page *page) +{ + put_page(page); +} +#endif /* CONFIG_MD_BITMAP_FILE */ /* * bitmap file superblock operations diff --git a/drivers/md/md.c b/drivers/md/md.c index 320d71537359a..f46996a95b0cf 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7043,6 +7043,13 @@ static int set_bitmap_file(struct mddev *mddev, int fd) if (mddev->bitmap || mddev->bitmap_info.file) return -EEXIST; /* cannot add when bitmap is present */ + + if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) { + pr_warn("%s: bitmap files not supported by this kernel\n", + mdname(mddev)); + return -EINVAL; + } + f = fget(fd); if (f == NULL) { From 0ae1c9d38426737c39085f919b9b27d2eab3802e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Jun 2023 08:48:40 +0200 Subject: [PATCH 039/113] md: deprecate bitmap file support The support for bitmaps on files is a very bad idea abusing various kernel APIs, and fundamentally requires the file to not be on the actual array, without a way to check that this is actually the case. Add a deprecation warning to see if we might be able to eventually drop it. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Himanshu Madhani Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230615064840.629492-12-hch@lst.de --- drivers/md/Kconfig | 2 +- drivers/md/md.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 9712ab9bcba52..444517d1a2336 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -51,7 +51,7 @@ config MD_AUTODETECT If unsure, say Y. config MD_BITMAP_FILE - bool "MD bitmap file support" + bool "MD bitmap file support (deprecated)" default y help If you say Y here, support for write intent bitmaps in files on an diff --git a/drivers/md/md.c b/drivers/md/md.c index f46996a95b0cf..f8774b1ef0aab 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7049,6 +7049,8 @@ static int set_bitmap_file(struct mddev *mddev, int fd) mdname(mddev)); return -EINVAL; } + pr_warn("%s: using deprecated bitmap file support\n", + mdname(mddev)); f = fget(fd); From c567c86b90d4715081adfe5eb812141a5b6b4883 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 22 Jun 2023 00:51:03 +0800 Subject: [PATCH 040/113] md: move initialization and destruction of 'io_acct_set' to md.c 'io_acct_set' is only used for raid0 and raid456; prepare to use it for raid1 and raid10, so that io accounting from different levels can be consistent.
By the way, follow-up patches will also use this io clone mechanism to make sure 'active_io' represents inflight io, not io that is dispatching, so that mddev_suspend will wait for io to be done as designed. Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230621165110.1498313-2-yukuai1@huaweicloud.com --- drivers/md/md.c | 27 ++++++++++----------------- drivers/md/md.h | 2 -- drivers/md/raid0.c | 16 ++-------------- drivers/md/raid5.c | 41 +++++++++++------------------------------ 4 files changed, 23 insertions(+), 63 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index f8774b1ef0aab..1a0844250b9b0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5876,6 +5876,13 @@ int md_run(struct mddev *mddev) goto exit_bio_set; } + if (!bioset_initialized(&mddev->io_acct_set)) { + err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE, + offsetof(struct md_io_acct, bio_clone), 0); + if (err) + goto exit_sync_set; + } + spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); if (!pers || !try_module_get(pers->owner)) { @@ -6053,6 +6060,8 @@ int md_run(struct mddev *mddev) module_put(pers->owner); md_bitmap_destroy(mddev); abort: + bioset_exit(&mddev->io_acct_set); +exit_sync_set: bioset_exit(&mddev->sync_set); exit_bio_set: bioset_exit(&mddev->bio_set); @@ -6276,6 +6285,7 @@ static void __md_stop(struct mddev *mddev) percpu_ref_exit(&mddev->active_io); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); + bioset_exit(&mddev->io_acct_set); } void md_stop(struct mddev *mddev) @@ -8641,23 +8651,6 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, } EXPORT_SYMBOL_GPL(md_submit_discard_bio); -int acct_bioset_init(struct mddev *mddev) -{ - int err = 0; - - if (!bioset_initialized(&mddev->io_acct_set)) - err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE, - offsetof(struct md_io_acct, bio_clone), 0); - return err; -} -EXPORT_SYMBOL_GPL(acct_bioset_init); - -void acct_bioset_exit(struct mddev *mddev) -{ - bioset_exit(&mddev->io_acct_set); -} -EXPORT_SYMBOL_GPL(acct_bioset_exit); - static void md_end_io_acct(struct bio *bio) { struct md_io_acct *md_io_acct = bio->bi_private; diff --git a/drivers/md/md.h b/drivers/md/md.h index 914e6ece9af29..4d771e5d3c717 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -774,8 +774,6 @@ extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_finish_reshape(struct mddev *mddev); void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, struct bio *bio, sector_t start, sector_t size); -int acct_bioset_init(struct mddev *mddev); -void acct_bioset_exit(struct mddev *mddev); void md_account_bio(struct mddev *mddev, struct bio **bio); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index d1ac73fcd8529..4106d943aae75 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -377,7 +377,6 @@ static void raid0_free(struct mddev *mddev, void *priv) struct r0conf *conf = priv; free_conf(mddev, conf); - acct_bioset_exit(mddev); } static int raid0_run(struct mddev *mddev) @@ -392,16 +391,11 @@ static int raid0_run(struct mddev *mddev) if (md_check_no_bitmap(mddev)) return -EINVAL; - if (acct_bioset_init(mddev)) { - pr_err("md/raid0:%s: alloc acct bioset failed.\n", mdname(mddev)); - return -ENOMEM; - } - /* if private is not null, we are here after takeover */ if (mddev->private == NULL) { ret = create_strip_zones(mddev, &conf); if (ret < 0) - goto
exit_acct_set; + return ret; mddev->private = conf; } conf = mddev->private; @@ -432,15 +426,9 @@ static int raid0_run(struct mddev *mddev) ret = md_integrity_register(mddev); if (ret) - goto free; + free_conf(mddev, conf); return ret; - -free: - free_conf(mddev, conf); -exit_acct_set: - acct_bioset_exit(mddev); - return ret; } /* diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 85b3004594e03..db3cec2282371 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7787,19 +7787,12 @@ static int raid5_run(struct mddev *mddev) struct md_rdev *rdev; struct md_rdev *journal_dev = NULL; sector_t reshape_offset = 0; - int i, ret = 0; + int i; long long min_offset_diff = 0; int first = 1; - if (acct_bioset_init(mddev)) { - pr_err("md/raid456:%s: alloc acct bioset failed.\n", mdname(mddev)); + if (mddev_init_writes_pending(mddev) < 0) return -ENOMEM; - } - - if (mddev_init_writes_pending(mddev) < 0) { - ret = -ENOMEM; - goto exit_acct_set; - } if (mddev->recovery_cp != MaxSector) pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", @@ -7830,8 +7823,7 @@ static int raid5_run(struct mddev *mddev) (mddev->bitmap_info.offset || mddev->bitmap_info.file)) { pr_notice("md/raid:%s: array cannot have both journal and bitmap\n", mdname(mddev)); - ret = -EINVAL; - goto exit_acct_set; + return -EINVAL; } if (mddev->reshape_position != MaxSector) { @@ -7856,15 +7848,13 @@ static int raid5_run(struct mddev *mddev) if (journal_dev) { pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n", mdname(mddev)); - ret = -EINVAL; - goto exit_acct_set; + return -EINVAL; } if (mddev->new_level != mddev->level) { pr_warn("md/raid:%s: unsupported reshape required - aborting.\n", mdname(mddev)); - ret = -EINVAL; - goto exit_acct_set; + return -EINVAL; } old_disks = mddev->raid_disks - mddev->delta_disks; /* reshape_position must be on a new-stripe boundary, and one @@ -7880,8 +7870,7 @@ static int raid5_run(struct mddev *mddev) if (sector_div(here_new, chunk_sectors * new_data_disks)) { pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n", mdname(mddev)); - ret = -EINVAL; - goto exit_acct_set; + return -EINVAL; } reshape_offset = here_new * chunk_sectors; /* here_new is the stripe we will write to */ @@ -7903,8 +7892,7 @@ static int raid5_run(struct mddev *mddev) else if (mddev->ro == 0) { pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n", mdname(mddev)); - ret = -EINVAL; - goto exit_acct_set; + return -EINVAL; } } else if (mddev->reshape_backwards ? 
(here_new * chunk_sectors + min_offset_diff <= @@ -7914,8 +7902,7 @@ static int raid5_run(struct mddev *mddev) /* Reading from the same stripe as writing to - bad */ pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n", mdname(mddev)); - ret = -EINVAL; - goto exit_acct_set; + return -EINVAL; } pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev)); /* OK, we should be able to continue; */ @@ -7939,10 +7926,8 @@ static int raid5_run(struct mddev *mddev) else conf = mddev->private; - if (IS_ERR(conf)) { - ret = PTR_ERR(conf); - goto exit_acct_set; - } + if (IS_ERR(conf)) + return PTR_ERR(conf); if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { if (!journal_dev) { @@ -8140,10 +8125,7 @@ static int raid5_run(struct mddev *mddev) free_conf(conf); mddev->private = NULL; pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); - ret = -EIO; -exit_acct_set: - acct_bioset_exit(mddev); - return ret; + return -EIO; } static void raid5_free(struct mddev *mddev, void *priv) @@ -8151,7 +8133,6 @@ static void raid5_free(struct mddev *mddev, void *priv) struct r5conf *conf = priv; free_conf(conf); - acct_bioset_exit(mddev); mddev->to_remove = &raid5_attrs_group; } From c687297b884507a4737b747957eda567063901df Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 22 Jun 2023 00:51:04 +0800 Subject: [PATCH 041/113] md: also clone new io if io accounting is disabled Currently, 'active_io' is grabbed before make_request() is called, and it's dropped immediately after make_request() returns. Hence 'active_io' actually means io is dispatching, not that io is inflight. For raid0 and raid456, where io accounting is enabled, 'active_io' will also be grabbed when the bio is cloned for io accounting, and this 'active_io' is not dropped until the io is done. Always clone a new bio so that 'active_io' will mean that io is inflight; raid1 and raid10 will switch to this method in later patches. Now that the bio will be cloned even if io accounting is disabled, also rename the related structures from '*_acct_*' to '*_clone_*'.
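Sketching the intended 'active_io' lifetime after this change, condensed from the code below:

  /* submission: md_account_bio() now always takes a reference and
   * clones the bio, whether or not io stats are enabled
   */
  percpu_ref_get(&mddev->active_io);
  md_clone_bio(mddev, &bio);   /* start_time recorded only if stats on */

  /* completion: the clone's end_io drops the reference, i.e. only
   * once the io is actually done
   */
  static void md_end_clone_io(struct bio *bio)
  {
          ...
          bio_endio(orig_bio);
          percpu_ref_put(&mddev->active_io);
  }

This is what allows mddev_suspend(), which waits for 'active_io' to drain, to reliably wait for inflight io rather than merely for submission to finish.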
Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230621165110.1498313-3-yukuai1@huaweicloud.com --- drivers/md/md.c | 63 +++++++++++++++++++++++----------------------- drivers/md/md.h | 4 +-- drivers/md/raid5.c | 18 ++++++------- 3 files changed, 42 insertions(+), 43 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 1a0844250b9b0..abb6167203938 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2306,7 +2306,7 @@ int md_integrity_register(struct mddev *mddev) pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || (mddev->level != 1 && mddev->level != 10 && - bioset_integrity_create(&mddev->io_acct_set, BIO_POOL_SIZE))) { + bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) { /* * No need to handle the failure of bioset_integrity_create, * because the function is called by md_run() -> pers->run(), @@ -5876,9 +5876,9 @@ int md_run(struct mddev *mddev) goto exit_bio_set; } - if (!bioset_initialized(&mddev->io_acct_set)) { - err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE, - offsetof(struct md_io_acct, bio_clone), 0); + if (!bioset_initialized(&mddev->io_clone_set)) { + err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, + offsetof(struct md_io_clone, bio_clone), 0); if (err) goto exit_sync_set; } @@ -6060,7 +6060,7 @@ int md_run(struct mddev *mddev) module_put(pers->owner); md_bitmap_destroy(mddev); abort: - bioset_exit(&mddev->io_acct_set); + bioset_exit(&mddev->io_clone_set); exit_sync_set: bioset_exit(&mddev->sync_set); exit_bio_set: @@ -6285,7 +6285,7 @@ static void __md_stop(struct mddev *mddev) percpu_ref_exit(&mddev->active_io); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); - bioset_exit(&mddev->io_acct_set); + bioset_exit(&mddev->io_clone_set); } void md_stop(struct mddev *mddev) @@ -8651,45 +8651,44 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, } EXPORT_SYMBOL_GPL(md_submit_discard_bio); -static void md_end_io_acct(struct bio *bio) +static void md_end_clone_io(struct bio *bio) { - struct md_io_acct *md_io_acct = bio->bi_private; - struct bio *orig_bio = md_io_acct->orig_bio; - struct mddev *mddev = md_io_acct->mddev; + struct md_io_clone *md_io_clone = bio->bi_private; + struct bio *orig_bio = md_io_clone->orig_bio; + struct mddev *mddev = md_io_clone->mddev; orig_bio->bi_status = bio->bi_status; - bio_end_io_acct(orig_bio, md_io_acct->start_time); + if (md_io_clone->start_time) + bio_end_io_acct(orig_bio, md_io_clone->start_time); + bio_put(bio); bio_endio(orig_bio); - percpu_ref_put(&mddev->active_io); } -/* - * Used by personalities that don't already clone the bio and thus can't - * easily add the timestamp to their extended bio structure. 
- */ -void md_account_bio(struct mddev *mddev, struct bio **bio) +static void md_clone_bio(struct mddev *mddev, struct bio **bio) { struct block_device *bdev = (*bio)->bi_bdev; - struct md_io_acct *md_io_acct; - struct bio *clone; - - if (!blk_queue_io_stat(bdev->bd_disk->queue)) - return; + struct md_io_clone *md_io_clone; + struct bio *clone = + bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set); + + md_io_clone = container_of(clone, struct md_io_clone, bio_clone); + md_io_clone->orig_bio = *bio; + md_io_clone->mddev = mddev; + if (blk_queue_io_stat(bdev->bd_disk->queue)) + md_io_clone->start_time = bio_start_io_acct(*bio); + + clone->bi_end_io = md_end_clone_io; + clone->bi_private = md_io_clone; + *bio = clone; +} +void md_account_bio(struct mddev *mddev, struct bio **bio) +{ percpu_ref_get(&mddev->active_io); - - clone = bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_acct_set); - md_io_acct = container_of(clone, struct md_io_acct, bio_clone); - md_io_acct->orig_bio = *bio; - md_io_acct->start_time = bio_start_io_acct(*bio); - md_io_acct->mddev = mddev; - - clone->bi_end_io = md_end_io_acct; - clone->bi_private = md_io_acct; - *bio = clone; + md_clone_bio(mddev, bio); } EXPORT_SYMBOL_GPL(md_account_bio); diff --git a/drivers/md/md.h b/drivers/md/md.h index 4d771e5d3c717..8ae9574809763 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -510,7 +510,7 @@ struct mddev { struct bio_set sync_set; /* for sync operations like * metadata and bitmap writes */ - struct bio_set io_acct_set; /* for raid0 and raid5 io accounting */ + struct bio_set io_clone_set; /* Generic flush handling. * The last to finish preflush schedules a worker to submit @@ -736,7 +736,7 @@ struct md_thread { void *private; }; -struct md_io_acct { +struct md_io_clone { struct mddev *mddev; struct bio *orig_bio; unsigned long start_time; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index db3cec2282371..1da9dd3e2f18f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5468,13 +5468,13 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf, */ static void raid5_align_endio(struct bio *bi) { - struct md_io_acct *md_io_acct = bi->bi_private; - struct bio *raid_bi = md_io_acct->orig_bio; + struct md_io_clone *md_io_clone = bi->bi_private; + struct bio *raid_bi = md_io_clone->orig_bio; struct mddev *mddev; struct r5conf *conf; struct md_rdev *rdev; blk_status_t error = bi->bi_status; - unsigned long start_time = md_io_acct->start_time; + unsigned long start_time = md_io_clone->start_time; bio_put(bi); @@ -5506,7 +5506,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) struct md_rdev *rdev; sector_t sector, end_sector, first_bad; int bad_sectors, dd_idx; - struct md_io_acct *md_io_acct; + struct md_io_clone *md_io_clone; bool did_inc; if (!in_chunk_boundary(mddev, raid_bio)) { @@ -5544,15 +5544,15 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) } align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO, - &mddev->io_acct_set); - md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone); + &mddev->io_clone_set); + md_io_clone = container_of(align_bio, struct md_io_clone, bio_clone); raid_bio->bi_next = (void *)rdev; if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue)) - md_io_acct->start_time = bio_start_io_acct(raid_bio); - md_io_acct->orig_bio = raid_bio; + md_io_clone->start_time = bio_start_io_acct(raid_bio); + md_io_clone->orig_bio = raid_bio; align_bio->bi_end_io = raid5_align_endio; - align_bio->bi_private = 
md_io_acct; + align_bio->bi_private = md_io_clone; align_bio->bi_iter.bi_sector = sector; /* No reshape active, so we can trust rdev->data_offset */ From 05048cbccab79e9fb9030274170ccd710fe69474 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 22 Jun 2023 00:51:05 +0800 Subject: [PATCH 042/113] raid5: fix missing io accounting in raid5_align_endio() Io is only accounted as done from raid5_align_endio() if the io succeeded, so the io inflight counter is leaked if such an io fails. Fix this problem by switching to use md_account_bio() for io accounting. Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230621165110.1498313-4-yukuai1@huaweicloud.com --- drivers/md/raid5.c | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1da9dd3e2f18f..32a87193bad73 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5468,26 +5468,17 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf, */ static void raid5_align_endio(struct bio *bi) { - struct md_io_clone *md_io_clone = bi->bi_private; - struct bio *raid_bi = md_io_clone->orig_bio; - struct mddev *mddev; - struct r5conf *conf; - struct md_rdev *rdev; + struct bio *raid_bi = bi->bi_private; + struct md_rdev *rdev = (void *)raid_bi->bi_next; + struct mddev *mddev = rdev->mddev; + struct r5conf *conf = mddev->private; blk_status_t error = bi->bi_status; - unsigned long start_time = md_io_clone->start_time; bio_put(bi); - - rdev = (void*)raid_bi->bi_next; raid_bi->bi_next = NULL; - mddev = rdev->mddev; - conf = mddev->private; - rdev_dec_pending(rdev, conf->mddev); if (!error) { - if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue)) - bio_end_io_acct(raid_bi, start_time); bio_endio(raid_bi); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_quiescent); @@ -5506,7 +5497,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) struct md_rdev *rdev; sector_t sector, end_sector, first_bad; int bad_sectors, dd_idx; - struct md_io_clone *md_io_clone; bool did_inc; if (!in_chunk_boundary(mddev, raid_bio)) { @@ -5543,16 +5533,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) return 0; } - align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO, - &mddev->io_clone_set); - md_io_clone = container_of(align_bio, struct md_io_clone, bio_clone); + md_account_bio(mddev, &raid_bio); raid_bio->bi_next = (void *)rdev; - if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue)) - md_io_clone->start_time = bio_start_io_acct(raid_bio); - md_io_clone->orig_bio = raid_bio; + align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO, + &mddev->bio_set); align_bio->bi_end_io = raid5_align_endio; - align_bio->bi_private = raid_bio; align_bio->bi_iter.bi_sector = sector; /* No reshape active, so we can trust rdev->data_offset */ From bb2a9acefaf9ce5bbc1e70f407e34599233d0243 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 22 Jun 2023 00:51:06 +0800 Subject: [PATCH 043/113] md/raid1: switch to use md_account_bio() for io accounting Two problems can be fixed this way: 1) 'active_io' will represent inflight io instead of io that is dispatching. 2) If io accounting is enabled or disabled while io is still inflight, bio_start_io_acct() and bio_end_io_acct() are not balanced and the io inflight counter will be leaked.
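The clone-based pattern that makes this safe is worth spelling out. A minimal sketch of the completion side, mirroring md_end_clone_io() from the previous patch: whether to account is decided once at submission time and latched in start_time, so success and failure both funnel through one callback that stays balanced and always drops the inflight reference.

	static void clone_end_io(struct bio *clone)
	{
		struct md_io_clone *mc = clone->bi_private;
		struct bio *orig = mc->orig_bio;
		struct mddev *mddev = mc->mddev;

		orig->bi_status = clone->bi_status;
		if (mc->start_time)		/* latched at submission */
			bio_end_io_acct(orig, mc->start_time);
		bio_put(clone);
		bio_endio(orig);
		percpu_ref_put(&mddev->active_io);	/* inflight reference */
	}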
Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230621165110.1498313-5-yukuai1@huaweicloud.com --- drivers/md/raid1.c | 14 ++++++-------- drivers/md/raid1.h | 1 - 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index dd25832eb0452..06fa1580501f0 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -304,8 +304,6 @@ static void call_bio_endio(struct r1bio *r1_bio) if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) bio->bi_status = BLK_STS_IOERR; - if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) - bio_end_io_acct(bio, r1_bio->start_time); bio_endio(bio); } @@ -1303,10 +1301,10 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, } r1_bio->read_disk = rdisk; - - if (!r1bio_existed && blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) - r1_bio->start_time = bio_start_io_acct(bio); - + if (!r1bio_existed) { + md_account_bio(mddev, &bio); + r1_bio->master_bio = bio; + } read_bio = bio_alloc_clone(mirror->rdev->bdev, bio, gfp, &mddev->bio_set); @@ -1500,8 +1498,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, r1_bio->sectors = max_sectors; } - if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) - r1_bio->start_time = bio_start_io_acct(bio); + md_account_bio(mddev, &bio); + r1_bio->master_bio = bio; atomic_set(&r1_bio->remaining, 1); atomic_set(&r1_bio->behind_remaining, 0); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 468f189da7a05..14d4211a123a8 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -157,7 +157,6 @@ struct r1bio { sector_t sector; int sectors; unsigned long state; - unsigned long start_time; struct mddev *mddev; /* * original bio going to /dev/mdx From 820455238366a78a44a85cc7d58a987e728464d9 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 22 Jun 2023 00:51:07 +0800 Subject: [PATCH 044/113] md/raid10: switch to use md_account_bio() for io accounting Make sure that 'active_io' will represent inflight io instead of io that is dispatching, and io accounting from all levels will be consistent. 
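One subtlety in the raid10 conversion below: md_account_bio() swaps the caller's bio pointer for the accounting clone, so the call site must refresh master_bio afterwards, and raid10_read_request() grows an io_accounting flag so the re-submission from handle_read_error() cannot account the same bio twice. A sketch of the call-site pattern, with names taken from the diff:

	if (io_accounting) {
		md_account_bio(mddev, &bio);	/* may replace 'bio' with the clone */
		r10_bio->master_bio = bio;	/* must now point at the clone */
	}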
Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230621165110.1498313-6-yukuai1@huaweicloud.com --- drivers/md/raid10.c | 20 +++++++++----------- drivers/md/raid10.h | 1 - 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5051149e27bbe..d42e9b7d26083 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -325,8 +325,6 @@ static void raid_end_bio_io(struct r10bio *r10_bio) if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) bio->bi_status = BLK_STS_IOERR; - if (r10_bio->start_time) - bio_end_io_acct(bio, r10_bio->start_time); bio_endio(bio); /* * Wake up any possible resync thread that waits for the device @@ -1172,7 +1170,7 @@ static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf, } static void raid10_read_request(struct mddev *mddev, struct bio *bio, - struct r10bio *r10_bio) + struct r10bio *r10_bio, bool io_accounting) { struct r10conf *conf = mddev->private; struct bio *read_bio; @@ -1243,9 +1241,10 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, } slot = r10_bio->read_slot; - if (!r10_bio->start_time && - blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) - r10_bio->start_time = bio_start_io_acct(bio); + if (io_accounting) { + md_account_bio(mddev, &bio); + r10_bio->master_bio = bio; + } read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set); r10_bio->devs[slot].bio = read_bio; @@ -1543,8 +1542,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, r10_bio->master_bio = bio; } - if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) - r10_bio->start_time = bio_start_io_acct(bio); + md_account_bio(mddev, &bio); + r10_bio->master_bio = bio; atomic_set(&r10_bio->remaining, 1); md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); @@ -1571,12 +1570,11 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) r10_bio->sector = bio->bi_iter.bi_sector; r10_bio->state = 0; r10_bio->read_slot = -1; - r10_bio->start_time = 0; memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->geo.raid_disks); if (bio_data_dir(bio) == READ) - raid10_read_request(mddev, bio, r10_bio); + raid10_read_request(mddev, bio, r10_bio, true); else raid10_write_request(mddev, bio, r10_bio); } @@ -2985,7 +2983,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) rdev_dec_pending(rdev, mddev); r10_bio->state = 0; - raid10_read_request(mddev, r10_bio->master_bio, r10_bio); + raid10_read_request(mddev, r10_bio->master_bio, r10_bio, false); /* * allow_barrier after re-submit to ensure no sync io * can be issued while regular io pending. diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 63e48b11b5529..2e75e88d08023 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -123,7 +123,6 @@ struct r10bio { sector_t sector; /* virtual sector number */ int sectors; unsigned long state; - unsigned long start_time; struct mddev *mddev; /* * original bio going to /dev/mdx From bdf2b52136dd19a55aaf5484cb254498c61383a5 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 22 Jun 2023 00:51:08 +0800 Subject: [PATCH 045/113] md/md-multipath: enable io accounting use md_account_bio() to enable io accounting, also make sure mddev_suspend() will wait for all io to be done. 
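A simplified sketch of why this single call covers both points (condensed from drivers/md/md.c, not the verbatim code): each accounted bio pins 'active_io', and the suspend path kills the reference and waits for it to drain.

	percpu_ref_get(&mddev->active_io);	/* md_account_bio(), per bio */
	/* ... io completes, the clone's end_io drops the reference ... */
	percpu_ref_kill(&mddev->active_io);	/* mddev_suspend(): no new io */
	wait_event(mddev->sb_wait,
		   percpu_ref_is_zero(&mddev->active_io));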
Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230621165110.1498313-7-yukuai1@huaweicloud.com --- drivers/md/md-multipath.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 92c45be203d7e..d22276870283d 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -107,6 +107,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio) && md_flush_request(mddev, bio)) return true; + md_account_bio(mddev, &bio); mp_bh = mempool_alloc(&conf->pool, GFP_NOIO); mp_bh->master_bio = bio; From 09f43cb530b03e4d58b35a39e54de658fc8d09b7 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 22 Jun 2023 00:51:09 +0800 Subject: [PATCH 046/113] md/md-linear: enable io accounting use md_account_bio() to enable io accounting, also make sure mddev_suspend() will wait for all io to be done. Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230621165110.1498313-8-yukuai1@huaweicloud.com --- drivers/md/md-linear.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 4eb72b9dd9336..71ac99646827b 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -238,6 +238,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) bio = split; } + md_account_bio(mddev, &bio); bio_set_dev(bio, tmp_dev->rdev->bdev); bio->bi_iter.bi_sector = bio->bi_iter.bi_sector - start_sector + data_offset; From dd9a68601409d905810a936a7c4e1241b604013f Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 22 Jun 2023 00:51:10 +0800 Subject: [PATCH 047/113] md/md-faulty: enable io accounting use md_account_bio() to enable io accounting, also make sure mddev_suspend() will wait for all io to be done. Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230621165110.1498313-9-yukuai1@huaweicloud.com --- drivers/md/md-faulty.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c index 50ad818978a43..a039e8e20f55a 100644 --- a/drivers/md/md-faulty.c +++ b/drivers/md/md-faulty.c @@ -204,6 +204,8 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio) failit = 1; } } + + md_account_bio(mddev, &bio); if (failit) { struct bio *b = bio_alloc_clone(conf->rdev->bdev, bio, GFP_NOIO, &mddev->bio_set); From ffb1e7a03f966065323b18c96da23a2118a19529 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Tue, 27 Jun 2023 09:43:32 +0800 Subject: [PATCH 048/113] md/raid1: prioritize adding disk to 'removed' mirror A new disk should be added to a "removed" position first instead of becoming a replacement. Commit 6090368abcb4 ("md/raid10: prioritize adding disk to 'removed' mirror") has fixed this issue for raid10. Fix it for raid1 now.
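A sketch of the selection order the raid1 fix establishes (the helpers here are illustrative stand-ins, not raid1 functions): take a fully removed slot as soon as one is seen, and only remember the first replacement-capable slot as a fallback.

	int repl_slot = -1;

	for (mirror = first; mirror <= last; mirror++) {
		if (!mirrors[mirror].rdev)
			return add_as_new_disk(mirror);	/* preferred */
		if (wants_replacement(&mirrors[mirror]) && repl_slot < 0)
			repl_slot = mirror;		/* remember only */
	}
	if (repl_slot >= 0)
		return add_as_replacement(repl_slot);	/* fallback */
	return -EEXIST;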
Signed-off-by: Li Nan Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20230627014332.3810102-1-linan666@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid1.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 06fa1580501f0..f834d99a36f6d 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1764,7 +1764,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) { struct r1conf *conf = mddev->private; int err = -EEXIST; - int mirror = 0; + int mirror = 0, repl_slot = -1; struct raid1_info *p; int first = 0; int last = conf->raid_disks - 1; @@ -1807,17 +1807,21 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) break; } if (test_bit(WantReplacement, &p->rdev->flags) && - p[conf->raid_disks].rdev == NULL) { - /* Add this device as a replacement */ - clear_bit(In_sync, &rdev->flags); - set_bit(Replacement, &rdev->flags); - rdev->raid_disk = mirror; - err = 0; - conf->fullsync = 1; - rcu_assign_pointer(p[conf->raid_disks].rdev, rdev); - break; - } + p[conf->raid_disks].rdev == NULL && repl_slot < 0) + repl_slot = mirror; } + + if (err && repl_slot >= 0) { + /* Add this device as a replacement */ + p = conf->mirrors + repl_slot; + clear_bit(In_sync, &rdev->flags); + set_bit(Replacement, &rdev->flags); + rdev->raid_disk = repl_slot; + err = 0; + conf->fullsync = 1; + rcu_assign_pointer(p[conf->raid_disks].rdev, rdev); + } + print_conf(conf); return err; } From 605eeda6e70f692311b36180f217208d367476f6 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Sat, 24 Jun 2023 01:32:34 +0800 Subject: [PATCH 049/113] md/raid10: optimize fix_read_error We dereference r10_bio->read_slot too many times in fix_read_error(). Optimize it by using a variable to store read_slot. Signed-off-by: Li Nan Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20230623173236.2513554-2-linan666@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid10.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index d42e9b7d26083..abea91a54db1d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2718,10 +2718,10 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio) { int sect = 0; /* Offset from r10_bio->sector */ - int sectors = r10_bio->sectors; + int sectors = r10_bio->sectors, slot = r10_bio->read_slot; struct md_rdev *rdev; int max_read_errors = atomic_read(&mddev->max_corr_read_errors); - int d = r10_bio->devs[r10_bio->read_slot].devnum; + int d = r10_bio->devs[slot].devnum; /* still own a reference to this rdev, so it cannot * have been cleared recently. 
@@ -2742,13 +2742,13 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 pr_notice("md/raid10:%s: %pg: Failing raid device\n", mdname(mddev), rdev->bdev); md_error(mddev, rdev); - r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; + r10_bio->devs[slot].bio = IO_BLOCKED; return; } while(sectors) { int s = sectors; - int sl = r10_bio->read_slot; + int sl = slot; int success = 0; int start; @@ -2783,7 +2783,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 sl++; if (sl == conf->copies) sl = 0; - } while (!success && sl != r10_bio->read_slot); + } while (!success && sl != slot); rcu_read_unlock(); if (!success) { @@ -2791,16 +2791,16 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 * as bad on the first device to discourage future * reads. */ - int dn = r10_bio->devs[r10_bio->read_slot].devnum; + int dn = r10_bio->devs[slot].devnum; rdev = conf->mirrors[dn].rdev; if (!rdev_set_badblocks( rdev, - r10_bio->devs[r10_bio->read_slot].addr + r10_bio->devs[slot].addr + sect, s, 0)) { md_error(mddev, rdev); - r10_bio->devs[r10_bio->read_slot].bio + r10_bio->devs[slot].bio = IO_BLOCKED; } break; @@ -2809,7 +2809,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 start = sl; /* write it back and re-read */ rcu_read_lock(); - while (sl != r10_bio->read_slot) { + while (sl != slot) { if (sl==0) sl = conf->copies; sl--; @@ -2843,7 +2843,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 rcu_read_lock(); } sl = start; - while (sl != r10_bio->read_slot) { + while (sl != slot) { if (sl==0) sl = conf->copies; sl--; From 02c67a3b72b13951c2ca134bd7065f03ec57946d Mon Sep 17 00:00:00 2001 From: Li Nan Date: Sat, 24 Jun 2023 01:32:35 +0800 Subject: [PATCH 050/113] md: remove redundant check in fix_read_error() In fix_read_error(), 'success' is checked immediately after it is assigned; if it is set to 1, the loop breaks. Checking it again in the loop condition is redundant. Clean it up.
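In general form (try_read() stands in for the sync_page_io() retry logic and is not a real md function), the cleanup turns this:

	do {
		if (try_read(d))
			success = 1;
		if (success)
			break;
		d++;
		if (d == limit)
			d = 0;
	} while (!success && d != read_disk);

into this, since the break already ends the loop on success and the condition only needs to detect a full wrap-around:

	do {
		if (try_read(d)) {
			success = 1;
			break;
		}
		d++;
		if (d == limit)
			d = 0;
	} while (d != read_disk);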
Signed-off-by: Li Nan Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20230623173236.2513554-3-linan666@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f834d99a36f6d..a68c9cccbf0d2 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2301,7 +2301,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, d++; if (d == conf->raid_disks * 2) d = 0; - } while (!success && d != read_disk); + } while (d != read_disk); if (!success) { /* Cannot read from anywhere - mark it bad */ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index abea91a54db1d..757687fb90a7a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2783,7 +2783,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 sl++; if (sl == conf->copies) sl = 0; - } while (!success && sl != slot); + } while (sl != slot); rcu_read_unlock(); if (!success) { From b39f35ebe86d88788d85f61e83c81c308cb76727 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 28 Jun 2023 09:29:30 +0800 Subject: [PATCH 051/113] md: don't quiesce in mddev_suspend() Some levels don't implement "pers->quiesce"; for example, raid0_quiesce() is empty. Now that all levels hold 'active_io' until io is done, waiting for 'active_io' to reach 0 is enough to make sure all normal io is done, and percpu_ref_kill() for 'active_io' will make sure no new normal io can be dispatched. There is no need to call "pers->quiesce" from mddev_suspend() anymore. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20230628012931.88911-2-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index abb6167203938..962dacfd98cf8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -453,7 +453,6 @@ void mddev_suspend(struct mddev *mddev) mddev->pers->prepare_suspend(mddev); wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); - mddev->pers->quiesce(mddev, 1); clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags)); @@ -472,7 +471,6 @@ void mddev_resume(struct mddev *mddev) return; percpu_ref_resurrect(&mddev->active_io); wake_up(&mddev->sb_wait); - mddev->pers->quiesce(mddev, 0); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); From e24ed04389f9619e0aaef615a8948633c182a8b0 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 28 Jun 2023 09:29:31 +0800 Subject: [PATCH 052/113] md: restore 'noio_flag' for the last mddev_resume() memalloc_noio_save() is called for the first mddev_suspend(), and repeated mddev_suspend() calls only increase 'suspended'. However, memalloc_noio_restore() is also called for the first mddev_resume(), which means that memory reclaim will be enabled before the last mddev_resume() is called, while the array is still suspended. Fix this problem by restoring 'noio_flag' for the last mddev_resume().
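The intended pairing, as a simplified sketch (the real functions also quiesce io and handle superblock updates, and order these steps differently): the NOIO scope opened by the first, outermost suspend must stay in effect until the matching last resume.

	void sketch_suspend(struct mddev *mddev)
	{
		if (mddev->suspended++)
			return;				/* nested: no-op */
		/* ... wait for active_io to drain ... */
		mddev->noio_flag = memalloc_noio_save();
	}

	void sketch_resume(struct mddev *mddev)
	{
		if (--mddev->suspended)
			return;				/* still nested */
		memalloc_noio_restore(mddev->noio_flag);
		/* ... resurrect active_io, wake waiters ... */
	}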
Fixes: 78f57ef9d50a ("md: use memalloc scope APIs in mddev_suspend()/mddev_resume()") Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20230628012931.88911-3-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 962dacfd98cf8..a3d98273b295c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -464,11 +464,13 @@ EXPORT_SYMBOL_GPL(mddev_suspend); void mddev_resume(struct mddev *mddev) { - /* entred the memalloc scope from mddev_suspend() */ - memalloc_noio_restore(mddev->noio_flag); lockdep_assert_held(&mddev->reconfig_mutex); if (--mddev->suspended) return; + + /* entred the memalloc scope from mddev_suspend() */ + memalloc_noio_restore(mddev->noio_flag); + percpu_ref_resurrect(&mddev->active_io); wake_up(&mddev->sb_wait); From 21bd9a68fef47c4f0e951be9a6fac9745cee1bab Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Wed, 5 Jul 2023 13:32:27 +0200 Subject: [PATCH 053/113] md/raid1: Avoid lock contention from wake_up() wake_up() is called unconditionally in a few paths such as make_request(), which causes lock contention under high-concurrency workloads, as in the call chain below: raid1_end_write_request wake_up __wake_up_common_lock spin_lock_irqsave Improve performance by only calling wake_up() if the waitqueue is not empty. Fio test script: [global] name=random reads and writes ioengine=libaio direct=1 readwrite=randrw rwmixread=70 iodepth=64 buffered=0 filename=/dev/md0 size=1G runtime=30 time_based randrepeat=0 norandommap refill_buffers ramp_time=10 bs=4k numjobs=400 group_reporting=1 [job1] Test results with 2 ramdisks in raid1 on an Intel Broadwell 56-core server. Before this patch With this patch READ BW=4621MB/s BW=7337MB/s WRITE BW=1980MB/s BW=3144MB/s The patch is inspired by Yu Kuai's change for raid10: https://lore.kernel.org/r/20230621105728.1268542-1-yukuai1@huaweicloud.com Cc: Yu Kuai Signed-off-by: Jack Wang Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20230705113227.148494-1-jinpu.wang@ionos.com Signed-off-by: Song Liu --- drivers/md/raid1.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a68c9cccbf0d2..23d211969565e 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -789,11 +789,17 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect return best_disk; } +static void wake_up_barrier(struct r1conf *conf) +{ + if (wq_has_sleeper(&conf->wait_barrier)) + wake_up(&conf->wait_barrier); +} + static void flush_bio_list(struct r1conf *conf, struct bio *bio) { /* flush any pending bitmap writes to disk before proceeding w/ I/O */ raid1_prepare_flush_writes(conf->mddev->bitmap); - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; @@ -970,7 +976,7 @@ static bool _wait_barrier(struct r1conf *conf, int idx, bool nowait) * In case freeze_array() is waiting for * get_unqueued_pending() == extra */ - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); /* Wait for the barrier in same barrier unit bucket to drop.
*/ /* Return false when nowait flag is set */ @@ -1013,7 +1019,7 @@ static bool wait_read_barrier(struct r1conf *conf, sector_t sector_nr, bool nowa * In case freeze_array() is waiting for * get_unqueued_pending() == extra */ - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); /* Wait for array to be unfrozen */ /* Return false when nowait flag is set */ @@ -1042,7 +1048,7 @@ static bool wait_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait) static void _allow_barrier(struct r1conf *conf, int idx) { atomic_dec(&conf->nr_pending[idx]); - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); } static void allow_barrier(struct r1conf *conf, sector_t sector_nr) @@ -1171,7 +1177,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) spin_lock_irq(&conf->device_lock); bio_list_merge(&conf->pending_bio_list, &plug->pending); spin_unlock_irq(&conf->device_lock); - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); md_wakeup_thread(mddev->thread); kfree(plug); return; @@ -1574,7 +1580,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, r1_bio_write_done(r1_bio); /* In case raid1d snuck in to freeze_array */ - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); } static bool raid1_make_request(struct mddev *mddev, struct bio *bio) From 7e85c41b9e1df9192c225afd7cfec8dcad137feb Mon Sep 17 00:00:00 2001 From: Li Nan Date: Sat, 1 Jul 2023 16:05:27 +0800 Subject: [PATCH 054/113] md/raid10: check replacement and rdev to prevent submit the same io twice After commit 4ca40c2ce099 ("md/raid10: Allow replacement device to be replace old drive."), 'rdev' and 'replacement' could appear to be identical. There are already checks for that in wait_blocked_dev() and raid10_write_request(). Add the check to raid10_handle_discard() now. Signed-off-by: Li Nan Link: https://lore.kernel.org/r/20230701080529.2684932-2-linan666@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid10.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 757687fb90a7a..60963449d3f50 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1785,6 +1785,8 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) r10_bio->devs[disk].bio = NULL; r10_bio->devs[disk].repl_bio = NULL; + if (rdev == rrdev) + rrdev = NULL; if (rdev && (test_bit(Faulty, &rdev->flags))) rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags))) From b99f8fd2d91eb734f13098aa1cf337edaca454b7 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Sat, 1 Jul 2023 16:05:28 +0800 Subject: [PATCH 055/113] md/raid10: factor out dereference_rdev_and_rrdev() Factor out a helper to get 'rdev' and 'replacement' from conf->mirrors. This just makes the code cleaner and prepares to fix the bug of io loss while 'replacement' replaces 'rdev'. There is no functional change.
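The barrier in the new helper pairs with the write side of a replacement promotion. A sketch of that write side (modelled on raid10's existing promotion code; 'p' is the raid10_info being updated): because 'rdev' is published before 'replacement' is cleared, a reader that loads 'replacement' first may see both pointers name the same rdev, but can never see both as NULL.

	p->rdev = p->replacement;	/* promote the replacement */
	smp_mb();			/* pairs with smp_mb() in the helper */
	p->replacement = NULL;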
Signed-off-by: Li Nan Link: https://lore.kernel.org/r/20230701080529.2684932-3-linan666@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid10.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 60963449d3f50..d21aeb9546d9f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1321,6 +1321,25 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, } } +static struct md_rdev *dereference_rdev_and_rrdev(struct raid10_info *mirror, + struct md_rdev **prrdev) +{ + struct md_rdev *rdev, *rrdev; + + rrdev = rcu_dereference(mirror->replacement); + /* + * Read replacement first to prevent reading both rdev and + * replacement as NULL during replacement replace rdev. + */ + smp_mb(); + rdev = rcu_dereference(mirror->rdev); + if (rdev == rrdev) + rrdev = NULL; + + *prrdev = rrdev; + return rdev; +} + static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) { int i; @@ -1464,15 +1483,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, int d = r10_bio->devs[i].devnum; struct md_rdev *rdev, *rrdev; - rrdev = rcu_dereference(conf->mirrors[d].replacement); - /* - * Read replacement first to prevent reading both rdev and - * replacement as NULL during replacement replace rdev. - */ - smp_mb(); - rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev == rrdev) - rrdev = NULL; + rdev = dereference_rdev_and_rrdev(&conf->mirrors[d], &rrdev); if (rdev && (test_bit(Faulty, &rdev->flags))) rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags))) From 673643490b9a0eb3b25633abe604f62b8f63dba1 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Sat, 1 Jul 2023 16:05:29 +0800 Subject: [PATCH 056/113] md/raid10: use dereference_rdev_and_rrdev() to get devices Commit 2ae6aaf76912 ("md/raid10: fix io loss while replacement replace rdev") reads replacement first to prevent io loss. However, the same issue exists in wait_blocked_dev() and raid10_handle_discard(), too. Fix it by using dereference_rdev_and_rrdev() to get devices.
Fixes: d30588b2731f ("md/raid10: improve raid10 discard request") Fixes: f2e7e269a752 ("md/raid10: pull the code that wait for blocked dev into one function") Signed-off-by: Li Nan Link: https://lore.kernel.org/r/20230701080529.2684932-4-linan666@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid10.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index d21aeb9546d9f..16aa9d735880a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1350,11 +1350,9 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) blocked_rdev = NULL; rcu_read_lock(); for (i = 0; i < conf->copies; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); - struct md_rdev *rrdev = rcu_dereference( - conf->mirrors[i].replacement); - if (rdev == rrdev) - rrdev = NULL; + struct md_rdev *rdev, *rrdev; + + rdev = dereference_rdev_and_rrdev(&conf->mirrors[i], &rrdev); if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { atomic_inc(&rdev->nr_pending); blocked_rdev = rdev; @@ -1789,15 +1787,12 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) */ rcu_read_lock(); for (disk = 0; disk < geo->raid_disks; disk++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); - struct md_rdev *rrdev = rcu_dereference( - conf->mirrors[disk].replacement); + struct md_rdev *rdev, *rrdev; + rdev = dereference_rdev_and_rrdev(&conf->mirrors[disk], &rrdev); r10_bio->devs[disk].bio = NULL; r10_bio->devs[disk].repl_bio = NULL; - if (rdev == rrdev) - rrdev = NULL; if (rdev && (test_bit(Faulty, &rdev->flags))) rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags))) From b4d129640f194ffc4cc64c3e97f98ae944c072e8 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 6 Jul 2023 16:37:26 +0800 Subject: [PATCH 057/113] md/md-bitmap: remove unnecessary local variable in backlog_store() The local variable is already defined at the beginning of backlog_store(); there is no need to define it again. Fixes: 8c13ab115b57 ("md/bitmap: don't set max_write_behind if there is no write mostly device") Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20230706083727.608914-2-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index a58a4c30265e6..1f33399611592 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2569,8 +2569,6 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) mddev_destroy_serial_pool(mddev, NULL, false); } else if (backlog && !mddev->serial_info_pool) { /* serial_info_pool is needed since backlog is not zero */ - struct md_rdev *rdev; - rdev_for_each(rdev, mddev) mddev_create_serial_pool(mddev, rdev, false); } From 44abfa6a95df425c0660d56043020b67e6d93ab8 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 6 Jul 2023 16:37:27 +0800 Subject: [PATCH 058/113] md/md-bitmap: hold 'reconfig_mutex' in backlog_store() Several reasons why 'reconfig_mutex' should be held: 1) rdev_for_each() is not safe to be called without the lock, because rdev can be removed concurrently. 2) mddev_destroy_serial_pool() and mddev_create_serial_pool() should not be called concurrently. 3) mddev_suspend() from mddev_destroy/create_serial_pool() should be protected by the lock.
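Condensed from the diff that follows, the resulting store-side pattern: take the lock up front (mddev_lock() is interruptible and can fail), and make every early return after that point unlock.

	rv = mddev_lock(mddev);
	if (rv)
		return rv;
	/* rdev_for_each() is now safe against concurrent removal, and
	 * the serial pool create/destroy calls are serialized. */
	...
	mddev_unlock(mddev);	/* on every return path */
	return len;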
Fixes: 10c92fca636e ("md-bitmap: create and destroy wb_info_pool with the change of backlog") Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20230706083727.608914-3-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 1f33399611592..6f9ff14971f98 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2546,6 +2546,10 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) if (backlog > COUNTER_MAX) return -EINVAL; + rv = mddev_lock(mddev); + if (rv) + return rv; + /* * Without write mostly device, it doesn't make sense to set * backlog for max_write_behind. @@ -2559,6 +2563,7 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) if (!has_write_mostly) { pr_warn_ratelimited("%s: can't set backlog, no write mostly device available\n", mdname(mddev)); + mddev_unlock(mddev); return -EINVAL; } @@ -2574,6 +2579,8 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) } if (old_mwb != backlog) md_bitmap_update_sb(mddev->bitmap); + + mddev_unlock(mddev); return len; } From 4a8b719f95c0dcd15fb7a04b806ad8139fa7c850 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Aug 2023 19:21:56 +0200 Subject: [PATCH 059/113] fs: remove emergency_thaw_bdev Fold emergency_thaw_bdev into its only caller, to prepare for buffer.c to be built only when buffer_head support is enabled. Signed-off-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20230801172201.1923299-2-hch@lst.de Signed-off-by: Jens Axboe --- fs/buffer.c | 6 ------ fs/internal.h | 6 ------ fs/super.c | 4 +++- 3 files changed, 3 insertions(+), 13 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index bd091329026c0..376f468e16662 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -562,12 +562,6 @@ static int osync_buffers_list(spinlock_t *lock, struct list_head *list) return err; } -void emergency_thaw_bdev(struct super_block *sb) -{ - while (sb->s_bdev && !thaw_bdev(sb->s_bdev)) - printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev); -} - /** * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers * @mapping: the mapping which wants those buffers written diff --git a/fs/internal.h b/fs/internal.h index f7a3dc1110264..d538d832fd608 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -23,16 +23,10 @@ struct mnt_idmap; */ #ifdef CONFIG_BLOCK extern void __init bdev_cache_init(void); - -void emergency_thaw_bdev(struct super_block *sb); #else static inline void bdev_cache_init(void) { } -static inline int emergency_thaw_bdev(struct super_block *sb) -{ - return 0; -} #endif /* CONFIG_BLOCK */ /* diff --git a/fs/super.c b/fs/super.c index e781226e28800..bc666e7ee1a98 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1029,7 +1029,9 @@ static void do_thaw_all_callback(struct super_block *sb) { down_write(&sb->s_umount); if (sb->s_root && sb->s_flags & SB_BORN) { - emergency_thaw_bdev(sb); + if (IS_ENABLED(CONFIG_BLOCK)) + while (sb->s_bdev && !thaw_bdev(sb->s_bdev)) + pr_warn("Emergency Thaw on %pg\n", sb->s_bdev); thaw_super_locked(sb); } else { up_write(&sb->s_umount); From 2ba39cc46bfe463cb9673bf62a04c4c21942f1f2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Aug 2023 19:21:57 +0200 Subject: [PATCH 060/113] fs: rename and move block_page_mkwrite_return block_page_mkwrite_return is neither block nor
mkwrite specific, and should not be under CONFIG_BLOCK. Move it to mm.h and rename it to vmf_fs_error. Signed-off-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20230801172201.1923299-3-hch@lst.de Signed-off-by: Jens Axboe --- fs/ext4/inode.c | 2 +- fs/f2fs/file.c | 2 +- fs/gfs2/file.c | 16 ++++++++-------- fs/iomap/buffered-io.c | 2 +- fs/nilfs2/file.c | 2 +- fs/udf/file.c | 2 +- include/linux/buffer_head.h | 12 ------------ include/linux/mm.h | 18 ++++++++++++++++++ 8 files changed, 31 insertions(+), 25 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 43775a6ca5054..6eea0886b8855 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -6140,7 +6140,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_alloc; out_ret: - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); out: filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(inode->i_sb); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 093039dee9920..9b3871fb9bfc4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -159,7 +159,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) sb_end_pagefault(inode->i_sb); err: - return block_page_mkwrite_return(err); + return vmf_fs_error(err); } static const struct vm_operations_struct f2fs_file_vm_ops = { diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 1bf3c4453516f..897ef62d6d77a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -432,7 +432,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); err = gfs2_glock_nq(&gh); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_uninit; } @@ -474,7 +474,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) err = gfs2_rindex_update(sdp); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_unlock; } @@ -482,12 +482,12 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) ap.target = data_blocks + ind_blocks; err = gfs2_quota_lock_check(ip, &ap); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_unlock; } err = gfs2_inplace_reserve(ip, &ap); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_quota_unlock; } @@ -500,7 +500,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) } err = gfs2_trans_begin(sdp, rblocks, 0); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_trans_fail; } @@ -508,7 +508,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) if (gfs2_is_stuffed(ip)) { err = gfs2_unstuff_dinode(ip); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_trans_end; } } @@ -524,7 +524,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) err = gfs2_allocate_page_backing(page, length); if (err) - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); out_page_locked: if (ret != VM_FAULT_LOCKED) @@ -558,7 +558,7 @@ static vm_fault_t gfs2_fault(struct vm_fault *vmf) gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); err = gfs2_glock_nq(&gh); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_uninit; } ret = filemap_fault(vmf); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index adb92cdb24b00..0607790827b48 100644 --- 
a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1286,7 +1286,7 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) return VM_FAULT_LOCKED; out_unlock: folio_unlock(folio); - return block_page_mkwrite_return(ret); + return vmf_fs_error(ret); } EXPORT_SYMBOL_GPL(iomap_page_mkwrite); diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index a9eb3487efb2c..740ce26d1e765 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -108,7 +108,7 @@ static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf) wait_for_stable_page(page); out: sb_end_pagefault(inode->i_sb); - return block_page_mkwrite_return(ret); + return vmf_fs_error(ret); } static const struct vm_operations_struct nilfs_file_vm_ops = { diff --git a/fs/udf/file.c b/fs/udf/file.c index 243840dc83add..c0e2080e639ee 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -67,7 +67,7 @@ static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf) err = block_commit_write(page, 0, end); if (err < 0) { unlock_page(page); - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_unlock; } out_dirty: diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 6cb3e9af78c9e..7002a9ff63a3d 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -291,18 +291,6 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size); int block_commit_write(struct page *page, unsigned from, unsigned to); int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); -/* Convert errno to return value from ->page_mkwrite() call */ -static inline vm_fault_t block_page_mkwrite_return(int err) -{ - if (err == 0) - return VM_FAULT_LOCKED; - if (err == -EFAULT || err == -EAGAIN) - return VM_FAULT_NOPAGE; - if (err == -ENOMEM) - return VM_FAULT_OOM; - /* -ENOSPC, -EDQUOT, -EIO ... */ - return VM_FAULT_SIGBUS; -} sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); int block_truncate_page(struct address_space *, loff_t, get_block_t *); diff --git a/include/linux/mm.h b/include/linux/mm.h index 2dd73e4f3d8e3..75777eae1c9c2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3386,6 +3386,24 @@ static inline vm_fault_t vmf_error(int err) return VM_FAULT_SIGBUS; } +/* + * Convert errno to return value for ->page_mkwrite() calls. + * + * This should eventually be merged with vmf_error() above, but will need a + * careful audit of all vmf_error() callers. + */ +static inline vm_fault_t vmf_fs_error(int err) +{ + if (err == 0) + return VM_FAULT_LOCKED; + if (err == -EFAULT || err == -EAGAIN) + return VM_FAULT_NOPAGE; + if (err == -ENOMEM) + return VM_FAULT_OOM; + /* -ENOSPC, -EDQUOT, -EIO ... */ + return VM_FAULT_SIGBUS; +} + struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags); From 727cfe976758b79f8d2f8051c75a5ccb14539a56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Aug 2023 19:21:58 +0200 Subject: [PATCH 061/113] block: open code __generic_file_write_iter for blkdev writes Open code __generic_file_write_iter to remove the indirect call into ->direct_IO and to prepare using the iomap based write code. 
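The shape of the open-coded path, condensed from the diff below: privileges and timestamps are handled first, then either the direct path with a buffered fallback for any unwritten tail, or plain buffered writes.

	ret = file_remove_privs(file);
	if (!ret)
		ret = file_update_time(file);
	if (ret)
		return ret;

	if (iocb->ki_flags & IOCB_DIRECT) {
		ret = blkdev_direct_write(iocb, from);
		if (ret >= 0 && iov_iter_count(from))
			ret = direct_write_fallback(iocb, from, ret,
					generic_perform_write(iocb, from));
	} else {
		ret = generic_perform_write(iocb, from);
	}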
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Christian Brauner Reviewed-by: Hannes Reinecke Reviewed-by: Luis Chamberlain Link: https://lore.kernel.org/r/20230801172201.1923299-4-hch@lst.de Signed-off-by: Jens Axboe --- block/fops.c | 45 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/block/fops.c b/block/fops.c index a286bf3325c5d..8a05d99166e3b 100644 --- a/block/fops.c +++ b/block/fops.c @@ -533,6 +533,30 @@ static int blkdev_release(struct inode *inode, struct file *filp) return 0; } +static ssize_t +blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from) +{ + size_t count = iov_iter_count(from); + ssize_t written; + + written = kiocb_invalidate_pages(iocb, count); + if (written) { + if (written == -EBUSY) + return 0; + return written; + } + + written = blkdev_direct_IO(iocb, from); + if (written > 0) { + kiocb_invalidate_post_direct_write(iocb, count); + iocb->ki_pos += written; + count -= written; + } + if (written != -EIOCBQUEUED) + iov_iter_revert(from, count - iov_iter_count(from)); + return written; +} + /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device. @@ -542,7 +566,8 @@ static int blkdev_release(struct inode *inode, struct file *filp) */ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); + struct file *file = iocb->ki_filp; + struct block_device *bdev = I_BDEV(file->f_mapping->host); struct inode *bd_inode = bdev->bd_inode; loff_t size = bdev_nr_bytes(bdev); size_t shorted = 0; @@ -569,7 +594,23 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) iov_iter_truncate(from, size); } - ret = __generic_file_write_iter(iocb, from); + ret = file_remove_privs(file); + if (ret) + return ret; + + ret = file_update_time(file); + if (ret) + return ret; + + if (iocb->ki_flags & IOCB_DIRECT) { + ret = blkdev_direct_write(iocb, from); + if (ret >= 0 && iov_iter_count(from)) + ret = direct_write_fallback(iocb, from, ret, + generic_perform_write(iocb, from)); + } else { + ret = generic_perform_write(iocb, from); + } + if (ret > 0) ret = generic_write_sync(iocb, ret); iov_iter_reexpand(from, iov_iter_count(from) + shorted); From a05f7bd9578b17521a9a5f3689f3934c082c6390 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Aug 2023 19:21:59 +0200 Subject: [PATCH 062/113] block: stop setting ->direct_IO Direct I/O on block devices now never goes through aops->direct_IO. Stop setting it and set FMODE_CAN_ODIRECT in ->open instead. Signed-off-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230801172201.1923299-5-hch@lst.de Signed-off-by: Jens Axboe --- block/fops.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/fops.c b/block/fops.c index 8a05d99166e3b..f0b822c28ddfe 100644 --- a/block/fops.c +++ b/block/fops.c @@ -428,7 +428,6 @@ const struct address_space_operations def_blk_aops = { .writepage = blkdev_writepage, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, - .direct_IO = blkdev_direct_IO, .migrate_folio = buffer_migrate_folio_norefs, .is_dirty_writeback = buffer_check_dirty_writeback, }; @@ -505,7 +504,7 @@ static int blkdev_open(struct inode *inode, struct file *filp) * during an unstable branch.
*/ filp->f_flags |= O_LARGEFILE; - filp->f_mode |= FMODE_BUF_RASYNC; + filp->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; /* * Use the file private data to store the holder for exclusive openes. From 487c607df790d366e67a7d6a30adf785cdd98e55 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Aug 2023 19:22:00 +0200 Subject: [PATCH 063/113] block: use iomap for writes to block devices Use iomap in buffer_head compat mode to write to block devices. Signed-off-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Reviewed-by: Pankaj Raghav Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20230801172201.1923299-6-hch@lst.de Signed-off-by: Jens Axboe --- block/Kconfig | 1 + block/fops.c | 31 +++++++++++++++++++++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/block/Kconfig b/block/Kconfig index 86122e459fe04..1a13ef0b1ca10 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -5,6 +5,7 @@ menuconfig BLOCK bool "Enable the block layer" if EXPERT default y + select FS_IOMAP select SBITMAP help Provide block layer support for the kernel. diff --git a/block/fops.c b/block/fops.c index f0b822c28ddfe..063ece37d44e4 100644 --- a/block/fops.c +++ b/block/fops.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "blk.h" @@ -386,6 +387,27 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); } +static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, struct iomap *srcmap) +{ + struct block_device *bdev = I_BDEV(inode); + loff_t isize = i_size_read(inode); + + iomap->bdev = bdev; + iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); + if (iomap->offset >= isize) + return -EIO; + iomap->type = IOMAP_MAPPED; + iomap->addr = iomap->offset; + iomap->length = isize - iomap->offset; + iomap->flags |= IOMAP_F_BUFFER_HEAD; + return 0; +} + +static const struct iomap_ops blkdev_iomap_ops = { + .iomap_begin = blkdev_iomap_begin, +}; + static int blkdev_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, blkdev_get_block, wbc); @@ -556,6 +578,11 @@ blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from) return written; } +static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from) +{ + return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops); +} + /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device. @@ -605,9 +632,9 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = blkdev_direct_write(iocb, from); if (ret >= 0 && iov_iter_count(from)) ret = direct_write_fallback(iocb, from, ret, - generic_perform_write(iocb, from)); + blkdev_buffered_write(iocb, from)); } else { - ret = generic_perform_write(iocb, from); + ret = blkdev_buffered_write(iocb, from); } if (ret > 0) From 925c86a19bacf8ce10eb666328fb3fa5aff7b951 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Aug 2023 19:22:01 +0200 Subject: [PATCH 064/113] fs: add CONFIG_BUFFER_HEAD Add a new config option that controls building the buffer_head code, and select it from all file systems and stacking drivers that need it. 
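A sketch of the iomap tweak referred to in the next paragraph (the exact bit value here is illustrative): when CONFIG_BUFFER_HEAD is disabled the flag collapses to 0, so a statement like "iomap->flags |= IOMAP_F_BUFFER_HEAD" compiles to a no-op and never routes I/O into fs/buffer.c.

	#ifdef CONFIG_BUFFER_HEAD
	#define IOMAP_F_BUFFER_HEAD	(1U << 4)
	#else
	#define IOMAP_F_BUFFER_HEAD	0
	#endif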
For the block device nodes, an alternative iomap based buffered I/O path is provided when buffer_head support is not enabled, and iomap needs a small tweak to define the IOMAP_F_BUFFER_HEAD flag to 0 to not call into the buffer_head code when it doesn't exist. Otherwise this is just Kconfig and ifdef changes. Signed-off-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230801172201.1923299-7-hch@lst.de Signed-off-by: Jens Axboe --- block/fops.c | 70 ++++++++++++++++++++++++++++++------ drivers/md/Kconfig | 1 + fs/Kconfig | 4 +++ fs/Makefile | 2 +- fs/adfs/Kconfig | 1 + fs/affs/Kconfig | 1 + fs/befs/Kconfig | 1 + fs/bfs/Kconfig | 1 + fs/efs/Kconfig | 1 + fs/exfat/Kconfig | 1 + fs/ext2/Kconfig | 1 + fs/ext4/Kconfig | 1 + fs/f2fs/Kconfig | 1 + fs/fat/Kconfig | 1 + fs/freevxfs/Kconfig | 1 + fs/gfs2/Kconfig | 1 + fs/hfs/Kconfig | 1 + fs/hfsplus/Kconfig | 1 + fs/hpfs/Kconfig | 1 + fs/isofs/Kconfig | 1 + fs/jfs/Kconfig | 1 + fs/minix/Kconfig | 1 + fs/nilfs2/Kconfig | 1 + fs/ntfs/Kconfig | 1 + fs/ntfs3/Kconfig | 1 + fs/ocfs2/Kconfig | 1 + fs/omfs/Kconfig | 1 + fs/qnx4/Kconfig | 1 + fs/qnx6/Kconfig | 1 + fs/reiserfs/Kconfig | 1 + fs/sysv/Kconfig | 1 + fs/udf/Kconfig | 1 + fs/ufs/Kconfig | 1 + include/linux/buffer_head.h | 32 ++++++++--------- include/linux/iomap.h | 4 +++ include/trace/events/block.h | 2 ++ mm/migrate.c | 4 +-- 37 files changed, 119 insertions(+), 29 deletions(-) diff --git a/block/fops.c b/block/fops.c index 063ece37d44e4..eaa98a987213d 100644 --- a/block/fops.c +++ b/block/fops.c @@ -24,15 +24,6 @@ static inline struct inode *bdev_file_inode(struct file *file) return file->f_mapping->host; } -static int blkdev_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) -{ - bh->b_bdev = I_BDEV(inode); - bh->b_blocknr = iblock; - set_buffer_mapped(bh); - return 0; -} - static blk_opf_t dio_bio_write_op(struct kiocb *iocb) { blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; @@ -400,7 +391,7 @@ static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->type = IOMAP_MAPPED; iomap->addr = iomap->offset; iomap->length = isize - iomap->offset; - iomap->flags |= IOMAP_F_BUFFER_HEAD; + iomap->flags |= IOMAP_F_BUFFER_HEAD; /* noop for !CONFIG_BUFFER_HEAD */ return 0; } @@ -408,6 +399,16 @@ static const struct iomap_ops blkdev_iomap_ops = { .iomap_begin = blkdev_iomap_begin, }; +#ifdef CONFIG_BUFFER_HEAD +static int blkdev_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + bh->b_bdev = I_BDEV(inode); + bh->b_blocknr = iblock; + set_buffer_mapped(bh); + return 0; +} + static int blkdev_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, blkdev_get_block, wbc); @@ -453,6 +454,55 @@ const struct address_space_operations def_blk_aops = { .migrate_folio = buffer_migrate_folio_norefs, .is_dirty_writeback = buffer_check_dirty_writeback, }; +#else /* CONFIG_BUFFER_HEAD */ +static int blkdev_read_folio(struct file *file, struct folio *folio) +{ + return iomap_read_folio(folio, &blkdev_iomap_ops); +} + +static void blkdev_readahead(struct readahead_control *rac) +{ + iomap_readahead(rac, &blkdev_iomap_ops); +} + +static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc, + struct inode *inode, loff_t offset) +{ + loff_t isize = i_size_read(inode); + + if (WARN_ON_ONCE(offset >= isize)) + return -EIO; + if (offset >= wpc->iomap.offset && + offset < wpc->iomap.offset + wpc->iomap.length)
return 0; + return blkdev_iomap_begin(inode, offset, isize - offset, + IOMAP_WRITE, &wpc->iomap, NULL); +} + +static const struct iomap_writeback_ops blkdev_writeback_ops = { + .map_blocks = blkdev_map_blocks, +}; + +static int blkdev_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct iomap_writepage_ctx wpc = { }; + + return iomap_writepages(mapping, wbc, &wpc, &blkdev_writeback_ops); +} + +const struct address_space_operations def_blk_aops = { + .dirty_folio = filemap_dirty_folio, + .release_folio = iomap_release_folio, + .invalidate_folio = iomap_invalidate_folio, + .read_folio = blkdev_read_folio, + .readahead = blkdev_readahead, + .writepages = blkdev_writepages, + .is_partially_uptodate = iomap_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, + .migrate_folio = filemap_migrate_folio, +}; +#endif /* CONFIG_BUFFER_HEAD */ /* * for a block special file file_inode(file)->i_size is zero diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 444517d1a2336..2a8b081bce7dd 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -15,6 +15,7 @@ if MD config BLK_DEV_MD tristate "RAID support" select BLOCK_HOLDER_DEPRECATED if SYSFS + select BUFFER_HEAD # BLOCK_LEGACY_AUTOLOAD requirement should be removed # after relevant mdadm enhancements - to make "names=yes" # the default - are widely available. diff --git a/fs/Kconfig b/fs/Kconfig index 18d034ec79539..e8b17c81b83a8 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -18,8 +18,12 @@ config VALIDATE_FS_PARSER config FS_IOMAP bool +config BUFFER_HEAD + bool + # old blockdev_direct_IO implementation. Use iomap for new code instead config LEGACY_DIRECT_IO + depends on BUFFER_HEAD bool if BLOCK diff --git a/fs/Makefile b/fs/Makefile index e513aaee0603a..f9541f40be4e0 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -17,7 +17,7 @@ obj-y := open.o read_write.o file_table.o super.o \ fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ kernel_read_file.o mnt_idmapping.o remap_range.o -obj-$(CONFIG_BLOCK) += buffer.o mpage.o +obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o obj-$(CONFIG_PROC_FS) += proc_namespace.o obj-$(CONFIG_LEGACY_DIRECT_IO) += direct-io.o obj-y += notify/ diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig index 44738fed66251..1b97058f0c4a9 100644 --- a/fs/adfs/Kconfig +++ b/fs/adfs/Kconfig @@ -2,6 +2,7 @@ config ADFS_FS tristate "ADFS file system support" depends on BLOCK + select BUFFER_HEAD help The Acorn Disc Filing System is the standard file system of the RiscOS operating system which runs on Acorn's ARM-based Risc PC diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig index 962b86374e1c1..1ae432d266c32 100644 --- a/fs/affs/Kconfig +++ b/fs/affs/Kconfig @@ -2,6 +2,7 @@ config AFFS_FS tristate "Amiga FFS file system support" depends on BLOCK + select BUFFER_HEAD select LEGACY_DIRECT_IO help The Fast File System (FFS) is the common file system used on hard diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig index 9550b6462b814..5fcfc4024ffe6 100644 --- a/fs/befs/Kconfig +++ b/fs/befs/Kconfig @@ -2,6 +2,7 @@ config BEFS_FS tristate "BeOS file system (BeFS) support (read only)" depends on BLOCK + select BUFFER_HEAD select NLS help The BeOS File System (BeFS) is the native file system of Be, Inc's diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig index 3a757805b5856..8e7ef866b62a6 100644 --- a/fs/bfs/Kconfig +++ b/fs/bfs/Kconfig @@ -2,6 +2,7 @@ config BFS_FS tristate "BFS file system support" depends on BLOCK + select BUFFER_HEAD help Boot File System (BFS) is a file system used under SCO 
UnixWare to allow the bootloader access to the kernel image and other important diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig index 2df1bac8b375b..0833e533df9d5 100644 --- a/fs/efs/Kconfig +++ b/fs/efs/Kconfig @@ -2,6 +2,7 @@ config EFS_FS tristate "EFS file system support (read only)" depends on BLOCK + select BUFFER_HEAD help EFS is an older file system used for non-ISO9660 CD-ROMs and hard disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer diff --git a/fs/exfat/Kconfig b/fs/exfat/Kconfig index 147edeb044691..cbeca8e44d9b3 100644 --- a/fs/exfat/Kconfig +++ b/fs/exfat/Kconfig @@ -2,6 +2,7 @@ config EXFAT_FS tristate "exFAT filesystem support" + select BUFFER_HEAD select NLS select LEGACY_DIRECT_IO help diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig index 77393fda99af0..74d98965902e1 100644 --- a/fs/ext2/Kconfig +++ b/fs/ext2/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config EXT2_FS tristate "Second extended fs support" + select BUFFER_HEAD select FS_IOMAP select LEGACY_DIRECT_IO help diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 86699c8cab281..e20d59221fc05 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -28,6 +28,7 @@ config EXT3_FS_SECURITY config EXT4_FS tristate "The Extended 4 (ext4) filesystem" + select BUFFER_HEAD select JBD2 select CRC16 select CRYPTO diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 03ef087537c7c..68a1e23e1557c 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -2,6 +2,7 @@ config F2FS_FS tristate "F2FS filesystem support" depends on BLOCK + select BUFFER_HEAD select NLS select CRYPTO select CRYPTO_CRC32 diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig index afe83b4e71728..25fae1c83725b 100644 --- a/fs/fat/Kconfig +++ b/fs/fat/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config FAT_FS tristate + select BUFFER_HEAD select NLS select LEGACY_DIRECT_IO help diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig index 0e2fc08f7de49..912107ebea6f4 100644 --- a/fs/freevxfs/Kconfig +++ b/fs/freevxfs/Kconfig @@ -2,6 +2,7 @@ config VXFS_FS tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)" depends on BLOCK + select BUFFER_HEAD help FreeVxFS is a file system driver that support the VERITAS VxFS(TM) file system format. 
VERITAS VxFS(TM) is the standard file system diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index 03c966840422e..be7f87a8e11ae 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config GFS2_FS tristate "GFS2 file system support" + select BUFFER_HEAD select FS_POSIX_ACL select CRC32 select LIBCRC32C diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig index d985066006d58..5ea5cd8ecea9c 100644 --- a/fs/hfs/Kconfig +++ b/fs/hfs/Kconfig @@ -2,6 +2,7 @@ config HFS_FS tristate "Apple Macintosh file system support" depends on BLOCK + select BUFFER_HEAD select NLS select LEGACY_DIRECT_IO help diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig index 8034e7827a690..8ce4a33a9ac78 100644 --- a/fs/hfsplus/Kconfig +++ b/fs/hfsplus/Kconfig @@ -2,6 +2,7 @@ config HFSPLUS_FS tristate "Apple Extended HFS file system support" depends on BLOCK + select BUFFER_HEAD select NLS select NLS_UTF8 select LEGACY_DIRECT_IO diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig index ec975f4668775..ac1e9318e65a4 100644 --- a/fs/hpfs/Kconfig +++ b/fs/hpfs/Kconfig @@ -2,6 +2,7 @@ config HPFS_FS tristate "OS/2 HPFS file system support" depends on BLOCK + select BUFFER_HEAD select FS_IOMAP help OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS diff --git a/fs/isofs/Kconfig b/fs/isofs/Kconfig index 08ffd37b9bb8f..51434f2a471b0 100644 --- a/fs/isofs/Kconfig +++ b/fs/isofs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config ISO9660_FS tristate "ISO 9660 CDROM file system support" + select BUFFER_HEAD help This is the standard file system used on CD-ROMs. It was previously known as "High Sierra File System" and is called "hsfs" on other diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig index 51e856f0e4b8d..17488440eef1a 100644 --- a/fs/jfs/Kconfig +++ b/fs/jfs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config JFS_FS tristate "JFS filesystem support" + select BUFFER_HEAD select NLS select CRC32 select LEGACY_DIRECT_IO diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig index de2003974ff0d..90ddfad2a75e8 100644 --- a/fs/minix/Kconfig +++ b/fs/minix/Kconfig @@ -2,6 +2,7 @@ config MINIX_FS tristate "Minix file system support" depends on BLOCK + select BUFFER_HEAD help Minix is a simple operating system used in many classes about OS's. The minix file system (method to organize files on a hard disk diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig index 7d59567465e12..7dae168e346e3 100644 --- a/fs/nilfs2/Kconfig +++ b/fs/nilfs2/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config NILFS2_FS tristate "NILFS2 file system support" + select BUFFER_HEAD select CRC32 select LEGACY_DIRECT_IO help diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig index f93e69a612833..7b2509741735a 100644 --- a/fs/ntfs/Kconfig +++ b/fs/ntfs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config NTFS_FS tristate "NTFS file system support" + select BUFFER_HEAD select NLS help NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003. 
diff --git a/fs/ntfs3/Kconfig b/fs/ntfs3/Kconfig index 96cc236f7f7bd..cdfdf51e55d79 100644 --- a/fs/ntfs3/Kconfig +++ b/fs/ntfs3/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config NTFS3_FS tristate "NTFS Read-Write file system support" + select BUFFER_HEAD select NLS select LEGACY_DIRECT_IO help diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig index 3123da7cfb301..2514d36cbe015 100644 --- a/fs/ocfs2/Kconfig +++ b/fs/ocfs2/Kconfig @@ -2,6 +2,7 @@ config OCFS2_FS tristate "OCFS2 file system support" depends on INET && SYSFS && CONFIGFS_FS + select BUFFER_HEAD select JBD2 select CRC32 select QUOTA diff --git a/fs/omfs/Kconfig b/fs/omfs/Kconfig index 42b2ec35a05bf..8470f6c3e64e6 100644 --- a/fs/omfs/Kconfig +++ b/fs/omfs/Kconfig @@ -2,6 +2,7 @@ config OMFS_FS tristate "SonicBlue Optimized MPEG File System support" depends on BLOCK + select BUFFER_HEAD select CRC_ITU_T help This is the proprietary file system used by the Rio Karma music diff --git a/fs/qnx4/Kconfig b/fs/qnx4/Kconfig index 45b5b98376c43..a2eb826e76c60 100644 --- a/fs/qnx4/Kconfig +++ b/fs/qnx4/Kconfig @@ -2,6 +2,7 @@ config QNX4FS_FS tristate "QNX4 file system support (read only)" depends on BLOCK + select BUFFER_HEAD help This is the file system used by the real-time operating systems QNX 4 and QNX 6 (the latter is also called QNX RTP). diff --git a/fs/qnx6/Kconfig b/fs/qnx6/Kconfig index 6a9d6bce15862..8e865d72204e7 100644 --- a/fs/qnx6/Kconfig +++ b/fs/qnx6/Kconfig @@ -2,6 +2,7 @@ config QNX6FS_FS tristate "QNX6 file system support (read only)" depends on BLOCK && CRC32 + select BUFFER_HEAD help This is the file system used by the real-time operating systems QNX 6 (also called QNX RTP). diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig index 4d22ecfe0fab6..0e6fe26458fed 100644 --- a/fs/reiserfs/Kconfig +++ b/fs/reiserfs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config REISERFS_FS tristate "Reiserfs support (deprecated)" + select BUFFER_HEAD select CRC32 select LEGACY_DIRECT_IO help diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig index b4e23e03fbeba..67b3f90afbfd6 100644 --- a/fs/sysv/Kconfig +++ b/fs/sysv/Kconfig @@ -2,6 +2,7 @@ config SYSV_FS tristate "System V/Xenix/V7/Coherent file system support" depends on BLOCK + select BUFFER_HEAD help SCO, Xenix and Coherent are commercial Unix systems for Intel machines, and Version 7 was used on the DEC PDP-11. Saying Y diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig index 82e8bfa2dfd98..8f7ce30d47fdc 100644 --- a/fs/udf/Kconfig +++ b/fs/udf/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config UDF_FS tristate "UDF file system support" + select BUFFER_HEAD select CRC_ITU_T select NLS select LEGACY_DIRECT_IO diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig index 6d30adb6b890f..9301e7ecd0921 100644 --- a/fs/ufs/Kconfig +++ b/fs/ufs/Kconfig @@ -2,6 +2,7 @@ config UFS_FS tristate "UFS file system support (read only)" depends on BLOCK + select BUFFER_HEAD help BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD, OpenBSD and NeXTstep) use a file system called UFS. 
Some System V diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 7002a9ff63a3d..c89ef50d5112f 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -16,8 +16,6 @@ #include #include -#ifdef CONFIG_BLOCK - enum bh_state_bits { BH_Uptodate, /* Contains valid data */ BH_Dirty, /* Is dirty */ @@ -198,7 +196,6 @@ void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset); void folio_set_bh(struct buffer_head *bh, struct folio *folio, unsigned long offset); -bool try_to_free_buffers(struct folio *); struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, bool retry); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, @@ -213,10 +210,6 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate); /* Things to do with buffers at mapping->private_list */ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode); -int inode_has_buffers(struct inode *); -void invalidate_inode_buffers(struct inode *); -int remove_inode_buffers(struct inode *inode); -int sync_mapping_buffers(struct address_space *mapping); int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end, bool datasync); int generic_buffers_fsync(struct file *file, loff_t start, loff_t end, @@ -240,9 +233,6 @@ void __bforget(struct buffer_head *); void __breadahead(struct block_device *, sector_t block, unsigned int size); struct buffer_head *__bread_gfp(struct block_device *, sector_t block, unsigned size, gfp_t gfp); -void invalidate_bh_lrus(void); -void invalidate_bh_lrus_cpu(void); -bool has_bh_in_lru(int cpu, void *dummy); struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); void free_buffer_head(struct buffer_head * bh); void unlock_buffer(struct buffer_head *bh); @@ -258,8 +248,6 @@ int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait); void __bh_read_batch(int nr, struct buffer_head *bhs[], blk_opf_t op_flags, bool force_lock); -extern int buffer_heads_over_limit; - /* * Generic address_space_operations implementations for buffer_head-backed * address_spaces. 
@@ -304,8 +292,6 @@ extern int buffer_migrate_folio_norefs(struct address_space *, #define buffer_migrate_folio_norefs NULL #endif -void buffer_init(void); - /* * inline definitions */ @@ -465,7 +451,20 @@ __bread(struct block_device *bdev, sector_t block, unsigned size) bool block_dirty_folio(struct address_space *mapping, struct folio *folio); -#else /* CONFIG_BLOCK */ +#ifdef CONFIG_BUFFER_HEAD + +void buffer_init(void); +bool try_to_free_buffers(struct folio *folio); +int inode_has_buffers(struct inode *inode); +void invalidate_inode_buffers(struct inode *inode); +int remove_inode_buffers(struct inode *inode); +int sync_mapping_buffers(struct address_space *mapping); +void invalidate_bh_lrus(void); +void invalidate_bh_lrus_cpu(void); +bool has_bh_in_lru(int cpu, void *dummy); +extern int buffer_heads_over_limit; + +#else /* CONFIG_BUFFER_HEAD */ static inline void buffer_init(void) {} static inline bool try_to_free_buffers(struct folio *folio) { return true; } @@ -473,9 +472,10 @@ static inline int inode_has_buffers(struct inode *inode) { return 0; } static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } +static inline void invalidate_bh_lrus(void) {} static inline void invalidate_bh_lrus_cpu(void) {} static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; } #define buffer_heads_over_limit 0 -#endif /* CONFIG_BLOCK */ +#endif /* CONFIG_BUFFER_HEAD */ #endif /* _LINUX_BUFFER_HEAD_H */ diff --git a/include/linux/iomap.h b/include/linux/iomap.h index e2b836c2e119a..54f50d34fd9d4 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -58,7 +58,11 @@ struct vm_fault; #define IOMAP_F_DIRTY (1U << 1) #define IOMAP_F_SHARED (1U << 2) #define IOMAP_F_MERGED (1U << 3) +#ifdef CONFIG_BUFFER_HEAD #define IOMAP_F_BUFFER_HEAD (1U << 4) +#else +#define IOMAP_F_BUFFER_HEAD 0 +#endif /* CONFIG_BUFFER_HEAD */ #define IOMAP_F_XATTR (1U << 5) /* diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 40e60c33cc6f3..0e128ad514601 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -12,6 +12,7 @@ #define RWBS_LEN 8 +#ifdef CONFIG_BUFFER_HEAD DECLARE_EVENT_CLASS(block_buffer, TP_PROTO(struct buffer_head *bh), @@ -61,6 +62,7 @@ DEFINE_EVENT(block_buffer, block_dirty_buffer, TP_ARGS(bh) ); +#endif /* CONFIG_BUFFER_HEAD */ /** * block_rq_requeue - place block IO request back on a queue diff --git a/mm/migrate.c b/mm/migrate.c index 24baad2571e31..fe6f8d454aff8 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -684,7 +684,7 @@ int migrate_folio(struct address_space *mapping, struct folio *dst, } EXPORT_SYMBOL(migrate_folio); -#ifdef CONFIG_BLOCK +#ifdef CONFIG_BUFFER_HEAD /* Returns true if all buffers are successfully locked */ static bool buffer_migrate_lock_buffers(struct buffer_head *head, enum migrate_mode mode) @@ -837,7 +837,7 @@ int buffer_migrate_folio_norefs(struct address_space *mapping, return __buffer_migrate_folio(mapping, dst, src, mode, true); } EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs); -#endif +#endif /* CONFIG_BUFFER_HEAD */ int filemap_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) From a24c8b5111a139b54fd02fec23bc21789135cd2e Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 3 Aug 2023 21:07:38 +0800 Subject: [PATCH 065/113] fs/Kconfig: Fix compile error for romfs There are some 
compile errors reported by kernel test robot: arm-linux-gnueabi-ld: fs/romfs/storage.o: in function `romfs_dev_read': storage.c:(.text+0x64): undefined reference to `__brelse' arm-linux-gnueabi-ld: storage.c:(.text+0x9c): undefined reference to `__bread_gfp' arm-linux-gnueabi-ld: fs/romfs/storage.o: in function `romfs_dev_strnlen': storage.c:(.text+0x128): undefined reference to `__brelse' arm-linux-gnueabi-ld: storage.c:(.text+0x16c): undefined reference to `__bread_gfp' arm-linux-gnueabi-ld: fs/romfs/storage.o: in function `romfs_dev_strcmp': storage.c:(.text+0x22c): undefined reference to `__bread_gfp' arm-linux-gnueabi-ld: storage.c:(.text+0x27c): undefined reference to `__brelse' arm-linux-gnueabi-ld: storage.c:(.text+0x2a8): undefined reference to `__bread_gfp' arm-linux-gnueabi-ld: storage.c:(.text+0x2bc): undefined reference to `__brelse' arm-linux-gnueabi-ld: storage.c:(.text+0x2d4): undefined reference to `__brelse' arm-linux-gnueabi-ld: storage.c:(.text+0x2f4): undefined reference to `__brelse' arm-linux-gnueabi-ld: storage.c:(.text+0x304): undefined reference to `__brelse' The reason for the problem is that commit 925c86a19bac ("fs: add CONFIG_BUFFER_HEAD") added a new config option, CONFIG_BUFFER_HEAD, that controls building the buffer_head code. romfs needs the buffer_head API, but no corresponding select has been added. Select BUFFER_HEAD in the romfs Kconfig to resolve the problem. Fixes: 925c86a19bac ("fs: add CONFIG_BUFFER_HEAD") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202308031810.pQzGmR1v-lkp@intel.com/ Reviewed-by: Luis Chamberlain Tested-by: Li Zetao Signed-off-by: Li Zetao [axboe: fold in Christoph's incremental] Signed-off-by: Jens Axboe --- fs/romfs/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig index 8eb87008b55ac..f24a96a331af1 100644 --- a/fs/romfs/Kconfig +++ b/fs/romfs/Kconfig @@ -57,6 +57,7 @@ endchoice config ROMFS_ON_BLOCK bool default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH + select BUFFER_HEAD config ROMFS_ON_MTD bool From 2eae9c4912b6cfdfadcd4fa8ac26879e18a504a1 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 4 Aug 2023 14:50:37 +0800 Subject: [PATCH 066/113] iocost_monitor: fix kernel queue kobj changes When I use iocost_monitor on nvme0n1, this error shows up: "Could not find ioc for nvme0n1" There is no kobj in struct request_queue in recent kernels: commit 2bd85221a625 ("block: untangle request_queue refcounting from sysfs") moved the queue kobj to struct gendisk. Fix it by using mq_kobj, which is at the same level as the queue kobj.
Signed-off-by: Chengming Zhou Link: https://lore.kernel.org/r/20230804065039.8885-1-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- tools/cgroup/iocost_monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cgroup/iocost_monitor.py b/tools/cgroup/iocost_monitor.py index 0dbbc67400fcb..7aa076cb559e0 100644 --- a/tools/cgroup/iocost_monitor.py +++ b/tools/cgroup/iocost_monitor.py @@ -221,7 +221,7 @@ def table_row_str(self, path): for i, ptr in radix_tree_for_each(blkcg_root.blkg_tree.address_of_()): blkg = drgn.Object(prog, 'struct blkcg_gq', address=ptr) try: - if devname == blkg.q.kobj.parent.name.string_().decode('utf-8'): + if devname == blkg.q.mq_kobj.parent.name.string_().decode('utf-8'): q_id = blkg.q.id.value_() if blkg.pd[plid]: root_iocg = container_of(blkg.pd[plid], 'struct ioc_gq', 'pd') From 8e93c1acd15e6a754c19ef12f6e69641f37e267a Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 4 Aug 2023 14:50:38 +0800 Subject: [PATCH 067/113] iocost_monitor: print vrate inuse along with base_vrate The real vrate in use by iocost is not base_vrate but the atomic vtime_rate. We need the iocost_monitor tool to display the real vrate that iocost uses, to check whether the boosted, compensated vrate is normal. Effect after change:

nvme0n1 RUN per=50.0ms cur_per=172116.580:v1040587.433 busy= +0 \
        vrate=135.00%:270.00% params=ssd_dfl(CQ)
                 ^
                 |
        this is real vrate inuse

Signed-off-by: Chengming Zhou Link: https://lore.kernel.org/r/20230804065039.8885-2-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- tools/cgroup/iocost_monitor.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/cgroup/iocost_monitor.py b/tools/cgroup/iocost_monitor.py index 7aa076cb559e0..52ae9d1595b21 100644 --- a/tools/cgroup/iocost_monitor.py +++ b/tools/cgroup/iocost_monitor.py @@ -100,6 +100,7 @@ def __init__(self, ioc): self.period_at = ioc.period_at.value_() / 1_000_000 self.vperiod_at = ioc.period_at_vtime.value_() / VTIME_PER_SEC self.vrate_pct = ioc.vtime_base_rate.value_() * 100 / VTIME_PER_USEC + self.ivrate_pct = ioc.vtime_rate.counter.value_() * 100 / VTIME_PER_USEC self.busy_level = ioc.busy_level.value_() self.autop_idx = ioc.autop_idx.value_() self.user_cost_model = ioc.user_cost_model.value_() @@ -119,7 +120,9 @@ def dict(self, now): 'period_at' : self.period_at, 'period_vtime_at' : self.vperiod_at, 'busy_level' : self.busy_level, - 'vrate_pct' : self.vrate_pct, } + 'vrate_pct' : self.vrate_pct, + 'ivrate_pct' : self.ivrate_pct, + } def table_preamble_str(self): state = ('RUN' if self.running else 'IDLE') if self.enabled else 'OFF' @@ -127,7 +130,7 @@ def table_preamble_str(self): f'per={self.period_ms}ms ' \ f'cur_per={self.period_at:.3f}:v{self.vperiod_at:.3f} ' \ f'busy={self.busy_level:+3} ' \ - f'vrate={self.vrate_pct:6.2f}% ' \ + f'vrate={self.vrate_pct:6.2f}%:{self.ivrate_pct:6.2f}% ' \ f'params={self.autop_name}' if self.user_cost_model or self.user_qos_params: output += f'({"C" if self.user_cost_model else ""}{"Q" if self.user_qos_params else ""})' From 68392b002023cb6dadd3d5044268470a7201b313 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 4 Aug 2023 14:50:39 +0800 Subject: [PATCH 068/113] iocost_monitor: improve it by adding iocg wait_ms The iocg can have three throttled metrics: wait, debt, delay. This patch adds the missing wait_ms to IocgStat to show the latest wait_ms of an iocg. While at it, group the iocg usage percents "inflt%" and "usage%" together, and group the iocg throttled metrics "wait", "debt" and "delay" together.
Effect after changes:

nvme0n1 RUN per=50.0ms cur_per=177105.713:v1053528.587 busy= +0 vrate=135.00%:270.00% params=ssd_dfl(CQ)
                   active weight    hweight%  inflt% usage%   wait    debt   delay
InterfererGroup0     *   100/ 100  54.28/ 9.09  0.34  24.07   0.00    0.00    0.00
interfered           *    84/ 1000 45.72/ 90.91 0.48  41.09   0.00    0.00    0.00

Signed-off-by: Chengming Zhou Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20230804065039.8885-3-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- tools/cgroup/iocost_monitor.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/cgroup/iocost_monitor.py b/tools/cgroup/iocost_monitor.py index 52ae9d1595b21..933c750b319b7 100644 --- a/tools/cgroup/iocost_monitor.py +++ b/tools/cgroup/iocost_monitor.py @@ -138,7 +138,7 @@ def table_preamble_str(self): def table_header_str(self): return f'{"":25} active {"weight":>9} {"hweight%":>13} {"inflt%":>6} ' \ - f'{"debt":>7} {"delay":>7} {"usage%"}' + f'{"usage%":>6} {"wait":>7} {"debt":>7} {"delay":>7}' class IocgStat: def __init__(self, iocg): @@ -164,6 +164,8 @@ def __init__(self, iocg): self.usage = (100 * iocg.usage_delta_us.value_() / ioc.period_us.value_()) if self.active else 0 + self.wait_ms = (iocg.stat.wait_us.value_() - + iocg.last_stat.wait_us.value_()) / 1000 self.debt_ms = iocg.abs_vdebt.value_() / VTIME_PER_USEC / 1000 if blkg.use_delay.counter.value_() != 0: self.delay_ms = blkg.delay_nsec.counter.value_() / 1_000_000 @@ -180,9 +182,10 @@ def dict(self, now, path): 'hweight_active_pct' : self.hwa_pct, 'hweight_inuse_pct' : self.hwi_pct, 'inflight_pct' : self.inflight_pct, + 'usage_pct' : self.usage, + 'wait_ms' : self.wait_ms, 'debt_ms' : self.debt_ms, 'delay_ms' : self.delay_ms, - 'usage_pct' : self.usage, 'address' : self.address } return out @@ -192,9 +195,10 @@ def table_row_str(self, path): f'{round(self.inuse):5}/{round(self.active):5} ' \ f'{self.hwi_pct:6.2f}/{self.hwa_pct:6.2f} ' \ f'{self.inflight_pct:6.2f} ' \ + f'{min(self.usage, 999):6.2f} ' \ + f'{self.wait_ms:7.2f} ' \ f'{self.debt_ms:7.2f} ' \ - f'{self.delay_ms:7.2f} '\ - f'{min(self.usage, 999):6.2f}' + f'{self.delay_ms:7.2f}' out = out.rstrip(':') return out From 9d4ed6d46272bb60036a6539aae74eafcbbb3d2d Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Fri, 4 Aug 2023 13:46:08 +0200 Subject: [PATCH 069/113] ublk: add helper to check if device supports user copy This will be used by ublk zoned storage support.
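As an aside for readers following along, the helper pattern this patch introduces can be sketched in a few lines of self-contained userspace C: wrap the flags test in a named inline helper so call sites read as intent rather than bit arithmetic. The UBLK_F_USER_COPY value matches the UAPI excerpt quoted later in this series; the UBLK_F_NEED_GET_DATA bit position and all demo_* names are assumptions made for this illustration, not kernel definitions.

/*
 * Minimal sketch of a feature-flag helper (userspace, illustrative only).
 * UBLK_F_NEED_GET_DATA's bit is assumed here for the demo.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define UBLK_F_NEED_GET_DATA (1UL << 2) /* assumed bit, demo only */
#define UBLK_F_USER_COPY     (1UL << 7) /* matches the UAPI excerpt below */

struct demo_dev_info { uint64_t flags; };
struct demo_dev { struct demo_dev_info dev_info; };

static inline bool demo_dev_is_user_copy(const struct demo_dev *ub)
{
	return ub->dev_info.flags & UBLK_F_USER_COPY;
}

int main(void)
{
	struct demo_dev ub = { .dev_info.flags =
		UBLK_F_USER_COPY | UBLK_F_NEED_GET_DATA };

	/* GET_DATA isn't needed any more with USER_COPY */
	if (demo_dev_is_user_copy(&ub))
		ub.dev_info.flags &= ~UBLK_F_NEED_GET_DATA;

	printf("flags after setup: %#llx\n",
	       (unsigned long long)ub.dev_info.flags);
	return 0;
}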
Signed-off-by: Andreas Hindborg Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230804114610.179530-2-nmi@metaspace.dk Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 1c823750c95af..8d271901efac6 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -185,6 +185,11 @@ struct ublk_params_header { __u32 types; }; +static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_USER_COPY; +} + static inline void __ublk_complete_rq(struct request *req); static void ublk_complete_rq(struct kref *ref); @@ -2037,7 +2042,7 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) UBLK_F_URING_CMD_COMP_IN_TASK; /* GET_DATA isn't needed any more with USER_COPY */ - if (ub->dev_info.flags & UBLK_F_USER_COPY) + if (ublk_dev_is_user_copy(ub)) ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; /* We are not ready to support zero copy */ From 1a6e88b9593b63ccdfe1d84e3f99dd91e4f8d490 Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Fri, 4 Aug 2023 13:46:09 +0200 Subject: [PATCH 070/113] ublk: move check for empty address field on command submission In preparation for zoned storage support, move the check for an empty `addr` field into the command handler case statement. Note that the check makes no sense for `UBLK_IO_NEED_GET_DATA` because the `addr` field must always be set for this command. Signed-off-by: Andreas Hindborg Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20230804114610.179530-3-nmi@metaspace.dk Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 8d271901efac6..0e38475796106 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1419,11 +1419,6 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA)) goto out; - if (ublk_support_user_copy(ubq) && ub_cmd->addr) { - ret = -EINVAL; - goto out; - } - ret = ublk_check_cmd_op(cmd_op); if (ret) goto out; @@ -1450,6 +1445,10 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, */ if (!ub_cmd->addr && !ublk_need_get_data(ubq)) goto out; + } else if (ub_cmd->addr) { + /* User copy requires addr to be unset */ + ret = -EINVAL; + goto out; } ublk_fill_io_cmd(io, cmd, ub_cmd->addr); @@ -1469,7 +1468,12 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, if (!ub_cmd->addr && (!ublk_need_get_data(ubq) || req_op(req) == REQ_OP_READ)) goto out; + } else if (ub_cmd->addr) { + /* User copy requires addr to be unset */ + ret = -EINVAL; + goto out; } + ublk_fill_io_cmd(io, cmd, ub_cmd->addr); ublk_commit_completion(ub, ub_cmd); break; From 29802d7ca33bc0a75c9da2a143eeed4f9e99fca4 Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Fri, 4 Aug 2023 13:46:10 +0200 Subject: [PATCH 071/113] ublk: enable zoned storage support Add zoned storage support to ublk: report_zones and the following operations:
- REQ_OP_ZONE_OPEN
- REQ_OP_ZONE_CLOSE
- REQ_OP_ZONE_FINISH
- REQ_OP_ZONE_RESET
- REQ_OP_ZONE_APPEND
The zone append feature uses the `addr` field of `struct ublksrv_io_cmd` to communicate the allocated LBA (ALBA) back to the kernel. Therefore ublk must be used with the user copy feature (UBLK_F_USER_COPY) for zoned storage support to be available. Without this feature, ublk will not allow zoned storage support.
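Before the implementation below, here is a minimal sketch of how that reused `addr` union looks from the ublk server's side when completing a zone append. The struct is a trimmed, hypothetical stand-in for the real `struct ublksrv_io_cmd` quoted in this patch's UAPI diff; field widths and the sample tag/result values are simplified for the demo.

/*
 * Illustrative-only model of the addr/zone_append_lba union
 * (userspace demo, not the real UAPI type).
 */
#include <stdint.h>
#include <stdio.h>

struct demo_io_cmd {
	uint16_t q_id;
	uint16_t tag;
	int32_t  result;              /* bytes transferred, or -errno */
	union {
		uint64_t addr;            /* FETCH*: daemon buffer address */
		uint64_t zone_append_lba; /* COMMIT of a zone append */
	};
};

int main(void)
{
	/* A ublk server completing a zone append written at sector 4096: */
	struct demo_io_cmd cmd = {
		.q_id = 0,
		.tag = 7,
		.result = 4096 * 512,     /* whole request written */
		.zone_append_lba = 4096,
	};

	printf("tag %u: appended at LBA %llu\n", (unsigned)cmd.tag,
	       (unsigned long long)cmd.zone_append_lba);
	return 0;
}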
Signed-off-by: Andreas Hindborg Reviewed-by: Ming Lei Tested-by: Ming Lei Link: https://lore.kernel.org/r/20230804114610.179530-4-nmi@metaspace.dk Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 331 ++++++++++++++++++++++++++++++++-- include/uapi/linux/ublk_cmd.h | 63 ++++++- 2 files changed, 370 insertions(+), 24 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 0e38475796106..b60394fe7be69 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -56,16 +56,21 @@ | UBLK_F_USER_RECOVERY_REISSUE \ | UBLK_F_UNPRIVILEGED_DEV \ | UBLK_F_CMD_IOCTL_ENCODE \ - | UBLK_F_USER_COPY) + | UBLK_F_USER_COPY \ + | UBLK_F_ZONED) /* All UBLK_PARAM_TYPE_* should be included here */ -#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \ - UBLK_PARAM_TYPE_DISCARD | UBLK_PARAM_TYPE_DEVT) +#define UBLK_PARAM_TYPE_ALL \ + (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \ + UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED) struct ublk_rq_data { struct llist_node node; struct kref ref; + __u64 sector; + __u32 operation; + __u32 nr_zones; }; struct ublk_uring_cmd_pdu { @@ -185,11 +190,263 @@ struct ublk_params_header { __u32 types; }; +static inline unsigned int ublk_req_build_flags(struct request *req); +static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, + int tag); + static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub) { return ub->dev_info.flags & UBLK_F_USER_COPY; } +static inline bool ublk_dev_is_zoned(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_ZONED; +} + +static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_ZONED; +} + +#ifdef CONFIG_BLK_DEV_ZONED + +static int ublk_get_nr_zones(const struct ublk_device *ub) +{ + const struct ublk_param_basic *p = &ub->params.basic; + + /* Zone size is a power of 2 */ + return p->dev_sectors >> ilog2(p->chunk_sectors); +} + +static int ublk_revalidate_disk_zones(struct ublk_device *ub) +{ + return blk_revalidate_disk_zones(ub->ub_disk, NULL); +} + +static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) +{ + const struct ublk_param_zoned *p = &ub->params.zoned; + int nr_zones; + + if (!ublk_dev_is_zoned(ub)) + return -EINVAL; + + if (!p->max_zone_append_sectors) + return -EINVAL; + + nr_zones = ublk_get_nr_zones(ub); + + if (p->max_active_zones > nr_zones) + return -EINVAL; + + if (p->max_open_zones > nr_zones) + return -EINVAL; + + return 0; +} + +static int ublk_dev_param_zoned_apply(struct ublk_device *ub) +{ + const struct ublk_param_zoned *p = &ub->params.zoned; + + disk_set_zoned(ub->ub_disk, BLK_ZONED_HM); + blk_queue_required_elevator_features(ub->ub_disk->queue, + ELEVATOR_F_ZBD_SEQ_WRITE); + disk_set_max_active_zones(ub->ub_disk, p->max_active_zones); + disk_set_max_open_zones(ub->ub_disk, p->max_open_zones); + blk_queue_max_zone_append_sectors(ub->ub_disk->queue, p->max_zone_append_sectors); + + ub->ub_disk->nr_zones = ublk_get_nr_zones(ub); + + return 0; +} + +/* Based on virtblk_alloc_report_buffer */ +static void *ublk_alloc_report_buffer(struct ublk_device *ublk, + unsigned int nr_zones, size_t *buflen) +{ + struct request_queue *q = ublk->ub_disk->queue; + size_t bufsize; + void *buf; + + nr_zones = min_t(unsigned int, nr_zones, + ublk->ub_disk->nr_zones); + + bufsize = nr_zones * sizeof(struct blk_zone); + bufsize = + min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT); + + while (bufsize >= sizeof(struct blk_zone)) { + buf = kvmalloc(bufsize, GFP_KERNEL | 
__GFP_NORETRY); + if (buf) { + *buflen = bufsize; + return buf; + } + bufsize >>= 1; + } + + *buflen = 0; + return NULL; +} + +static int ublk_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct ublk_device *ub = disk->private_data; + unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors; + unsigned int first_zone = sector >> ilog2(zone_size_sectors); + unsigned int done_zones = 0; + unsigned int max_zones_per_request; + int ret; + struct blk_zone *buffer; + size_t buffer_length; + + nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone, + nr_zones); + + buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length); + if (!buffer) + return -ENOMEM; + + max_zones_per_request = buffer_length / sizeof(struct blk_zone); + + while (done_zones < nr_zones) { + unsigned int remaining_zones = nr_zones - done_zones; + unsigned int zones_in_request = + min_t(unsigned int, remaining_zones, max_zones_per_request); + struct request *req; + struct ublk_rq_data *pdu; + blk_status_t status; + + memset(buffer, 0, buffer_length); + + req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } + + pdu = blk_mq_rq_to_pdu(req); + pdu->operation = UBLK_IO_OP_REPORT_ZONES; + pdu->sector = sector; + pdu->nr_zones = zones_in_request; + + ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length, + GFP_KERNEL); + if (ret) { + blk_mq_free_request(req); + goto out; + } + + status = blk_execute_rq(req, 0); + ret = blk_status_to_errno(status); + blk_mq_free_request(req); + if (ret) + goto out; + + for (unsigned int i = 0; i < zones_in_request; i++) { + struct blk_zone *zone = buffer + i; + + /* A zero length zone means no more zones in this response */ + if (!zone->len) + break; + + ret = cb(zone, i, data); + if (ret) + goto out; + + done_zones++; + sector += zone_size_sectors; + + } + } + + ret = done_zones; + +out: + kvfree(buffer); + return ret; +} + +static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, + struct request *req) +{ + struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); + struct ublk_io *io = &ubq->ios[req->tag]; + struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req); + u32 ublk_op; + + switch (req_op(req)) { + case REQ_OP_ZONE_OPEN: + ublk_op = UBLK_IO_OP_ZONE_OPEN; + break; + case REQ_OP_ZONE_CLOSE: + ublk_op = UBLK_IO_OP_ZONE_CLOSE; + break; + case REQ_OP_ZONE_FINISH: + ublk_op = UBLK_IO_OP_ZONE_FINISH; + break; + case REQ_OP_ZONE_RESET: + ublk_op = UBLK_IO_OP_ZONE_RESET; + break; + case REQ_OP_ZONE_APPEND: + ublk_op = UBLK_IO_OP_ZONE_APPEND; + break; + case REQ_OP_DRV_IN: + ublk_op = pdu->operation; + switch (ublk_op) { + case UBLK_IO_OP_REPORT_ZONES: + iod->op_flags = ublk_op | ublk_req_build_flags(req); + iod->nr_zones = pdu->nr_zones; + iod->start_sector = pdu->sector; + return BLK_STS_OK; + default: + return BLK_STS_IOERR; + } + case REQ_OP_ZONE_RESET_ALL: + case REQ_OP_DRV_OUT: + /* We do not support reset_all and drv_out */ + return BLK_STS_NOTSUPP; + default: + return BLK_STS_IOERR; + } + + iod->op_flags = ublk_op | ublk_req_build_flags(req); + iod->nr_sectors = blk_rq_sectors(req); + iod->start_sector = blk_rq_pos(req); + iod->addr = io->addr; + + return BLK_STS_OK; +} + +#else + +#define ublk_report_zones (NULL) + +static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) +{ + return -EOPNOTSUPP; +} + +static int ublk_dev_param_zoned_apply(struct ublk_device *ub) +{ + return -EOPNOTSUPP; +} + +static int 
ublk_revalidate_disk_zones(struct ublk_device *ub) +{ + return 0; +} + +static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, + struct request *req) +{ + return -EOPNOTSUPP; +} + +#endif + static inline void __ublk_complete_rq(struct request *req); static void ublk_complete_rq(struct kref *ref); @@ -286,6 +543,9 @@ static int ublk_validate_params(const struct ublk_device *ub) if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9)) return -EINVAL; + + if (ublk_dev_is_zoned(ub) && !p->chunk_sectors) + return -EINVAL; } else return -EINVAL; @@ -304,6 +564,11 @@ static int ublk_validate_params(const struct ublk_device *ub) if (ub->params.types & UBLK_PARAM_TYPE_DEVT) return -EINVAL; + if (ub->params.types & UBLK_PARAM_TYPE_ZONED) + return ublk_dev_param_zoned_validate(ub); + else if (ublk_dev_is_zoned(ub)) + return -EINVAL; + return 0; } @@ -317,6 +582,9 @@ static int ublk_apply_params(struct ublk_device *ub) if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) ublk_dev_param_discard_apply(ub); + if (ub->params.types & UBLK_PARAM_TYPE_ZONED) + return ublk_dev_param_zoned_apply(ub); + return 0; } @@ -487,6 +755,7 @@ static const struct block_device_operations ub_fops = { .owner = THIS_MODULE, .open = ublk_open, .free_disk = ublk_free_disk, + .report_zones = ublk_report_zones, }; #define UBLK_MAX_PIN_PAGES 32 @@ -601,7 +870,8 @@ static inline bool ublk_need_map_req(const struct request *req) static inline bool ublk_need_unmap_req(const struct request *req) { - return ublk_rq_has_data(req) && req_op(req) == REQ_OP_READ; + return ublk_rq_has_data(req) && + (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN); } static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, @@ -685,8 +955,13 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) { struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); struct ublk_io *io = &ubq->ios[req->tag]; + enum req_op op = req_op(req); u32 ublk_op; + if (!ublk_queue_is_zoned(ubq) && + (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND)) + return -EIO; + switch (req_op(req)) { case REQ_OP_READ: ublk_op = UBLK_IO_OP_READ; @@ -704,6 +979,8 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) ublk_op = UBLK_IO_OP_WRITE_ZEROES; break; default: + if (ublk_queue_is_zoned(ubq)) + return ublk_setup_iod_zoned(ubq, req); return BLK_STS_IOERR; } @@ -756,7 +1033,8 @@ static inline void __ublk_complete_rq(struct request *req) * * Both the two needn't unmap. 
*/ - if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) + if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE && + req_op(req) != REQ_OP_DRV_IN) goto exit; /* for READ request, writing data in iod->addr to rq buffers */ @@ -1120,6 +1398,9 @@ static void ublk_commit_completion(struct ublk_device *ub, /* find the io request and complete */ req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag); + if (req_op(req) == REQ_OP_ZONE_APPEND) + req->__sector = ub_cmd->zone_append_lba; + if (req && likely(!blk_should_fake_timeout(req->q))) ublk_put_req_ref(ubq, req); } @@ -1468,8 +1749,11 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, if (!ub_cmd->addr && (!ublk_need_get_data(ubq) || req_op(req) == REQ_OP_READ)) goto out; - } else if (ub_cmd->addr) { - /* User copy requires addr to be unset */ + } else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) { + /* + * User copy requires addr to be unset when command is + * not zone append + */ ret = -EINVAL; goto out; } @@ -1546,11 +1830,14 @@ static inline bool ublk_check_ubuf_dir(const struct request *req, int ubuf_dir) { /* copy ubuf to request pages */ - if (req_op(req) == REQ_OP_READ && ubuf_dir == ITER_SOURCE) + if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) && + ubuf_dir == ITER_SOURCE) return true; /* copy request pages to ubuf */ - if (req_op(req) == REQ_OP_WRITE && ubuf_dir == ITER_DEST) + if ((req_op(req) == REQ_OP_WRITE || + req_op(req) == REQ_OP_ZONE_APPEND) && + ubuf_dir == ITER_DEST) return true; return false; @@ -1889,17 +2176,24 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) get_device(&ub->cdev_dev); ub->dev_info.state = UBLK_S_DEV_LIVE; + + if (ublk_dev_is_zoned(ub)) { + ret = ublk_revalidate_disk_zones(ub); + if (ret) + goto out_put_cdev; + } + ret = add_disk(disk); + if (ret) + goto out_put_cdev; + + set_bit(UB_STATE_USED, &ub->state); + +out_put_cdev: if (ret) { - /* - * Has to drop the reference since ->free_disk won't be - * called in case of add_disk failure. - */ ub->dev_info.state = UBLK_S_DEV_DEAD; ublk_put_device(ub); - goto out_put_disk; } - set_bit(UB_STATE_USED, &ub->state); out_put_disk: if (ret) put_disk(disk); @@ -2049,6 +2343,13 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) if (ublk_dev_is_user_copy(ub)) ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; + /* Zoned storage support requires user copy feature */ + if (ublk_dev_is_zoned(ub) && + (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !ublk_dev_is_user_copy(ub))) { + ret = -EINVAL; + goto out_free_dev_number; + } + /* We are not ready to support zero copy */ ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 4b8558db90e1e..2685e53e47521 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -176,6 +176,12 @@ /* Copy between request and user buffer by pread()/pwrite() */ #define UBLK_F_USER_COPY (1UL << 7) +/* + * User space sets this flag when setting up the device to request zoned storage support. Kernel may + * deny the request by returning an error. 
+ */ +#define UBLK_F_ZONED (1ULL << 8) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 @@ -232,9 +238,26 @@ struct ublksrv_ctrl_dev_info { #define UBLK_IO_OP_READ 0 #define UBLK_IO_OP_WRITE 1 #define UBLK_IO_OP_FLUSH 2 -#define UBLK_IO_OP_DISCARD 3 -#define UBLK_IO_OP_WRITE_SAME 4 -#define UBLK_IO_OP_WRITE_ZEROES 5 +#define UBLK_IO_OP_DISCARD 3 +#define UBLK_IO_OP_WRITE_SAME 4 +#define UBLK_IO_OP_WRITE_ZEROES 5 +#define UBLK_IO_OP_ZONE_OPEN 10 +#define UBLK_IO_OP_ZONE_CLOSE 11 +#define UBLK_IO_OP_ZONE_FINISH 12 +#define UBLK_IO_OP_ZONE_APPEND 13 +#define UBLK_IO_OP_ZONE_RESET 15 +/* + * Construct a zone report. The report request is carried in `struct + * ublksrv_io_desc`. The `start_sector` field must be the first sector of a zone + * and shall indicate the first zone of the report. The `nr_zones` shall + * indicate how many zones should be reported at most. The report shall be + * delivered as a `struct blk_zone` array. To report fewer zones than requested, + * zero the last entry of the returned array. + * + * Related definitions(blk_zone, blk_zone_cond, blk_zone_type, ...) in + * include/uapi/linux/blkzoned.h are part of ublk UAPI. + */ +#define UBLK_IO_OP_REPORT_ZONES 18 #define UBLK_IO_F_FAILFAST_DEV (1U << 8) #define UBLK_IO_F_FAILFAST_TRANSPORT (1U << 9) @@ -255,7 +278,10 @@ struct ublksrv_io_desc { /* op: bit 0-7, flags: bit 8-31 */ __u32 op_flags; - __u32 nr_sectors; + union { + __u32 nr_sectors; + __u32 nr_zones; /* for UBLK_IO_OP_REPORT_ZONES */ + }; /* start sector for this io */ __u64 start_sector; @@ -284,11 +310,21 @@ struct ublksrv_io_cmd { /* io result, it is valid for COMMIT* command only */ __s32 result; - /* - * userspace buffer address in ublksrv daemon process, valid for - * FETCH* command only - */ - __u64 addr; + union { + /* + * userspace buffer address in ublksrv daemon process, valid for + * FETCH* command only + * + * `addr` should not be used when UBLK_F_USER_COPY is enabled, + * because userspace handles data copy by pread()/pwrite() over + * /dev/ublkcN. 
But in case of UBLK_F_ZONED, this union is + * re-used to pass back the allocated LBA for + * UBLK_IO_OP_ZONE_APPEND which actually depends on + * UBLK_F_USER_COPY + */ + __u64 addr; + __u64 zone_append_lba; + }; }; struct ublk_param_basic { @@ -331,6 +367,13 @@ struct ublk_param_devt { __u32 disk_minor; }; +struct ublk_param_zoned { + __u32 max_open_zones; + __u32 max_active_zones; + __u32 max_zone_append_sectors; + __u8 reserved[20]; +}; + struct ublk_params { /* * Total length of parameters, userspace has to set 'len' for both @@ -342,11 +385,13 @@ struct ublk_params { #define UBLK_PARAM_TYPE_BASIC (1 << 0) #define UBLK_PARAM_TYPE_DISCARD (1 << 1) #define UBLK_PARAM_TYPE_DEVT (1 << 2) +#define UBLK_PARAM_TYPE_ZONED (1 << 3) __u32 types; /* types of parameter included */ struct ublk_param_basic basic; struct ublk_param_discard discard; struct ublk_param_devt devt; + struct ublk_param_zoned zoned; }; #endif From d47f9717e5cfd0dd8c0ba2ecfa47c38d140f1bb6 Mon Sep 17 00:00:00 2001 From: Zhiguo Niu Date: Thu, 3 Aug 2023 19:12:42 +0800 Subject: [PATCH 072/113] block/mq-deadline: use correct way to throttle write requests The original formula was inaccurate: dd->async_depth = max(1UL, 3 * q->nr_requests / 4); For write requests, when we assign a tag from sched_tags, data->shallow_depth will be passed to sbitmap_find_bit, see the following code: nr = sbitmap_find_bit_in_word(&sb->map[index], min_t(unsigned int, __map_depth(sb, index), depth), alloc_hint, wrap); The smaller of data->shallow_depth and __map_depth(sb, index) will be used as the maximum range when allocating bits. For an mmc device (one hw queue, deadline I/O scheduler): q->nr_requests = sched_tags = 128, so according to the previous calculation method, dd->async_depth = data->shallow_depth = 96. On a 64-bit platform with 8 CPUs, sched_tags.bitmap_tags.sb.shift = 5 and sb.maps[] = 32/32/32/32; since 32 is smaller than 96, whether it is a read or a write I/O, tags can be allocated over the full word each time, which has no throttling effect. In addition, referring to the methods of the bfq/kyber I/O schedulers, limit ratios are calculated based on sched_tags.bitmap_tags.sb.shift: with shift = 5 the new formula gives dd->async_depth = 3 * (1 << 5) / 4 = 24, which is below the 32-bit word depth and therefore actually limits allocation. This patch makes write-request throttling really take effect. Fixes: 07757588e507 ("block/mq-deadline: Reserve 25% of scheduler tags for synchronous requests") Signed-off-by: Zhiguo Niu Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/1691061162-22898-1-git-send-email-zhiguo.niu@unisoc.com Signed-off-by: Jens Axboe --- block/mq-deadline.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 02a916ba62ee7..f958e79277b8b 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -646,8 +646,9 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; + unsigned int shift = tags->bitmap_tags.sb.shift; - dd->async_depth = max(1UL, 3 * q->nr_requests / 4); + dd->async_depth = max(1U, 3 * (1U << shift) / 4); sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth); } From 7c8998f75d2d42ddefb172239b0f689392958309 Mon Sep 17 00:00:00 2001 From: Jinyoung Choi Date: Thu, 3 Aug 2023 11:48:27 +0900 Subject: [PATCH 073/113] block: make bvec_try_merge_hw_page() non-static This will be used for multi-page configuration for integrity payload. Cc: Christoph Hellwig Cc: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Jinyoung Choi Tested-by: "Martin K.
Petersen" Reviewed-by: "Martin K. Petersen" Link: https://lore.kernel.org/r/20230803024827epcms2p838d9e9131492c86a159fff25d195658f@epcms2p8 Signed-off-by: Jens Axboe --- block/bio.c | 2 +- block/blk.h | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index c92dda962449b..8d1533af7c609 100644 --- a/block/bio.c +++ b/block/bio.c @@ -934,7 +934,7 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, * size limit. This is not for normal read/write bios, but for passthrough * or Zone Append operations that we can't split. */ -static bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, +bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, struct page *page, unsigned len, unsigned offset, bool *same_page) { diff --git a/block/blk.h b/block/blk.h index 686712e138352..9d22ec3a53bcf 100644 --- a/block/blk.h +++ b/block/blk.h @@ -75,6 +75,10 @@ struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, gfp_t gfp_mask); void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs); +bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, + struct page *page, unsigned len, unsigned offset, + bool *same_page); + static inline bool biovec_phys_mergeable(struct request_queue *q, struct bio_vec *vec1, struct bio_vec *vec2) { From 80814b8e359f7207595f52702aea432a7bd61200 Mon Sep 17 00:00:00 2001 From: Jinyoung Choi Date: Thu, 3 Aug 2023 11:49:56 +0900 Subject: [PATCH 074/113] bio-integrity: update the payload size in bio_integrity_add_page() Previously, the bip's bi_size has been set before an integrity pages were added. If a problem occurs in the process of adding pages for bip, the bi_size mismatch problem must be dealt with. When the page is successfully added to bvec, the bi_size is updated. The parts affected by the change were also contained in this commit. Cc: Christoph Hellwig Cc: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Jinyoung Choi Tested-by: "Martin K. Petersen" Reviewed-by: "Martin K. 
Petersen" Link: https://lore.kernel.org/r/20230803024956epcms2p38186a17392706650c582d38ef3dbcd32@epcms2p3 Signed-off-by: Jens Axboe --- block/bio-integrity.c | 2 +- drivers/md/dm-crypt.c | 1 - drivers/nvme/host/ioctl.c | 1 - drivers/nvme/target/io-cmd-bdev.c | 3 +-- drivers/target/target_core_iblock.c | 3 +-- 5 files changed, 3 insertions(+), 7 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 045553a164e0c..6220a99977a42 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -137,6 +137,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, bvec_set_page(&bip->bip_vec[bip->bip_vcnt], page, len, offset); bip->bip_vcnt++; + bip->bip_iter.bi_size += len; return len; } @@ -244,7 +245,6 @@ bool bio_integrity_prep(struct bio *bio) } bip->bip_flags |= BIP_BLOCK_INTEGRITY; - bip->bip_iter.bi_size = len; bip_set_seed(bip, bio->bi_iter.bi_sector); if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 1dc6227d353ec..f2662c21a6dfe 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1160,7 +1160,6 @@ static int dm_crypt_integrity_io_alloc(struct dm_crypt_io *io, struct bio *bio) tag_len = io->cc->on_disk_tag_size * (bio_sectors(bio) >> io->cc->sector_shift); - bip->bip_iter.bi_size = tag_len; bip->bip_iter.bi_sector = io->cc->start + io->sector; ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata), diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 5c3250f36ce77..19a5177bc3604 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -118,7 +118,6 @@ static void *nvme_add_user_metadata(struct request *req, void __user *ubuf, goto out_free_meta; } - bip->bip_iter.bi_size = len; bip->bip_iter.bi_sector = seed; ret = bio_integrity_add_page(bio, virt_to_page(buf), len, offset_in_page(buf)); diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 2733e01585854..468833675cc94 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -206,12 +206,11 @@ static int nvmet_bdev_alloc_bip(struct nvmet_req *req, struct bio *bio, return PTR_ERR(bip); } - bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio)); /* virtual start sector must be in integrity interval units */ bip_set_seed(bip, bio->bi_iter.bi_sector >> (bi->interval_exp - SECTOR_SHIFT)); - resid = bip->bip_iter.bi_size; + resid = bio_integrity_bytes(bi, bio_sectors(bio)); while (resid > 0 && sg_miter_next(miter)) { len = min_t(size_t, miter->length, resid); rc = bio_integrity_add_page(bio, miter->page, len, diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 3d1b511ea284b..a7050f63b7cc1 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -689,7 +689,6 @@ iblock_alloc_bip(struct se_cmd *cmd, struct bio *bio, return PTR_ERR(bip); } - bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio)); /* virtual start sector must be in integrity interval units */ bip_set_seed(bip, bio->bi_iter.bi_sector >> (bi->interval_exp - SECTOR_SHIFT)); @@ -697,7 +696,7 @@ iblock_alloc_bip(struct se_cmd *cmd, struct bio *bio, pr_debug("IBLOCK BIP Size: %u Sector: %llu\n", bip->bip_iter.bi_size, (unsigned long long)bip->bip_iter.bi_sector); - resid = bip->bip_iter.bi_size; + resid = bio_integrity_bytes(bi, bio_sectors(bio)); while (resid > 0 && sg_miter_next(miter)) { len = min_t(size_t, miter->length, resid); From 
d1f04c2e23c99258049c6081c3147bae69e5bcb8 Mon Sep 17 00:00:00 2001 From: Jinyoung Choi Date: Thu, 3 Aug 2023 11:50:58 +0900 Subject: [PATCH 075/113] bio-integrity: cleanup adding integrity pages to bip's bvec. bio_integrity_add_page() returns the added length on success, else 0, just like bio_add_page(). Simplify the return value check in bio_integrity_prep() so it no longer deals with a "> 0 but < len" case that cannot happen. Cc: Christoph Hellwig Cc: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Jinyoung Choi Tested-by: "Martin K. Petersen" Reviewed-by: "Martin K. Petersen" Link: https://lore.kernel.org/r/20230803025058epcms2p5a4d0db5da2ad967668932d463661c633@epcms2p5 Signed-off-by: Jens Axboe --- block/bio-integrity.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 6220a99977a42..c6b3bc86e1f9e 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -252,27 +252,18 @@ bool bio_integrity_prep(struct bio *bio) /* Map it */ offset = offset_in_page(buf); - for (i = 0 ; i < nr_pages ; i++) { - int ret; + for (i = 0; i < nr_pages && len > 0; i++) { bytes = PAGE_SIZE - offset; - if (len <= 0) - break; - if (bytes > len) bytes = len; - ret = bio_integrity_add_page(bio, virt_to_page(buf), - bytes, offset); - - if (ret == 0) { + if (bio_integrity_add_page(bio, virt_to_page(buf), + bytes, offset) < bytes) { printk(KERN_ERR "could not attach integrity payload\n"); goto err_end_io; } - if (ret < bytes) - break; - buf += bytes; len -= bytes; offset = 0; @@ -291,7 +282,6 @@ bool bio_integrity_prep(struct bio *bio) bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); return false; - } EXPORT_SYMBOL(bio_integrity_prep); From 0ece1d649b6dd615925a72bc1824d6b9fa5b998a Mon Sep 17 00:00:00 2001 From: Jinyoung Choi Date: Thu, 3 Aug 2023 11:52:02 +0900 Subject: [PATCH 076/113] bio-integrity: create multi-page bvecs in bio_integrity_add_page() In general, one bvec describes a run of physically contiguous pages. In the bvec configuration for the bip, however, each physically contiguous run of integrity pages was split into single-page bvecs. Allow bio_integrity_add_page() to create multi-page bvecs, just like the bio payloads. This simplifies adding larger payloads, and fixes support for non-tiny workloads with nvme, which stopped using scatterlist for metadata a while ago. Cc: Christoph Hellwig Cc: Martin K. Petersen Fixes: 783b94bd9250 ("nvme-pci: do not build a scatterlist to map metadata") Reviewed-by: Christoph Hellwig Signed-off-by: Jinyoung Choi Tested-by: "Martin K.
Petersen" Link: https://lore.kernel.org/r/20230803025202epcms2p82f57cbfe32195da38c776377b55aed59@epcms2p8 Signed-off-by: Jens Axboe --- block/bio-integrity.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index c6b3bc86e1f9e..ec8ac8cf6e1b9 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -123,17 +123,34 @@ void bio_integrity_free(struct bio *bio) int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { + struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct bio_integrity_payload *bip = bio_integrity(bio); - if (bip->bip_vcnt >= bip->bip_max_vcnt) { - printk(KERN_ERR "%s: bip_vec full\n", __func__); + if (((bip->bip_iter.bi_size + len) >> SECTOR_SHIFT) > + queue_max_hw_sectors(q)) return 0; - } - if (bip->bip_vcnt && - bvec_gap_to_prev(&bdev_get_queue(bio->bi_bdev)->limits, - &bip->bip_vec[bip->bip_vcnt - 1], offset)) - return 0; + if (bip->bip_vcnt > 0) { + struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1]; + bool same_page = false; + + if (bvec_try_merge_hw_page(q, bv, page, len, offset, + &same_page)) { + bip->bip_iter.bi_size += len; + return len; + } + + if (bip->bip_vcnt >= + min(bip->bip_max_vcnt, queue_max_integrity_segments(q))) + return 0; + + /* + * If the queue doesn't support SG gaps and adding this segment + * would create a gap, disallow it. + */ + if (bvec_gap_to_prev(&q->limits, bv, offset)) + return 0; + } bvec_set_page(&bip->bip_vec[bip->bip_vcnt], page, len, offset); bip->bip_vcnt++; From c8659bbb15cd42577a9b16a23b527436b028c8b2 Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 10 Aug 2023 16:48:36 +0800 Subject: [PATCH 077/113] ublk: Fix signedness bug returning warning There are two warnings reported by smatch: drivers/block/ublk_drv.c:445 ublk_setup_iod_zoned() warn: signedness bug returning '(-95)' drivers/block/ublk_drv.c:963 ublk_setup_iod() warn: signedness bug returning '(-5)' The type of "blk_status_t" is either be a u32 or u8, but this two functions return a negative value when not supported or failed. Use the error code of the blk module to fix these warnings. 
Fixes: 29802d7ca33b ("ublk: enable zoned storage support") Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202308100201.TCRhgdvN-lkp@intel.com/ Signed-off-by: Li Zetao Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20230810084836.3535322-1-lizetao1@huawei.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index b60394fe7be69..109a5b17537d2 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -442,7 +442,7 @@ static int ublk_revalidate_disk_zones(struct ublk_device *ub) static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, struct request *req) { - return -EOPNOTSUPP; + return BLK_STS_NOTSUPP; } #endif @@ -960,7 +960,7 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) if (!ublk_queue_is_zoned(ubq) && (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND)) - return -EIO; + return BLK_STS_IOERR; switch (req_op(req)) { case REQ_OP_READ: From 4eb44d10766ac0fae5973998fd2a0103df1d3fe1 Mon Sep 17 00:00:00 2001 From: Li Lingfeng Date: Thu, 10 Aug 2023 11:51:11 +0800 Subject: [PATCH 078/113] block: remove init_mutex and open-code blk_iolatency_try_init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit a13696b83da4 ("blk-iolatency: Make initialization lazy") added a mutex named "init_mutex" in blk_iolatency_try_init to guard against racing initialization of RQ_QOS_LATENCY. Now a new lock has been added to struct request_queue by commit a13bd91be223 ("block/rq_qos: protect rq_qos apis with a new lock"), and it is already held in blkg_conf_open_bdev before calling blk_iolatency_init. So it's no longer necessary to keep init_mutex in blk_iolatency_try_init; just remove it. Since init_mutex has been removed, blk_iolatency_try_init can be open-coded back into iolatency_set_limit() like ioc_qos_write(). Signed-off-by: Li Lingfeng Reviewed-by: Michal Koutný Link: https://lore.kernel.org/r/20230810035111.2236335-1-lilingfeng@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-iolatency.c | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index fd5fec989e390..c16aef4be0363 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -824,29 +824,6 @@ static void iolatency_clear_scaling(struct blkcg_gq *blkg) } } -static int blk_iolatency_try_init(struct blkg_conf_ctx *ctx) -{ - static DEFINE_MUTEX(init_mutex); - int ret; - - ret = blkg_conf_open_bdev(ctx); - if (ret) - return ret; - - /* - * blk_iolatency_init() may fail after rq_qos_add() succeeds which can - * confuse iolat_rq_qos() test. Make the test and init atomic. - */ - mutex_lock(&init_mutex); - - if (!iolat_rq_qos(ctx->bdev->bd_queue)) - ret = blk_iolatency_init(ctx->bdev->bd_disk); - - mutex_unlock(&init_mutex); - - return ret; -} - static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -861,7 +838,17 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, blkg_conf_init(&ctx, buf); - ret = blk_iolatency_try_init(&ctx); + ret = blkg_conf_open_bdev(&ctx); + if (ret) + goto out; + + /* + * blk_iolatency_init() may fail after rq_qos_add() succeeds which can + * confuse iolat_rq_qos() test. Make the test and init atomic.
+ */ + lockdep_assert_held(ctx.bdev->bd_queue->rq_qos_mutex); + if (!iolat_rq_qos(ctx.bdev->bd_queue)) + ret = blk_iolatency_init(ctx.bdev->bd_disk); if (ret) goto out; From d21fed50c523d87af6456697ad09378060c4f09a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 10 Aug 2023 16:19:23 +0200 Subject: [PATCH 079/113] swim3: mark swim3_init() static This is the module init function, which by definition is used only locally, so mark it static to avoid a warning: drivers/block/swim3.c:1280:5: error: no previous prototype for 'swim3_init' [-Werror=missing-prototypes] Reviewed-by: Jack Wang Signed-off-by: Arnd Bergmann Signed-off-by: Jens Axboe --- drivers/block/swim3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index dc43a63b34694..c2bc85826358e 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -1277,7 +1277,7 @@ static struct macio_driver swim3_driver = }; -int swim3_init(void) +static int swim3_init(void) { macio_register_driver(&swim3_driver); return 0; From 18267a0365d6ec8bbe85ba8cbea5af12d9e59610 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Aug 2023 17:24:53 -0600 Subject: [PATCH 080/113] block: fix bad lockdep annotation in blk-iolatency A previous commit added a lockdep annotation, but botched it. Use the right type. Fixes: 4eb44d10766a ("block: remove init_mutex and open-code blk_iolatency_try_init") Signed-off-by: Jens Axboe --- block/blk-iolatency.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index c16aef4be0363..c1a6aba1d59e4 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -846,7 +846,7 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, * blk_iolatency_init() may fail after rq_qos_add() succeeds which can * confuse iolat_rq_qos() test. Make the test and init atomic. */ - lockdep_assert_held(ctx.bdev->bd_queue->rq_qos_mutex); + lockdep_assert_held(&ctx.bdev->bd_queue->rq_qos_mutex); if (!iolat_rq_qos(ctx.bdev->bd_queue)) ret = blk_iolatency_init(ctx.bdev->bd_disk); if (ret) From e24721e441a7c640e4e7b2b63c23c06d9a750880 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 11 Aug 2023 21:52:16 +0800 Subject: [PATCH 081/113] ublk: fix 'warn: variable dereferenced before check 'req'' from Smatch The added check of 'req_op(req) == REQ_OP_ZONE_APPEND' should have been done after the request is confirmed as valid. Actually, the request should always be valid here, so add a WARN_ON_ONCE(!req) and meanwhile move the zone_append check after the request check.
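Reduced to a self-contained userspace sketch, the fix is an ordering rule: validate the looked-up request before touching any of its fields, and make the should-never-happen case loud. The demo_* names and the one-shot warning helper below are illustrative substitutes for blk_mq_tag_to_rq() results and WARN_ON_ONCE(); none of them are kernel APIs.

/*
 * Demo of "check before dereference": a NULL lookup result warns
 * once and bails out instead of crashing.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct demo_req { int op; long long sector; };

#define DEMO_OP_ZONE_APPEND 13

static bool demo_warn_once(bool cond, const char *what)
{
	static bool warned;

	if (cond && !warned) {
		warned = true;
		fprintf(stderr, "WARNING (once): %s\n", what);
	}
	return cond;
}

static void demo_commit_completion(struct demo_req *req, long long alba)
{
	/* Check validity first ... */
	if (demo_warn_once(!req, "completion for missing request"))
		return;

	/* ... and only then dereference. */
	if (req->op == DEMO_OP_ZONE_APPEND)
		req->sector = alba;
}

int main(void)
{
	struct demo_req req = { .op = DEMO_OP_ZONE_APPEND };

	demo_commit_completion(&req, 2048);
	demo_commit_completion(NULL, 4096);	/* warns instead of crashing */
	printf("req.sector = %lld\n", req.sector);
	return 0;
}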
Cc: Andreas Hindborg Reported-by: Dan Carpenter Fixes: 29802d7ca33b ("ublk: enable zoned storage support") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230811135216.420404-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 109a5b17537d2..e85e075b5bce1 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1397,11 +1397,13 @@ static void ublk_commit_completion(struct ublk_device *ub, /* find the io request and complete */ req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag); + if (WARN_ON_ONCE(unlikely(!req))) + return; if (req_op(req) == REQ_OP_ZONE_APPEND) req->__sector = ub_cmd->zone_append_lba; - if (req && likely(!blk_should_fake_timeout(req->q))) + if (likely(!blk_should_fake_timeout(req->q))) ublk_put_req_ref(ubq, req); } From 7ba3792718709d410be5d971732b9251cbda67b6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Aug 2023 14:26:34 -0400 Subject: [PATCH 082/113] block: Add some exports for bcachefs - bio_set_pages_dirty(), bio_check_pages_dirty() - dio path - blk_status_to_str() - error messages - bio_add_folio() - this should definitely be exported for everyone, it's the modern version of bio_add_page() Signed-off-by: Kent Overstreet Cc: linux-block@vger.kernel.org Cc: Jens Axboe Signed-off-by: Kent Overstreet Link: https://lore.kernel.org/r/20230813182636.2966159-2-kent.overstreet@linux.dev Signed-off-by: Jens Axboe --- block/bio.c | 2 ++ block/blk-core.c | 1 + block/blk.h | 1 - include/linux/blkdev.h | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index 8d1533af7c609..bb3ea4e05d4ca 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1472,6 +1472,7 @@ void bio_set_pages_dirty(struct bio *bio) set_page_dirty_lock(bvec->bv_page); } } +EXPORT_SYMBOL_GPL(bio_set_pages_dirty); /* * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. 
@@ -1531,6 +1532,7 @@ void bio_check_pages_dirty(struct bio *bio) spin_unlock_irqrestore(&bio_dirty_lock, flags); schedule_work(&bio_dirty_work); } +EXPORT_SYMBOL_GPL(bio_check_pages_dirty); static inline bool bio_remaining_done(struct bio *bio) { diff --git a/block/blk-core.c b/block/blk-core.c index 99d8b9812b18f..141e54545cc1d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -208,6 +208,7 @@ const char *blk_status_to_str(blk_status_t status) return ""; return blk_errors[idx].name; } +EXPORT_SYMBOL_GPL(blk_status_to_str); /** * blk_sync_queue - cancel any pending callbacks on a queue diff --git a/block/blk.h b/block/blk.h index 9d22ec3a53bcf..08a358bc0919e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -254,7 +254,6 @@ static inline void bio_integrity_free(struct bio *bio) unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); -const char *blk_status_to_str(blk_status_t status); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2f5371b8482c0..4feed1fc141f6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -847,6 +847,7 @@ extern const char *blk_op_str(enum req_op op); int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); +const char *blk_status_to_str(blk_status_t status); /* only poll the hardware once, don't continue until a completion was found */ #define BLK_POLL_ONESHOT (1 << 0) From 168145f617d57bf4e474901b7ffa869337a802e6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Aug 2023 14:26:35 -0400 Subject: [PATCH 083/113] block: Allow bio_iov_iter_get_pages() with bio->bi_bdev unset bio_iov_iter_get_pages() trims the IO based on the block size of the block device the IO will be issued to. However, bcachefs is a multi device filesystem; when we're creating the bio we don't yet know which block device the bio will be submitted to - we have to handle the alignment checks elsewhere. Thus this is needed to avoid a null ptr deref. Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: linux-block@vger.kernel.org Link: https://lore.kernel.org/r/20230813182636.2966159-3-kent.overstreet@linux.dev Signed-off-by: Jens Axboe --- block/bio.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/block/bio.c b/block/bio.c index bb3ea4e05d4ca..be484d87142b8 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1231,7 +1231,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) struct page **pages = (struct page **)bv; ssize_t size, left; unsigned len, i = 0; - size_t offset, trim; + size_t offset; int ret = 0; /* @@ -1260,10 +1260,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); - trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1); - iov_iter_revert(iter, trim); + if (bio->bi_bdev) { + size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1); + iov_iter_revert(iter, trim); + size -= trim; + } - size -= trim; if (unlikely(!size)) { ret = -EFAULT; goto out; From 649f070e69739d22c57c22dbce0788b72cd93fac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Aug 2023 14:26:36 -0400 Subject: [PATCH 084/113] block: Bring back zero_fill_bio_iter This reverts 6f822e1b5d9dda3d20e87365de138046e3baa03a - this helper is used by bcachefs. 
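For illustration only (not part of this change), a hypothetical caller that has already transferred part of a bio could use the restored helper to zero just the unprocessed tail; 'done' is an assumed byte count of completed work:

	struct bvec_iter iter = bio->bi_iter;

	/* skip over the bytes that were actually filled */
	bio_advance_iter(bio, &iter, done);
	/* zero everything from that point to the end of the bio */
	zero_fill_bio_iter(bio, iter);

This is exactly the case that a plain zero_fill_bio(), which always starts from bio->bi_iter, cannot express.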
Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: linux-block@vger.kernel.org Link: https://lore.kernel.org/r/20230813182636.2966159-4-kent.overstreet@linux.dev Signed-off-by: Jens Axboe --- block/bio.c | 6 +++--- include/linux/bio.h | 7 ++++++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/block/bio.c b/block/bio.c index be484d87142b8..816d412c06e9b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -606,15 +606,15 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) } EXPORT_SYMBOL(bio_kmalloc); -void zero_fill_bio(struct bio *bio) +void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) { struct bio_vec bv; struct bvec_iter iter; - bio_for_each_segment(bv, bio, iter) + __bio_for_each_segment(bv, bio, iter, start) memzero_bvec(&bv); } -EXPORT_SYMBOL(zero_fill_bio); +EXPORT_SYMBOL(zero_fill_bio_iter); /** * bio_truncate - truncate the bio to small size of @new_size diff --git a/include/linux/bio.h b/include/linux/bio.h index c4f5b5228105f..8b99210eb7fb0 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -488,7 +488,12 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, extern void bio_copy_data(struct bio *dst, struct bio *src); extern void bio_free_pages(struct bio *bio); void guard_bio_eod(struct bio *bio); -void zero_fill_bio(struct bio *bio); +void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter); + +static inline void zero_fill_bio(struct bio *bio) +{ + zero_fill_bio_iter(bio, bio->bi_iter); +} static inline void bio_release_pages(struct bio *bio, bool mark_dirty) { From c7b4b23b36edf32239e7fc3b922797ff1d32b072 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 15 Aug 2023 06:58:32 +0900 Subject: [PATCH 085/113] block: uapi: Fix compilation errors using ioprio.h with C++ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The use of the "class" argument name in the ioprio_value() inline function in include/uapi/linux/ioprio.h confuses C++ compilers resulting in compilation errors such as: /usr/include/linux/ioprio.h:110:43: error: expected primary-expression before ‘int’ 110 | static __always_inline __u16 ioprio_value(int class, int level, int hint) | ^~~ for user C++ programs including linux/ioprio.h. Avoid these errors by renaming the arguments of the ioprio_value() function to prioclass, priolevel and priohint. For consistency, the arguments of the IOPRIO_PRIO_VALUE() and IOPRIO_PRIO_VALUE_HINT() macros are also renamed in the same manner. Reported-by: Igor Pylypiv Fixes: 01584c1e2337 ("scsi: block: Improve ioprio value validity checks") Signed-off-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Tested-by: Igor Pylypiv Link: https://lore.kernel.org/r/20230814215833.259286-1-dlemoal@kernel.org Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/uapi/linux/ioprio.h | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h index 99440b2e8c352..bee2bdb0eedbc 100644 --- a/include/uapi/linux/ioprio.h +++ b/include/uapi/linux/ioprio.h @@ -107,20 +107,21 @@ enum { /* * Return an I/O priority value based on a class, a level and a hint. 
 */
-static __always_inline __u16 ioprio_value(int class, int level, int hint)
+static __always_inline __u16 ioprio_value(int prioclass, int priolevel,
+					  int priohint)
 {
-	if (IOPRIO_BAD_VALUE(class, IOPRIO_NR_CLASSES) ||
-	    IOPRIO_BAD_VALUE(level, IOPRIO_NR_LEVELS) ||
-	    IOPRIO_BAD_VALUE(hint, IOPRIO_NR_HINTS))
+	if (IOPRIO_BAD_VALUE(prioclass, IOPRIO_NR_CLASSES) ||
+	    IOPRIO_BAD_VALUE(priolevel, IOPRIO_NR_LEVELS) ||
+	    IOPRIO_BAD_VALUE(priohint, IOPRIO_NR_HINTS))
 		return IOPRIO_CLASS_INVALID << IOPRIO_CLASS_SHIFT;
 
-	return (class << IOPRIO_CLASS_SHIFT) |
-		(hint << IOPRIO_HINT_SHIFT) | level;
+	return (prioclass << IOPRIO_CLASS_SHIFT) |
+		(priohint << IOPRIO_HINT_SHIFT) | priolevel;
 }
 
-#define IOPRIO_PRIO_VALUE(class, level) \
-	ioprio_value(class, level, IOPRIO_HINT_NONE)
-#define IOPRIO_PRIO_VALUE_HINT(class, level, hint) \
-	ioprio_value(class, level, hint)
+#define IOPRIO_PRIO_VALUE(prioclass, priolevel) \
+	ioprio_value(prioclass, priolevel, IOPRIO_HINT_NONE)
+#define IOPRIO_PRIO_VALUE_HINT(prioclass, priolevel, priohint) \
+	ioprio_value(prioclass, priolevel, priohint)
 
 #endif /* _UAPI_LINUX_IOPRIO_H */

From 66a6a5d0ec852eaced589da066376e69397cd71e Mon Sep 17 00:00:00 2001
From: Ruan Jinjie
Date: Tue, 15 Aug 2023 19:48:14 +0800
Subject: [PATCH 086/113] ublk: Switch to memdup_user_nul() helper

Use the memdup_user_nul() helper instead of open-coding it to simplify the code.

Signed-off-by: Ruan Jinjie
Reviewed-by: Ming Lei
Link: https://lore.kernel.org/r/20230815114815.1551171-1-ruanjinjie@huawei.com
Signed-off-by: Jens Axboe
---
 drivers/block/ublk_drv.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index e85e075b5bce1..6ecd728ee5cea 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -2742,14 +2742,9 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
 	if (header->len < header->dev_path_len)
 		return -EINVAL;
 
-	dev_path = kmalloc(header->dev_path_len + 1, GFP_KERNEL);
-	if (!dev_path)
-		return -ENOMEM;
-
-	ret = -EFAULT;
-	if (copy_from_user(dev_path, argp, header->dev_path_len))
-		goto exit;
-	dev_path[header->dev_path_len] = 0;
+	dev_path = memdup_user_nul(argp, header->dev_path_len);
+	if (IS_ERR(dev_path))
+		return PTR_ERR(dev_path);
 
 	ret = -EINVAL;
 	switch (_IOC_NR(cmd->cmd_op)) {

From a705b11b358dee677aad80630e7608b2d5f56691 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Sat, 8 Jul 2023 17:17:27 +0800
Subject: [PATCH 087/113] md/raid5-cache: fix a deadlock in r5l_exit_log()

Commit b13015af94cf ("md/raid5-cache: Clear conf->log after finishing work") introduced a new problem:

// caller holds reconfig_mutex
r5l_exit_log
 flush_work(&log->disable_writeback_work)
		r5c_disable_writeback_async
		 wait_event
		 /*
		  * conf->log is not NULL, and mddev_trylock()
		  * will fail, wait_event() can never pass.
		  */
 conf->log = NULL

Fix this problem by setting 'conf->log' to NULL before wake_up() as it used to be, so that wait_event() from r5c_disable_writeback_async() can exit. In the meantime, move md_unregister_thread() forward so that the null-ptr-deref fixed by that commit stays fixed.
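The resulting teardown order in r5l_exit_log() is, condensed from the hunk below:

	md_unregister_thread(&log->reclaim_thread);
	conf->log = NULL;	/* lets the wait_event() above make progress */
	wake_up(&conf->mddev->sb_wait);
	flush_work(&log->disable_writeback_work);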
Fixes: b13015af94cf ("md/raid5-cache: Clear conf->log after finishing work")
Signed-off-by: Yu Kuai
Link: https://lore.kernel.org/r/20230708091727.1417894-1-yukuai1@huaweicloud.com
Signed-off-by: Song Liu
---
 drivers/md/raid5-cache.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 47ba7d9e81e18..2eac4a50d99bd 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -3168,12 +3168,15 @@ void r5l_exit_log(struct r5conf *conf)
 {
 	struct r5l_log *log = conf->log;
 
-	/* Ensure disable_writeback_work wakes up and exits */
-	wake_up(&conf->mddev->sb_wait);
-	flush_work(&log->disable_writeback_work);
 	md_unregister_thread(&log->reclaim_thread);
+
+	/*
+	 * 'reconfig_mutex' is held by caller, set 'conf->log' to NULL to
+	 * ensure disable_writeback_work wakes up and exits.
+	 */
 	conf->log = NULL;
+	wake_up(&conf->mddev->sb_wait);
+	flush_work(&log->disable_writeback_work);
 
 	mempool_exit(&log->meta_pool);
 	bioset_exit(&log->bs);

From 8b0472b50bcf0f19a5119b00a53b63579c8e1e4d Mon Sep 17 00:00:00 2001
From: Zhang Shurong
Date: Sat, 22 Jul 2023 15:53:53 +0800
Subject: [PATCH 088/113] md: raid1: fix potential OOB in raid1_remove_disk()

If rdev->raid_disk is greater than mddev->raid_disks, there will be an out-of-bounds access in raid1_remove_disk(). We have already found similar reports, as follows:

1) commit d17f744e883b ("md-raid10: fix KASAN warning")
2) commit 1ebc2cec0b7d ("dm raid: fix KASAN warning in raid5_remove_disk")

Fix this bug by checking whether the "number" variable is valid.

Signed-off-by: Zhang Shurong
Reviewed-by: Yu Kuai
Link: https://lore.kernel.org/r/tencent_0D24426FAC6A21B69AC0C03CE4143A508F09@qq.com
Signed-off-by: Song Liu
---
 drivers/md/raid1.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 23d211969565e..b920f92780139 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1837,6 +1837,10 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	struct r1conf *conf = mddev->private;
 	int err = 0;
 	int number = rdev->raid_disk;
+
+	if (unlikely(number >= conf->raid_disks))
+		goto abort;
+
 	struct raid1_info *p = conf->mirrors + number;
 
 	if (rdev != p->rdev)

From 892da88d1cd93426e9c6d7717876ca705fe2b9fa Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 31 Jul 2023 10:28:00 +0800
Subject: [PATCH 089/113] md/raid10: fix a 'conf->barrier' leakage in raid10_takeover()

After commit b39f35ebe86d ("md: don't quiesce in mddev_suspend()"), 'conf->barrier' will be leaked when raid10 takes over raid0:

level_store
 pers->takeover -> raid10_takeover
  raid10_takeover_raid0
   WRITE_ONCE(conf->barrier, 1)
 mddev_suspend
 // still raid0
 mddev->pers = pers
 // switch to raid10
 mddev_resume
 // resume without suspend

After the above commit, mddev_resume() will not decrease 'conf->barrier' that is set in raid10_takeover_raid0(). Fix this problem by not setting 'conf->barrier' in raid10_takeover_raid0().

By the way, this problem was found while trying to make mddev_suspend/resume() independent of raid personalities. raid10 is the only personality to use a reference count in the quiesce() callback, and this problem is only related to raid10.
Fixes: b39f35ebe86d ("md: don't quiesce in mddev_suspend()")
Signed-off-by: Yu Kuai
Reviewed-by: Paul Menzel
Link: https://lore.kernel.org/r/20230731022800.1424902-1-yukuai1@huaweicloud.com
Signed-off-by: Song Liu
---
 drivers/md/raid10.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 16aa9d735880a..7704a4c7f4695 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4417,7 +4417,6 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
 			rdev->new_raid_disk = rdev->raid_disk * 2;
 			rdev->sectors = size;
 		}
-		WRITE_ONCE(conf->barrier, 1);
 	}
 
 	return conf;

From 7eb8ff02c1df279bf7f7f29b866beb655a9eebe9 Mon Sep 17 00:00:00 2001
From: Li Lingfeng
Date: Thu, 3 Aug 2023 15:17:11 +0800
Subject: [PATCH 090/113] md: Hold mddev->reconfig_mutex when trying to get mddev->sync_thread

Commit ba9d9f1a707f ("Revert "md: unlock mddev before reap sync_thread in action_store"") removed the scenario of calling md_unregister_thread() without holding mddev->reconfig_mutex, so add a check that the lock is held before acquiring mddev->sync_thread, by passing mddev to md_unregister_thread().

Signed-off-by: Li Lingfeng
Reviewed-by: Yu Kuai
Link: https://lore.kernel.org/r/20230803071711.2546560-1-lilingfeng@huaweicloud.com
Signed-off-by: Song Liu
---
 drivers/md/md-cluster.c  | 8 ++++----
 drivers/md/md.c          | 9 +++++----
 drivers/md/md.h          | 2 +-
 drivers/md/raid1.c       | 4 ++--
 drivers/md/raid10.c      | 2 +-
 drivers/md/raid5-cache.c | 2 +-
 drivers/md/raid5.c       | 2 +-
 7 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 3d9fd74233dfd..1e26eb2233495 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -952,8 +952,8 @@ static int join(struct mddev *mddev, int nodes)
 	return 0;
 err:
 	set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
-	md_unregister_thread(&cinfo->recovery_thread);
-	md_unregister_thread(&cinfo->recv_thread);
+	md_unregister_thread(mddev, &cinfo->recovery_thread);
+	md_unregister_thread(mddev, &cinfo->recv_thread);
 	lockres_free(cinfo->message_lockres);
 	lockres_free(cinfo->token_lockres);
 	lockres_free(cinfo->ack_lockres);
@@ -1015,8 +1015,8 @@ static int leave(struct mddev *mddev)
 		resync_bitmap(mddev);
 
 	set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
-	md_unregister_thread(&cinfo->recovery_thread);
-	md_unregister_thread(&cinfo->recv_thread);
+	md_unregister_thread(mddev, &cinfo->recovery_thread);
+	md_unregister_thread(mddev, &cinfo->recv_thread);
 	lockres_free(cinfo->message_lockres);
 	lockres_free(cinfo->token_lockres);
 	lockres_free(cinfo->ack_lockres);

diff --git a/drivers/md/md.c b/drivers/md/md.c
index a3d98273b295c..5c3c19b8d5099 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6258,7 +6258,7 @@ static void mddev_detach(struct mddev *mddev)
 		mddev->pers->quiesce(mddev, 1);
 		mddev->pers->quiesce(mddev, 0);
 	}
-	md_unregister_thread(&mddev->thread);
+	md_unregister_thread(mddev, &mddev->thread);
 	if (mddev->queue)
 		blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 }
@@ -7990,9 +7990,10 @@ struct md_thread *md_register_thread(void (*run) (struct md_thread *),
 }
 EXPORT_SYMBOL(md_register_thread);
 
-void md_unregister_thread(struct md_thread __rcu **threadp)
+void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp)
 {
-	struct md_thread *thread = rcu_dereference_protected(*threadp, true);
+	struct md_thread *thread = rcu_dereference_protected(*threadp,
+					lockdep_is_held(&mddev->reconfig_mutex));
 
 	if (!thread)
 		return;
@@ -9484,7 +9485,7 @@ void md_reap_sync_thread(struct mddev *mddev)
 	bool is_reshaped = false;
 
 	/* resync has finished, collect result */
-	md_unregister_thread(&mddev->sync_thread);
+	md_unregister_thread(mddev, &mddev->sync_thread);
 	atomic_inc(&mddev->sync_seq);
 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&

diff --git a/drivers/md/md.h b/drivers/md/md.h
index 8ae9574809763..9bcb77bca9639 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -761,7 +761,7 @@ extern struct md_thread *md_register_thread(
 	void (*run)(struct md_thread *thread),
 	struct mddev *mddev,
 	const char *name);
-extern void md_unregister_thread(struct md_thread __rcu **threadp);
+extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp);
 extern void md_wakeup_thread(struct md_thread __rcu *thread);
 extern void md_check_recovery(struct mddev *mddev);
 extern void md_reap_sync_thread(struct mddev *mddev);

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index b920f92780139..c18b7c096c8df 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -3156,7 +3156,7 @@ static int raid1_run(struct mddev *mddev)
 	 * RAID1 needs at least one disk in active
 	 */
 	if (conf->raid_disks - mddev->degraded < 1) {
-		md_unregister_thread(&conf->thread);
+		md_unregister_thread(mddev, &conf->thread);
 		ret = -EINVAL;
 		goto abort;
 	}
@@ -3183,7 +3183,7 @@ static int raid1_run(struct mddev *mddev)
 
 	ret = md_integrity_register(mddev);
 	if (ret) {
-		md_unregister_thread(&mddev->thread);
+		md_unregister_thread(mddev, &mddev->thread);
 		goto abort;
 	}
 	return 0;

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7704a4c7f4695..0234131208516 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4320,7 +4320,7 @@ static int raid10_run(struct mddev *mddev)
 	return 0;
 
 out_free_conf:
-	md_unregister_thread(&mddev->thread);
+	md_unregister_thread(mddev, &mddev->thread);
 	raid10_free_conf(conf);
 	mddev->private = NULL;
 out:

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 2eac4a50d99bd..a29b9650260a0 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -3168,7 +3168,7 @@ void r5l_exit_log(struct r5conf *conf)
 {
 	struct r5l_log *log = conf->log;
 
-	md_unregister_thread(&log->reclaim_thread);
+	md_unregister_thread(conf->mddev, &log->reclaim_thread);
 
 	/*
 	 * 'reconfig_mutex' is held by caller, set 'conf->log' to NULL to

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 32a87193bad73..4cb9c608ee191 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -8107,7 +8107,7 @@ static int raid5_run(struct mddev *mddev)
 	return 0;
 abort:
-	md_unregister_thread(&mddev->thread);
+	md_unregister_thread(mddev, &mddev->thread);
 	print_raid5_conf(conf);
 	free_conf(conf);
 	mddev->private = NULL;

From 5afcf28d07dee91e48d1c809ebd19c3bfc403765 Mon Sep 17 00:00:00 2001
From: WANG Xuerui
Date: Mon, 31 Jul 2023 18:49:07 +0800
Subject: [PATCH 091/113] raid6: remove the <linux/export.h> include from recov.c

There is no exported symbol left in recov.c, so the <linux/export.h> include is now unnecessary, and it breaks the raid6test build. Remove it.

Signed-off-by: WANG Xuerui
Link: https://lore.kernel.org/r/20230731104911.411964-2-kernel@xen0n.name
Signed-off-by: Song Liu
---
 lib/raid6/recov.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c
index e49d519de6cbe..a7c1b2bbe40d8 100644
--- a/lib/raid6/recov.c
+++ b/lib/raid6/recov.c
@@ -13,7 +13,6 @@
  * the syndrome.)
  */
 
-#include <linux/export.h>
 #include <linux/raid/pq.h>
 
 /* Recover two failed data blocks.
 */

From 9dd6e1da811ffad95a79b2690110ef1bbaf4dda4 Mon Sep 17 00:00:00 2001
From: WANG Xuerui
Date: Mon, 31 Jul 2023 18:49:08 +0800
Subject: [PATCH 092/113] raid6: guard the tables.c include of <linux/export.h> with __KERNEL__

The export directives for the tables are already emitted with __KERNEL__ guards, but the <linux/export.h> include is not, causing errors when building the raid6test program. Guard this include too, to fix the raid6test build.

Signed-off-by: WANG Xuerui
Link: https://lore.kernel.org/r/20230731104911.411964-3-kernel@xen0n.name
Signed-off-by: Song Liu
---
 lib/raid6/mktables.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/raid6/mktables.c b/lib/raid6/mktables.c
index f02e10fa62381..3be03793237c2 100644
--- a/lib/raid6/mktables.c
+++ b/lib/raid6/mktables.c
@@ -56,7 +56,9 @@ int main(int argc, char *argv[])
 	uint8_t v;
 	uint8_t exptbl[256], invtbl[256];
 
+	printf("#ifdef __KERNEL__\n");
 	printf("#include <linux/export.h>\n");
+	printf("#endif\n");
 	printf("#include <linux/raid/pq.h>\n");
 
 	/* Compute multiplication table */

From 2008d89fb6435a3a900b72b856a18e0cc0d2c057 Mon Sep 17 00:00:00 2001
From: WANG Xuerui
Date: Mon, 31 Jul 2023 18:49:09 +0800
Subject: [PATCH 093/113] raid6: test: cosmetic cleanups for the test Makefile

Use tabs/spaces consistently: hard tabs for marking recipe lines only, spaces for everything else. Also, the OPTFLAGS declaration actually included the tabs preceding the line comment, making compiler invocation lines unnecessarily long. As the entire block of declarations is meant for ad-hoc customization (otherwise they would probably make use of `?=` instead of `=`), move the "Adjust as desired" comment above the block too, to fix the long invocation lines.

Signed-off-by: WANG Xuerui
Link: https://lore.kernel.org/r/20230731104911.411964-4-kernel@xen0n.name
Signed-off-by: Song Liu
---
 lib/raid6/test/Makefile | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile
index 4fb7700a741bd..143cda60faa12 100644
--- a/lib/raid6/test/Makefile
+++ b/lib/raid6/test/Makefile
@@ -6,14 +6,15 @@
 
 pound := \#
 
-CC	 = gcc
-OPTFLAGS = -O2		# Adjust as desired
-CFLAGS	 = -I.. -I ../../../include -g $(OPTFLAGS)
-LD	 = ld
-AWK	 = awk -f
-AR	 = ar
-RANLIB	 = ranlib
-OBJS	 = int1.o int2.o int4.o int8.o int16.o int32.o recov.o algos.o tables.o
+# Adjust as desired
+CC       = gcc
+OPTFLAGS = -O2
+CFLAGS   = -I.. -I ../../../include -g $(OPTFLAGS)
+LD       = ld
+AWK      = awk -f
+AR       = ar
+RANLIB   = ranlib
+OBJS     = int1.o int2.o int4.o int8.o int16.o int32.o recov.o algos.o tables.o
 
 ARCH := $(shell uname -m 2>/dev/null | sed -e /s/i.86/i386/)
 ifeq ($(ARCH),i386)
@@ -37,9 +38,9 @@ endif
 ifeq ($(IS_X86),yes)
         OBJS   += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o
         CFLAGS += -DCONFIG_X86
-        CFLAGS += $(shell echo "vpmovm2b %k1, %zmm5" |	\
-                    gcc -c -x assembler - >/dev/null 2>&1 &&	\
-                    rm ./-.o && echo -DCONFIG_AS_AVX512=1)
+        CFLAGS += $(shell echo "vpmovm2b %k1, %zmm5" | \
+                    gcc -c -x assembler - >/dev/null 2>&1 && \
+                    rm ./-.o && echo -DCONFIG_AS_AVX512=1)
 else ifeq ($(HAS_NEON),yes)
         OBJS   += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
         CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1
@@ -63,12 +64,12 @@ endif
 %.uc: ../%.uc
 	cp -f $< $@
 
-all:	raid6.a raid6test
+all: raid6.a raid6test
 
 raid6.a: $(OBJS)
-	 rm -f $@
-	 $(AR) cq $@ $^
-	 $(RANLIB) $@
+	rm -f $@
+	$(AR) cq $@ $^
+	$(RANLIB) $@
 
 raid6test: test.c raid6.a
 	$(CC) $(CFLAGS) -o raid6test $^

From 6601f5e122e5fdcea0fa5eaa54b88b02dbc9ec07 Mon Sep 17 00:00:00 2001
From: WANG Xuerui
Date: Mon, 31 Jul 2023 18:49:10 +0800
Subject: [PATCH 094/113] raid6: test: make sure all intermediate and artifact files are .gitignored

Currently, when the raid6test utility is built, the resulting binary and an int.uc file are not being ignored, so they can get inadvertently committed when one works on the raid6 code. Ignore them to keep `git status` clean at all times.

Signed-off-by: WANG Xuerui
Link: https://lore.kernel.org/r/20230731104911.411964-5-kernel@xen0n.name
Signed-off-by: Song Liu
---
 lib/raid6/test/.gitignore | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 lib/raid6/test/.gitignore

diff --git a/lib/raid6/test/.gitignore b/lib/raid6/test/.gitignore
new file mode 100644
index 0000000000000..1b68a77f348f6
--- /dev/null
+++ b/lib/raid6/test/.gitignore
@@ -0,0 +1,3 @@
+/int.uc
+/neon.uc
+/raid6test

From 7b3c70c43c13ad8e59f5561b154663d6bdb77021 Mon Sep 17 00:00:00 2001
From: WANG Xuerui
Date: Mon, 31 Jul 2023 18:49:11 +0800
Subject: [PATCH 095/113] raid6: test: only check for Altivec if building on powerpc hosts

Altivec is only available on powerpc hosts, so only check for its availability when the host is powerpc, to avoid error messages being shown on architectures other than x86, arm or powerpc.
Signed-off-by: WANG Xuerui
Link: https://lore.kernel.org/r/20230731104911.411964-6-kernel@xen0n.name
Signed-off-by: Song Liu
---
 lib/raid6/test/Makefile | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile
index 143cda60faa12..1f693ea3b980c 100644
--- a/lib/raid6/test/Makefile
+++ b/lib/raid6/test/Makefile
@@ -35,6 +35,12 @@ ifeq ($(ARCH),aarch64)
         HAS_NEON = yes
 endif
 
+ifeq ($(findstring ppc,$(ARCH)),ppc)
+        CFLAGS += -I../../../arch/powerpc/include
+        HAS_ALTIVEC := $(shell printf '$(pound)include <altivec.h>\nvector int a;\n' |\
+                         gcc -c -x c - >/dev/null && rm ./-.o && echo yes)
+endif
+
 ifeq ($(IS_X86),yes)
         OBJS   += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o
         CFLAGS += -DCONFIG_X86
@@ -44,15 +50,10 @@ ifeq ($(IS_X86),yes)
 else ifeq ($(HAS_NEON),yes)
         OBJS   += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
         CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1
-else
-        HAS_ALTIVEC := $(shell printf '$(pound)include <altivec.h>\nvector int a;\n' |\
-                         gcc -c -x c - >/dev/null && rm ./-.o && echo yes)
-        ifeq ($(HAS_ALTIVEC),yes)
-                CFLAGS += -I../../../arch/powerpc/include
-                CFLAGS += -DCONFIG_ALTIVEC
-                OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \
-                        vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o
-        endif
+else ifeq ($(HAS_ALTIVEC),yes)
+        CFLAGS += -DCONFIG_ALTIVEC
+        OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \
+                vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o
 endif
 
 .c.o:

From 0d0bd28c500173bfca78aa840f8f36d261ef1765 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Tue, 8 Aug 2023 18:49:12 +0800
Subject: [PATCH 096/113] md/raid5-cache: fix null-ptr-deref for r5l_flush_stripe_to_raid()

r5l_flush_stripe_to_raid() will check if the list 'flushing_ios' is empty and then submit 'flush_bio'. However, r5l_log_flush_endio() clears the list first and only then clears the bio, which can cause a null-ptr-deref:

T1: submit flush io
raid5d
 handle_active_stripes
  r5l_flush_stripe_to_raid
   // list is empty
   // add 'io_end_ios' to the list
   bio_init
   submit_bio
   // io1

T2: io1 is done
r5l_log_flush_endio
 list_splice_tail_init
 // clear the list

T3: submit new flush io
...
r5l_flush_stripe_to_raid
 // list is empty
 // add 'io_end_ios' to the list
 bio_init
 bio_uninit
 // clear bio->bi_blkg
 submit_bio
 // null-ptr-deref

Fix this problem by clearing the bio before clearing the list in r5l_log_flush_endio().
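Condensed, the fixed endio path is (see the hunk below):

	bio_uninit(bio);	/* done before the list is emptied */
	spin_lock_irqsave(&log->io_list_lock, flags);
	list_for_each_entry(io, &log->flushing_ios, log_sibling)
		r5l_io_run_stripes(io);
	list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
	spin_unlock_irqrestore(&log->io_list_lock, flags);

so by the time 'flushing_ios' can be observed empty and the flush bio reused, bio_uninit() has already completed.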
Fixes: 0dd00cba99c3 ("raid5-cache: fully initialize flush_bio when needed") Reported-and-tested-by: Corey Hickey Closes: https://lore.kernel.org/all/cddd7213-3dfd-4ab7-a3ac-edd54d74a626@fatooh.org/ Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5-cache.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index a29b9650260a0..518b7cfa78b9d 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1260,14 +1260,13 @@ static void r5l_log_flush_endio(struct bio *bio) if (bio->bi_status) md_error(log->rdev->mddev, log->rdev); + bio_uninit(bio); spin_lock_irqsave(&log->io_list_lock, flags); list_for_each_entry(io, &log->flushing_ios, log_sibling) r5l_io_run_stripes(io); list_splice_tail_init(&log->flushing_ios, &log->finished_ios); spin_unlock_irqrestore(&log->io_list_lock, flags); - - bio_uninit(bio); } /* From f4283bc7e38ac89d0c6c0ae188464d3769bec098 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Wed, 16 Aug 2023 10:22:10 +0800 Subject: [PATCH 097/113] drivers/rnbd: restore sysfs interface to rnbd-client Commit 137380c0ec40 renamed 'rnbd-client' to 'rnbd_client', this changed sysfs interface to /sys/devices/virtual/rnbd_client/ctl/map_device from /sys/devices/virtual/rnbd-client/ctl/map_device. CC: Ivan Orlov CC: "Md. Haris Iqbal" CC: Jack Wang Fixes: 137380c0ec40 ("block/rnbd: make all 'class' structures const") Signed-off-by: Li Zhijian Acked-by: Jack Wang Link: https://lore.kernel.org/r/20230816022210.2501228-1-lizhijian@fujitsu.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index c36d8b1ceeed7..39887556cf959 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -25,7 +25,7 @@ static struct device *rnbd_dev; static const struct class rnbd_dev_class = { - .name = "rnbd_client", + .name = "rnbd-client", }; static struct kobject *rnbd_devs_kobj; From ec14a87ee1999b19d8b7ed0fa95fea80644624ae Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Aug 2023 09:56:23 -1000 Subject: [PATCH 098/113] blk-cgroup: Fix NULL deref caused by blkg_policy_data being installed before init blk-iocost sometimes causes the following crash: BUG: kernel NULL pointer dereference, address: 00000000000000e0 ... RIP: 0010:_raw_spin_lock+0x17/0x30 Code: be 01 02 00 00 e8 79 38 39 ff 31 d2 89 d0 5d c3 0f 1f 00 0f 1f 44 00 00 55 48 89 e5 65 ff 05 48 d0 34 7e b9 01 00 00 00 31 c0 0f b1 0f 75 02 5d c3 89 c6 e8 ea 04 00 00 5d c3 0f 1f 84 00 00 RSP: 0018:ffffc900023b3d40 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 00000000000000e0 RCX: 0000000000000001 RDX: ffffc900023b3d20 RSI: ffffc900023b3cf0 RDI: 00000000000000e0 RBP: ffffc900023b3d40 R08: ffffc900023b3c10 R09: 0000000000000003 R10: 0000000000000064 R11: 000000000000000a R12: ffff888102337000 R13: fffffffffffffff2 R14: ffff88810af408c8 R15: ffff8881070c3600 FS: 00007faaaf364fc0(0000) GS:ffff88842fdc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000e0 CR3: 00000001097b1000 CR4: 0000000000350ea0 Call Trace: ioc_weight_write+0x13d/0x410 cgroup_file_write+0x7a/0x130 kernfs_fop_write_iter+0xf5/0x170 vfs_write+0x298/0x370 ksys_write+0x5f/0xb0 __x64_sys_write+0x1b/0x20 do_syscall_64+0x3d/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 This happens because iocg->ioc is NULL. 
The field is initialized by ioc_pd_init() and never cleared. The NULL deref is caused by blkcg_activate_policy() installing blkg_policy_data before initializing it. blkcg_activate_policy() was doing the following:

1. Allocate pd's for all existing blkg's and install them in blkg->pd[].
2. Initialize all pd's.
3. Online all pd's.

blkcg_activate_policy() only grabs the queue_lock and may release and re-acquire the lock as allocation may need to sleep. ioc_weight_write() grabs blkcg->lock and iterates all its blkg's. The two can race, and if ioc_weight_write() runs during #1 or between #1 and #2, it can encounter a pd which is not initialized yet, leading to a crash.

The crash can be reproduced with the following script:

  #!/bin/bash

  echo +io > /sys/fs/cgroup/cgroup.subtree_control
  systemd-run --unit touch-sda --scope dd if=/dev/sda of=/dev/null bs=1M count=1 iflag=direct
  echo 100 > /sys/fs/cgroup/system.slice/io.weight
  bash -c "echo '8:0 enable=1' > /sys/fs/cgroup/io.cost.qos" &
  sleep .2
  echo 100 > /sys/fs/cgroup/system.slice/io.weight

with the following patch applied:

> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index fc49be622e05..38d671d5e10c 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -1553,6 +1553,12 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
> 		pd->online = false;
> 	}
>
> +	if (system_state == SYSTEM_RUNNING) {
> +		spin_unlock_irq(&q->queue_lock);
> +		ssleep(1);
> +		spin_lock_irq(&q->queue_lock);
> +	}
> +
> 	/* all allocated, init in the same order */
> 	if (pol->pd_init_fn)
> 		list_for_each_entry_reverse(blkg, &q->blkg_list, q_node)

I don't see a reason why all pd's should be allocated, initialized and onlined together. The only ordering requirement is that parent blkgs be initialized and onlined before children, which is guaranteed by the walking order.

Let's fix the bug by allocating, initializing and onlining the pd for each blkg while holding blkcg->lock over the initialization and onlining. This ensures that an installed pd is always fully initialized and onlined, removing the race window.
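Condensed, the per-blkg path after this change is (see the diff below; the allocation details are elided):

	list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
		struct blkg_policy_data *pd;

		/* ... allocate pd, possibly dropping queue_lock ... */

		spin_lock(&blkg->blkcg->lock);
		blkg->pd[pol->plid] = pd;	/* install */
		if (pol->pd_init_fn)
			pol->pd_init_fn(pd);	/* init */
		if (pol->pd_online_fn)
			pol->pd_online_fn(pd);	/* online */
		pd->online = true;
		spin_unlock(&blkg->blkcg->lock);
	}

so a reader that takes blkcg->lock, such as ioc_weight_write(), can never observe a half-constructed pd.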
Signed-off-by: Tejun Heo Reported-by: Breno Leitao Fixes: 9d179b865449 ("blkcg: Fix multiple bugs in blkcg_activate_policy()") Link: https://lore.kernel.org/r/ZN0p5_W-Q9mAHBVY@slm.duckdns.org Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index fc49be622e05b..638400bf049f2 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1509,7 +1509,7 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) retry: spin_lock_irq(&q->queue_lock); - /* blkg_list is pushed at the head, reverse walk to allocate parents first */ + /* blkg_list is pushed at the head, reverse walk to initialize parents first */ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { struct blkg_policy_data *pd; @@ -1547,21 +1547,20 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) goto enomem; } - blkg->pd[pol->plid] = pd; + spin_lock(&blkg->blkcg->lock); + pd->blkg = blkg; pd->plid = pol->plid; - pd->online = false; - } + blkg->pd[pol->plid] = pd; - /* all allocated, init in the same order */ - if (pol->pd_init_fn) - list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) - pol->pd_init_fn(blkg->pd[pol->plid]); + if (pol->pd_init_fn) + pol->pd_init_fn(pd); - list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { if (pol->pd_online_fn) - pol->pd_online_fn(blkg->pd[pol->plid]); - blkg->pd[pol->plid]->online = true; + pol->pd_online_fn(pd); + pd->online = true; + + spin_unlock(&blkg->blkcg->lock); } __set_bit(pol->plid, q->blkcg_pols); @@ -1578,14 +1577,19 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) return ret; enomem: - /* alloc failed, nothing's initialized yet, free everything */ + /* alloc failed, take down everything */ spin_lock_irq(&q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; + struct blkg_policy_data *pd; spin_lock(&blkcg->lock); - if (blkg->pd[pol->plid]) { - pol->pd_free_fn(blkg->pd[pol->plid]); + pd = blkg->pd[pol->plid]; + if (pd) { + if (pd->online && pol->pd_offline_fn) + pol->pd_offline_fn(pd); + pd->online = false; + pol->pd_free_fn(pd); blkg->pd[pol->plid] = NULL; } spin_unlock(&blkcg->lock); From c5d736f548ec5aab7e877872417ac23a5c42f1fd Mon Sep 17 00:00:00 2001 From: Xueshi Hu Date: Mon, 14 Aug 2023 21:53:54 +0800 Subject: [PATCH 099/113] md/raid1: call free_r1bio() before allow_barrier() in raid_end_bio_io() After allow_barrier, a concurrent raid1_reshape() will replace old mempool and r1conf::raid_disks. Move allow_barrier() to the end of raid_end_bio_io(), so that r1bio can be freed safely. 
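Condensed (see the diff below), the sector is now sampled before the r1bio is freed, so allow_barrier() never touches freed memory:

	sector_t sector = r1_bio->sector;
	/* ... */
	free_r1bio(r1_bio);
	allow_barrier(conf, sector);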
Reviewed-by: Yu Kuai
Signed-off-by: Xueshi Hu
Link: https://lore.kernel.org/r/20230814135356.1113639-2-xueshi.hu@smartx.com
Signed-off-by: Song Liu
---
 drivers/md/raid1.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c18b7c096c8df..642c8bae0df0c 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -311,6 +311,7 @@ static void raid_end_bio_io(struct r1bio *r1_bio)
 {
 	struct bio *bio = r1_bio->master_bio;
 	struct r1conf *conf = r1_bio->mddev->private;
+	sector_t sector = r1_bio->sector;
 
 	/* if nobody has done the final endio yet, do it now */
 	if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
@@ -321,13 +322,13 @@ static void raid_end_bio_io(struct r1bio *r1_bio)
 
 		call_bio_endio(r1_bio);
 	}
+
+	free_r1bio(r1_bio);
 	/*
 	 * Wake up any possible resync thread that waits for the device
 	 * to go idle. All I/Os, even write-behind writes, are done.
 	 */
-	allow_barrier(conf, r1_bio->sector);
-
-	free_r1bio(r1_bio);
+	allow_barrier(conf, sector);
 }
 
 /*

From 992db13a4aee766c8bfbf046ad15c2db5fa7cab8 Mon Sep 17 00:00:00 2001
From: Xueshi Hu
Date: Mon, 14 Aug 2023 21:53:55 +0800
Subject: [PATCH 100/113] md/raid1: free the r1bio before waiting for blocked rdev

Raid1 reshape will change the mempool and r1conf::raid_disks, both of which are needed to free an r1bio. allow_barrier() makes a concurrent raid1_reshape() possible. So, free the in-flight r1bio before waiting for a blocked rdev.

Fixes: 6bfe0b499082 ("md: support blocking writes to an array on device failure")
Reviewed-by: Yu Kuai
Signed-off-by: Xueshi Hu
Link: https://lore.kernel.org/r/20230814135356.1113639-3-xueshi.hu@smartx.com
Signed-off-by: Song Liu
---
 drivers/md/raid1.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 642c8bae0df0c..b3fc44157cfcc 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1378,6 +1378,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 		return;
 	}
 
+ retry_write:
 	r1_bio = alloc_r1bio(mddev, bio);
 	r1_bio->sectors = max_write_sectors;
 
@@ -1393,7 +1394,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	 */
 	disks = conf->raid_disks * 2;
- retry_write:
 	blocked_rdev = NULL;
 	rcu_read_lock();
 	max_sectors = r1_bio->sectors;
@@ -1473,7 +1473,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 		for (j = 0; j < i; j++)
 			if (r1_bio->bios[j])
 				rdev_dec_pending(conf->mirrors[j].rdev, mddev);
-		r1_bio->state = 0;
+		free_r1bio(r1_bio);
 		allow_barrier(conf, bio->bi_iter.bi_sector);
 
 		if (bio->bi_opf & REQ_NOWAIT) {

From c069da449a13669ffa754fd971747e7e17e7d691 Mon Sep 17 00:00:00 2001
From: Xueshi Hu
Date: Mon, 14 Aug 2023 21:53:56 +0800
Subject: [PATCH 101/113] md/raid1: hold the barrier until handle_read_error() finishes

handle_read_error() will call allow_barrier() to match the earlier barrier raising. However, the allow_barrier() should come at the very end to avoid a concurrent raid reshape.
Fixes: 689389a06ce7 ("md/raid1: simplify handle_read_error().") Reviewed-by: Yu Kuai Signed-off-by: Xueshi Hu Link: https://lore.kernel.org/r/20230814135356.1113639-4-xueshi.hu@smartx.com Signed-off-by: Song Liu --- drivers/md/raid1.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index b3fc44157cfcc..56f2725a996ff 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2511,6 +2511,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) struct mddev *mddev = conf->mddev; struct bio *bio; struct md_rdev *rdev; + sector_t sector; clear_bit(R1BIO_ReadError, &r1_bio->state); /* we got a read error. Maybe the drive is bad. Maybe just @@ -2540,12 +2541,13 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) } rdev_dec_pending(rdev, conf->mddev); - allow_barrier(conf, r1_bio->sector); + sector = r1_bio->sector; bio = r1_bio->master_bio; /* Reuse the old r1_bio so that the IO_BLOCKED settings are preserved */ r1_bio->state = 0; raid1_read_request(mddev, bio, r1_bio->sectors, r1_bio); + allow_barrier(conf, sector); } static void raid1d(struct md_thread *thread) From 6b2460e66ce6d483b5ff77227ac799d6e8a9ebd6 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Mon, 14 Aug 2023 14:01:15 +0200 Subject: [PATCH 102/113] md raid1: allow writebehind to work on any leg device set WriteMostly As the WriteMostly flag can be set on any component device of a RAID1 array, remove the constraint that it only works if set on the first one. Signed-off-by: Heinz Mauelshagen Tested-by: Xiao Ni Link: https://lore.kernel.org/r/2a9592bf3340f34bf588eec984b23ee219f3985e.1692013451.git.heinzm@redhat.com Signed-off-by: Song Liu --- drivers/md/raid1.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 56f2725a996ff..4b30a17421623 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1523,8 +1523,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, * Not if there are too many, or cannot * allocate memory, or a reader on WriteMostly * is waiting for behind writes to flush */ - if (bitmap && - test_bit(WriteMostly, &rdev->flags) && + if (bitmap && write_behind && (atomic_read(&bitmap->behind_writes) < mddev->bitmap_info.max_write_behind) && !waitqueue_active(&bitmap->behind_wait)) { From af50e20afb401cc203bd2a9ff62ece0ae4976103 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 14 Aug 2023 11:27:07 +0200 Subject: [PATCH 103/113] md/raid0: Factor out helper for mapping and submitting a bio Factor out helper function for mapping and submitting a bio out of raid0_make_request(). We will use it later for submitting both parts of a split bio. 
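After this refactoring, raid0_make_request() reduces to roughly the following (a sketch; flush, discard and error handling elided):

	if (sectors < bio_sectors(bio)) {
		struct bio *split = bio_split(bio, sectors, GFP_NOIO,
					      &mddev->bio_set);
		bio_chain(split, bio);
		submit_bio_noacct(bio);		/* remainder, still unmapped here */
		bio = split;
	}
	raid0_map_submit_bio(mddev, bio);

as shown in the diff below; a later patch in this series maps the remainder as well.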
Signed-off-by: Jan Kara
Reviewed-by: Yu Kuai
Link: https://lore.kernel.org/r/20230814092720.3931-1-jack@suse.cz
Signed-off-by: Song Liu
---
 drivers/md/raid0.c | 79 +++++++++++++++++++++++-----------------------
 1 file changed, 40 insertions(+), 39 deletions(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 4106d943aae75..91b24c510b7f6 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -545,54 +545,21 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
 	bio_endio(bio);
 }
 
-static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
+static void raid0_map_submit_bio(struct mddev *mddev, struct bio *bio)
 {
 	struct r0conf *conf = mddev->private;
 	struct strip_zone *zone;
 	struct md_rdev *tmp_dev;
-	sector_t bio_sector;
-	sector_t sector;
-	sector_t orig_sector;
-	unsigned chunk_sects;
-	unsigned sectors;
-
-	if (unlikely(bio->bi_opf & REQ_PREFLUSH)
-	    && md_flush_request(mddev, bio))
-		return true;
-
-	if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) {
-		raid0_handle_discard(mddev, bio);
-		return true;
-	}
-
-	bio_sector = bio->bi_iter.bi_sector;
-	sector = bio_sector;
-	chunk_sects = mddev->chunk_sectors;
-
-	sectors = chunk_sects -
-		(likely(is_power_of_2(chunk_sects))
-		 ? (sector & (chunk_sects-1))
-		 : sector_div(sector, chunk_sects));
-
-	/* Restore due to sector_div */
-	sector = bio_sector;
-
-	if (sectors < bio_sectors(bio)) {
-		struct bio *split = bio_split(bio, sectors, GFP_NOIO,
-					      &mddev->bio_set);
-		bio_chain(split, bio);
-		submit_bio_noacct(bio);
-		bio = split;
-	}
+	sector_t bio_sector = bio->bi_iter.bi_sector;
+	sector_t sector = bio_sector;
 
 	if (bio->bi_pool != &mddev->bio_set)
 		md_account_bio(mddev, &bio);
 
-	orig_sector = sector;
 	zone = find_zone(mddev->private, &sector);
 	switch (conf->layout) {
 	case RAID0_ORIG_LAYOUT:
-		tmp_dev = map_sector(mddev, zone, orig_sector, &sector);
+		tmp_dev = map_sector(mddev, zone, bio_sector, &sector);
 		break;
 	case RAID0_ALT_MULTIZONE_LAYOUT:
 		tmp_dev = map_sector(mddev, zone, sector, &sector);
@@ -600,13 +567,13 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
 	default:
 		WARN(1, "md/raid0:%s: Invalid layout\n", mdname(mddev));
 		bio_io_error(bio);
-		return true;
+		return;
 	}
 
 	if (unlikely(is_rdev_broken(tmp_dev))) {
 		bio_io_error(bio);
 		md_error(mddev, tmp_dev);
-		return true;
+		return;
 	}
 
 	bio_set_dev(bio, tmp_dev->bdev);
@@ -618,6 +585,40 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
 			      bio_sector);
 	mddev_check_write_zeroes(mddev, bio);
 	submit_bio_noacct(bio);
+}
+
+static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
+{
+	sector_t sector;
+	unsigned chunk_sects;
+	unsigned sectors;
+
+	if (unlikely(bio->bi_opf & REQ_PREFLUSH)
+	    && md_flush_request(mddev, bio))
+		return true;
+
+	if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) {
+		raid0_handle_discard(mddev, bio);
+		return true;
+	}
+
+	sector = bio->bi_iter.bi_sector;
+	chunk_sects = mddev->chunk_sectors;
+
+	sectors = chunk_sects -
+		(likely(is_power_of_2(chunk_sects))
+		 ? (sector & (chunk_sects-1))
+		 : sector_div(sector, chunk_sects));
+
+	if (sectors < bio_sectors(bio)) {
+		struct bio *split = bio_split(bio, sectors, GFP_NOIO,
+					      &mddev->bio_set);
+		bio_chain(split, bio);
+		submit_bio_noacct(bio);
+		bio = split;
+	}
+
+	raid0_map_submit_bio(mddev, bio);
 	return true;
 }

From 319ff40a542736d67e5bce18635de35d0e7a0bff Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Mon, 14 Aug 2023 11:27:08 +0200
Subject: [PATCH 104/113] md/raid0: Fix performance regression for large sequential writes

Commit f00d7c85be9e ("md/raid0: fix up bio splitting.") among other things changed how a bio that needs to be split is submitted. Before this commit, we split the bio, then mapped and submitted each part. After this commit, we map only the first part of the split bio and submit the second part unmapped. Due to bio sorting in __submit_bio_noacct() this results in the following request ordering:

  9,0   18     1181     0.525037895 15995  Q  WS 1479315464 + 63392

  Split off chunk-sized (1024 sectors) request:

  9,0   18     1182     0.629019647 15995  X  WS 1479315464 / 1479316488

  Request is unaligned to the chunk so it's split in raid0_make_request(). This is the first part mapped and punted to bio_list:

  8,0   18     7053     0.629020455 15995  A  WS 739921928 + 1016 <- (9,0) 1479315464

  Now raid0_make_request() returns, the second part is postponed on bio_list. __submit_bio_noacct() resorts the bio_list, the mapped request is submitted to the underlying device:

  8,0   18     7054     0.629022782 15995  G  WS 739921928 + 1016

  Now we take another request from the bio_list, which is the remainder of the original huge request. Split off another chunk-sized bit from it and the situation repeats:

  9,0   18     1183     0.629024499 15995  X  WS 1479316488 / 1479317512
  8,16  18     6998     0.629025110 15995  A  WS 739921928 + 1016 <- (9,0) 1479316488
  8,16  18     6999     0.629026728 15995  G  WS 739921928 + 1016
  ...
  9,0   18     1184     0.629032940 15995  X  WS 1479317512 / 1479318536 [libnetacq-write]
  8,0   18     7059     0.629033294 15995  A  WS 739922952 + 1016 <- (9,0) 1479317512
  8,0   18     7060     0.629033902 15995  G  WS 739922952 + 1016
  ...

  This repeats until we consume the whole original huge request. Now we finally get to processing the second parts of the split off requests (in reverse order):

  8,16  18     7181     0.629161384 15995  A  WS 739952640 + 8 <- (9,0) 1479377920
  8,0   18     7239     0.629162140 15995  A  WS 739952640 + 8 <- (9,0) 1479376896
  8,16  18     7186     0.629163881 15995  A  WS 739951616 + 8 <- (9,0) 1479375872
  8,0   18     7242     0.629164421 15995  A  WS 739951616 + 8 <- (9,0) 1479374848
  ...

I guess it is obvious that this IO pattern is an extremely inefficient way to perform sequential IO. It also makes the bio_list grow to rather long lengths. Change raid0_make_request() to map both parts of the split bio. Since we know we are provided with at most chunk-sized bios, we will always need to split the incoming bio at most once.
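With the one-line change below, the split path maps both halves immediately, condensed:

	if (sectors < bio_sectors(bio)) {
		struct bio *split = bio_split(bio, sectors, GFP_NOIO,
					      &mddev->bio_set);
		bio_chain(split, bio);
		raid0_map_submit_bio(mddev, bio);	/* map the remainder now */
		bio = split;
	}
	raid0_map_submit_bio(mddev, bio);

so the unmapped remainder no longer bounces through the bio_list.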
Fixes: f00d7c85be9e ("md/raid0: fix up bio splitting.")
Signed-off-by: Jan Kara
Reviewed-by: Yu Kuai
Link: https://lore.kernel.org/r/20230814092720.3931-2-jack@suse.cz
Signed-off-by: Song Liu
---
 drivers/md/raid0.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 91b24c510b7f6..abbd77977f984 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -614,7 +614,7 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
 		struct bio *split = bio_split(bio, sectors, GFP_NOIO,
 					      &mddev->bio_set);
 		bio_chain(split, bio);
-		submit_bio_noacct(bio);
+		raid0_map_submit_bio(mddev, bio);
 		bio = split;
 	}

From cc22b5407e9ca76adb7efeed843146510b1b72a5 Mon Sep 17 00:00:00 2001
From: David Jeffery
Date: Wed, 16 Aug 2023 14:13:55 -0400
Subject: [PATCH 105/113] md: raid0: account for split bio in iostat accounting

When a bio is split by md raid0, the newly created bio will not be tracked by md for I/O accounting. Only the portion of I/O still assigned to the original bio, which was reduced by the split, will be accounted for. This results in md iostat data sometimes showing I/O values far below the actual amount of data being sent through md. md_account_bio() needs to be called for all bios generated by the bio split.

A simple example of the issue was generated using a raid0 device on partitions of the same device. Since all raid0 I/O then goes to one device, it is easy to see a gap between the md device and its sd storage. Reading an lvm device on top of the md device, the iostat output (some 0 columns and extra devices removed to make the data more compact) was:

Device             tps    kB_read/s    kB_wrtn/s    kB_dscd/s    kB_read
md2               0.00         0.00         0.00         0.00          0
sde               0.00         0.00         0.00         0.00          0
md2            1364.00    411496.00         0.00         0.00     411496
sde            1734.00    646144.00         0.00         0.00     646144
md2            1699.00    510680.00         0.00         0.00     510680
sde            2155.00    802784.00         0.00         0.00     802784
md2             803.00    241480.00         0.00         0.00     241480
sde            1016.00    377888.00         0.00         0.00     377888
md2               0.00         0.00         0.00         0.00          0
sde               0.00         0.00         0.00         0.00          0

I/O was generated doing large direct I/O reads (12M) with dd to a linear lvm volume on top of the 4-leg raid0 device. The md2 reads were showing as roughly 2/3 of the reads to the sde device containing all of md2's raid partitions. The sum of reads to sde was 1826816 kB, which was the expected amount as it was the amount read by dd. With the patch, the total reads from md will match the reads from sde and be consistent with the amount of I/O generated.
Fixes: 10764815ff47 ("md: add io accounting for raid0 and raid5")
Signed-off-by: David Jeffery
Tested-by: Laurence Oberman
Reviewed-by: Laurence Oberman
Reviewed-by: Yu Kuai
Signed-off-by: Song Liu
Link: https://lore.kernel.org/r/20230816181433.13289-1-djeffery@redhat.com
---
 drivers/md/raid0.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index abbd77977f984..c50a7abda744a 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -553,8 +553,7 @@ static void raid0_map_submit_bio(struct mddev *mddev, struct bio *bio)
 	sector_t bio_sector = bio->bi_iter.bi_sector;
 	sector_t sector = bio_sector;
 
-	if (bio->bi_pool != &mddev->bio_set)
-		md_account_bio(mddev, &bio);
+	md_account_bio(mddev, &bio);
 
 	zone = find_zone(mddev->private, &sector);
 	switch (conf->layout) {

From 851e06297f20bbd85c93bbf09469f2150d1db218 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Thu, 10 Aug 2023 20:43:26 +0800
Subject: [PATCH 106/113] ublk: zoned: support REQ_OP_ZONE_RESET_ALL

There isn't any reason not to support REQ_OP_ZONE_RESET_ALL given that everything is actually handled in userspace, not to mention it is pretty easy to support RESET_ALL.

So enable REQ_OP_ZONE_RESET_ALL and let userspace handle it.

Verified by 'tools/zbc_reset_zone -all /dev/ublkb0' in libzbc[1] with a libublk-rs based ublk-zoned target prototype[2]. The following command line creates the ublk-zoned device:

cargo run --example zoned -- add -1 1024   # add $dev_id $DEV_SIZE

[1] https://github.com/westerndigitalcorporation/libzbc
[2] https://github.com/ming1/libublk-rs/tree/zoned.v2

Cc: Niklas Cassel
Cc: Damien Le Moal
Cc: Andreas Hindborg
Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20230810124326.321472-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
 drivers/block/ublk_drv.c      | 7 +++++--
 include/uapi/linux/ublk_cmd.h | 1 +
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 6ecd728ee5cea..2d9cb59f8027f 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -251,6 +251,7 @@ static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
 	const struct ublk_param_zoned *p = &ub->params.zoned;
 
 	disk_set_zoned(ub->ub_disk, BLK_ZONED_HM);
+	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue);
 	blk_queue_required_elevator_features(ub->ub_disk->queue,
 					     ELEVATOR_F_ZBD_SEQ_WRITE);
 	disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
@@ -393,6 +394,9 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
 	case REQ_OP_ZONE_APPEND:
 		ublk_op = UBLK_IO_OP_ZONE_APPEND;
 		break;
+	case REQ_OP_ZONE_RESET_ALL:
+		ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
+		break;
 	case REQ_OP_DRV_IN:
 		ublk_op = pdu->operation;
 		switch (ublk_op) {
@@ -404,9 +408,8 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
 		default:
 			return BLK_STS_IOERR;
 		}
-	case REQ_OP_ZONE_RESET_ALL:
 	case REQ_OP_DRV_OUT:
-		/* We do not support reset_all and drv_out */
+		/* We do not support drv_out */
 		return BLK_STS_NOTSUPP;
 	default:
 		return BLK_STS_IOERR;

diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 2685e53e47521..b9cfc5c962682 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -245,6 +245,7 @@ struct ublksrv_ctrl_dev_info {
 #define		UBLK_IO_OP_ZONE_CLOSE		11
 #define		UBLK_IO_OP_ZONE_FINISH		12
 #define		UBLK_IO_OP_ZONE_APPEND		13
+#define		UBLK_IO_OP_ZONE_RESET_ALL	14
 #define		UBLK_IO_OP_ZONE_RESET		15
 /*
  * Construct a zone report. The report request is carried in `struct
From e1dd7bc93029024af5688253b0c05181d6e01f8e Mon Sep 17 00:00:00 2001
From: Chengming Zhou
Date: Mon, 21 Aug 2023 17:56:00 +0800
Subject: [PATCH 107/113] blk-mq: fix tags leak when shrink nr_hw_queues

Although we don't need to realloc set->tags[] when shrinking nr_hw_queues, we do need to free them. Otherwise, those tags are leaked.

How to reproduce:
1. mount -t configfs configfs /mnt
2. modprobe null_blk nr_devices=0 submit_queues=8
3. mkdir /mnt/nullb/nullb0
4. echo 1 > /mnt/nullb/nullb0/power
5. echo 4 > /mnt/nullb/nullb0/submit_queues
6. rmdir /mnt/nullb/nullb0

In step 4, 9 tags are allocated (8 submit queues and 1 poll queue); then in step 5, new_nr_hw_queues = 5 (4 submit queues and 1 poll queue). At last, in step 6, only these 5 tags are freed; the other 4 tags are leaked.

Signed-off-by: Chengming Zhou
Reviewed-by: Ming Lei
Link: https://lore.kernel.org/r/20230821095602.70742-1-chengming.zhou@linux.dev
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 687ec3f4f10d2..afad6d06eaf7e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4379,9 +4379,13 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
 				       int new_nr_hw_queues)
 {
 	struct blk_mq_tags **new_tags;
+	int i;
 
-	if (set->nr_hw_queues >= new_nr_hw_queues)
+	if (set->nr_hw_queues >= new_nr_hw_queues) {
+		for (i = new_nr_hw_queues; i < set->nr_hw_queues; i++)
+			__blk_mq_free_map_and_rqs(set, i);
 		goto done;
+	}
 
 	new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
 				GFP_KERNEL, set->numa_node);

From 2bc4d7a355a4d617452eaf1b21d6d261194b3667 Mon Sep 17 00:00:00 2001
From: Chengming Zhou
Date: Mon, 21 Aug 2023 17:56:01 +0800
Subject: [PATCH 108/113] blk-mq: delete redundant tagset map update when fallback

When increasing nr_hw_queues fails, the fallback path will use blk_mq_update_queue_map() to clear and update all maps. The extra update of the HCTX_TYPE_DEFAULT map alone is therefore not needed, so delete it.

Signed-off-by: Chengming Zhou
Reviewed-by: Ming Lei
Link: https://lore.kernel.org/r/20230821095602.70742-2-chengming.zhou@linux.dev
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index afad6d06eaf7e..22397ba815ca9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4730,7 +4730,6 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 			__blk_mq_free_map_and_rqs(set, i);
 
 		set->nr_hw_queues = prev_nr_hw_queues;
-		blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
 		goto fallback;
 	}
 	blk_mq_map_swqueue(q);

From 7222657e51b5626d10154b3e48ad441c33b5da96 Mon Sep 17 00:00:00 2001
From: Chengming Zhou
Date: Mon, 21 Aug 2023 17:56:02 +0800
Subject: [PATCH 109/113] blk-mq: prealloc tags when increase tagset nr_hw_queues

Just like blk_mq_alloc_tag_set(), it's better to prepare all tags before they are used to map to queue ctxs in blk_mq_map_swqueue(), which otherwise has to consider empty entries in set->tags[].

The good point is that we can fall back easily if increasing nr_hw_queues fails, instead of just mapping to hctx[0] on failure in blk_mq_map_swqueue(). And the fallback path already has tags free & clean handling, so all is good.
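Condensed, the grow path in blk_mq_realloc_tag_set_tags() becomes (see the diff below):

	for (i = set->nr_hw_queues; i < new_nr_hw_queues; i++) {
		if (!__blk_mq_alloc_map_and_rqs(set, i)) {
			/* unwind only the queues added by this resize */
			while (--i >= set->nr_hw_queues)
				__blk_mq_free_map_and_rqs(set, i);
			return -ENOMEM;
		}
		cond_resched();
	}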
Signed-off-by: Chengming Zhou Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20230821095602.70742-3-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/blk-mq.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index 22397ba815ca9..84400157c5f4e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4397,6 +4397,16 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, sizeof(*set->tags)); kfree(set->tags); set->tags = new_tags; + + for (i = set->nr_hw_queues; i < new_nr_hw_queues; i++) { + if (!__blk_mq_alloc_map_and_rqs(set, i)) { + while (--i >= set->nr_hw_queues) + __blk_mq_free_map_and_rqs(set, i); + return -ENOMEM; + } + cond_resched(); + } + done: set->nr_hw_queues = new_nr_hw_queues; return 0; From 9fb10726ecc5145550180aec4fd0adf0a7b1d634 Mon Sep 17 00:00:00 2001 From: Greg Joyce Date: Fri, 21 Jul 2023 16:15:32 -0500 Subject: [PATCH 110/113] block: sed-opal: Implement IOC_OPAL_DISCOVERY Add IOC_OPAL_DISCOVERY ioctl to return raw discovery data to a SED Opal application. This allows the application to display drive capabilities and state. Signed-off-by: Greg Joyce Reviewed-by: Christoph Hellwig Reviewed-by: Jonathan Derrick Acked-by: Jarkko Sakkinen Link: https://lore.kernel.org/r/20230721211534.3437070-2-gjoyce@linux.vnet.ibm.com Signed-off-by: Jens Axboe --- block/sed-opal.c | 38 ++++++++++++++++++++++++++++++++--- include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 6 ++++++ 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index c18339446ef37..67c6c4f2b4b0f 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -463,8 +463,11 @@ static int execute_steps(struct opal_dev *dev, return error; } -static int opal_discovery0_end(struct opal_dev *dev) +static int opal_discovery0_end(struct opal_dev *dev, void *data) { + struct opal_discovery *discv_out = data; /* may be NULL */ + u8 __user *buf_out; + u64 len_out; bool found_com_id = false, supported = true, single_user = false; const struct d0_header *hdr = (struct d0_header *)dev->resp; const u8 *epos = dev->resp, *cpos = dev->resp; @@ -480,6 +483,15 @@ static int opal_discovery0_end(struct opal_dev *dev) return -EFAULT; } + if (discv_out) { + buf_out = (u8 __user *)(uintptr_t)discv_out->data; + len_out = min_t(u64, discv_out->size, hlen); + if (buf_out && copy_to_user(buf_out, dev->resp, len_out)) + return -EFAULT; + + discv_out->size = hlen; /* actual size of data */ + } + epos += hlen; /* end of buffer */ cpos += sizeof(*hdr); /* current position on buffer */ @@ -565,13 +577,13 @@ static int opal_discovery0(struct opal_dev *dev, void *data) if (ret) return ret; - return opal_discovery0_end(dev); + return opal_discovery0_end(dev, data); } static int opal_discovery0_step(struct opal_dev *dev) { const struct opal_step discovery0_step = { - opal_discovery0, + opal_discovery0, NULL }; return execute_step(dev, &discovery0_step, 0); @@ -2435,6 +2447,22 @@ static int opal_secure_erase_locking_range(struct opal_dev *dev, return ret; } +static int opal_get_discv(struct opal_dev *dev, struct opal_discovery *discv) +{ + const struct opal_step discovery0_step = { + opal_discovery0, discv + }; + int ret = 0; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_step(dev, &discovery0_step, 0); + mutex_unlock(&dev->dev_lock); + if (ret) + return ret; + return discv->size; /* modified to actual length of data */ +} + static int opal_erase_locking_range(struct opal_dev *dev, struct opal_session_info 
*opal_session) { @@ -3056,6 +3084,10 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_GET_GEOMETRY: ret = opal_get_geometry(dev, arg); break; + case IOC_OPAL_DISCOVERY: + ret = opal_get_discv(dev, p); + break; + default: break; } diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index bbae1e52ab4f5..ef65f589fbebb 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -47,6 +47,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_GET_STATUS: case IOC_OPAL_GET_LR_STATUS: case IOC_OPAL_GET_GEOMETRY: + case IOC_OPAL_DISCOVERY: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index dc2efd345133f..7f5732c5bdc50 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -173,6 +173,11 @@ struct opal_geometry { __u8 __align[3]; }; +struct opal_discovery { + __u64 data; + __u64 size; +}; + #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) #define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) #define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) @@ -192,5 +197,6 @@ struct opal_geometry { #define IOC_OPAL_GET_STATUS _IOR('p', 236, struct opal_status) #define IOC_OPAL_GET_LR_STATUS _IOW('p', 237, struct opal_lr_status) #define IOC_OPAL_GET_GEOMETRY _IOR('p', 238, struct opal_geometry) +#define IOC_OPAL_DISCOVERY _IOW('p', 239, struct opal_discovery) #endif /* _UAPI_SED_OPAL_H */ From 5c82efc1aee8eb0919aa67a0d2559de5a326bd7c Mon Sep 17 00:00:00 2001 From: Greg Joyce Date: Fri, 21 Jul 2023 16:15:33 -0500 Subject: [PATCH 111/113] block: sed-opal: Implement IOC_OPAL_REVERT_LSP This is used in conjunction with IOC_OPAL_REVERT_TPR to return a drive to Original Factory State without erasing the data. If IOC_OPAL_REVERT_LSP is called with opal_revert_lsp.options bit OPAL_PRESERVE set prior to calling IOC_OPAL_REVERT_TPR, the drive global locking range will not be erased. Signed-off-by: Greg Joyce Reviewed-by: Christoph Hellwig Reviewed-by: Jonathan Derrick Acked-by: Jarkko Sakkinen Link: https://lore.kernel.org/r/20230721211534.3437070-3-gjoyce@linux.vnet.ibm.com Signed-off-by: Jens Axboe --- block/opal_proto.h | 4 ++++ block/sed-opal.c | 40 +++++++++++++++++++++++++++++++++++ include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 11 ++++++++++ 4 files changed, 56 insertions(+) diff --git a/block/opal_proto.h b/block/opal_proto.h index a4e56845dd826..dec7ce3a3edb7 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -225,6 +225,10 @@ enum opal_parameter { OPAL_SUM_SET_LIST = 0x060000, }; +enum opal_revertlsp { + OPAL_KEEP_GLOBAL_RANGE_KEY = 0x060000, +}; + /* Packets derived from: * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 * Secion: 3.2.3 ComPackets, Packets & Subpackets diff --git a/block/sed-opal.c b/block/sed-opal.c index 67c6c4f2b4b0f..e2aed7f4ebdf6 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1769,6 +1769,26 @@ static int internal_activate_user(struct opal_dev *dev, void *data) return finalize_and_send(dev, parse_and_check_status); } +static int revert_lsp(struct opal_dev *dev, void *data) +{ + struct opal_revert_lsp *rev = data; + int err; + + err = cmd_start(dev, opaluid[OPAL_THISSP_UID], + opalmethod[OPAL_REVERTSP]); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u64(&err, dev, OPAL_KEEP_GLOBAL_RANGE_KEY); + add_token_u8(&err, dev, (rev->options & OPAL_PRESERVE) ? 
+ OPAL_TRUE : OPAL_FALSE); + add_token_u8(&err, dev, OPAL_ENDNAME); + if (err) { + pr_debug("Error building REVERT SP command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + static int erase_locking_range(struct opal_dev *dev, void *data) { struct opal_session_info *session = data; @@ -2463,6 +2483,23 @@ static int opal_get_discv(struct opal_dev *dev, struct opal_discovery *discv) return discv->size; /* modified to actual length of data */ } +static int opal_revertlsp(struct opal_dev *dev, struct opal_revert_lsp *rev) +{ + /* controller will terminate session */ + const struct opal_step steps[] = { + { start_admin1LSP_opal_session, &rev->key }, + { revert_lsp, rev } + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, steps, ARRAY_SIZE(steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + static int opal_erase_locking_range(struct opal_dev *dev, struct opal_session_info *opal_session) { @@ -3084,6 +3121,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_GET_GEOMETRY: ret = opal_get_geometry(dev, arg); break; + case IOC_OPAL_REVERT_LSP: + ret = opal_revertlsp(dev, p); + break; case IOC_OPAL_DISCOVERY: ret = opal_get_discv(dev, p); break; diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index ef65f589fbebb..2f189546e1338 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -48,6 +48,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_GET_LR_STATUS: case IOC_OPAL_GET_GEOMETRY: case IOC_OPAL_DISCOVERY: + case IOC_OPAL_REVERT_LSP: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 7f5732c5bdc50..4e10675751b48 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -56,6 +56,10 @@ struct opal_key { __u8 key[OPAL_KEY_MAX]; }; +enum opal_revert_lsp_opts { + OPAL_PRESERVE = 0x01, +}; + struct opal_lr_act { struct opal_key key; __u32 sum; @@ -178,6 +182,12 @@ struct opal_discovery { __u64 size; }; +struct opal_revert_lsp { + struct opal_key key; + __u32 options; + __u32 __pad; +}; + #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) #define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) #define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) @@ -198,5 +208,6 @@ struct opal_discovery { #define IOC_OPAL_GET_LR_STATUS _IOW('p', 237, struct opal_lr_status) #define IOC_OPAL_GET_GEOMETRY _IOR('p', 238, struct opal_geometry) #define IOC_OPAL_DISCOVERY _IOW('p', 239, struct opal_discovery) +#define IOC_OPAL_REVERT_LSP _IOW('p', 240, struct opal_revert_lsp) #endif /* _UAPI_SED_OPAL_H */ From 3bfeb61256643281ac4be5b8a57e9d9da3db4335 Mon Sep 17 00:00:00 2001 From: Greg Joyce Date: Fri, 21 Jul 2023 16:15:34 -0500 Subject: [PATCH 112/113] block: sed-opal: keyring support for SED keys Extend the SED block driver so it can alternatively obtain a key from a sed-opal kernel keyring. The SED ioctls will indicate the source of the key, either directly in the ioctl data or from the keyring. This allows the use of SED commands in scripts such as udev scripts so that drives may be automatically unlocked as they become available. 
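As an illustrative sketch only (unlock_from_keyring() is a hypothetical helper; error handling is trimmed, and the key_type field and OPAL_KEYRING value follow the uapi changes in this patch), a userspace unlock that takes its key from the kernel ".sed_opal" keyring instead of the ioctl payload might look like:

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/sed-opal.h>

    /* sketch: unlock the global locking range with the keyring key */
    static int unlock_from_keyring(const char *path)
    {
        struct opal_lock_unlock lk = { 0 };
        int fd, ret;

        fd = open(path, O_RDWR);
        if (fd < 0)
            return -1;
        lk.session.who = OPAL_ADMIN1;
        lk.session.opal_key.lr = 0;                  /* global locking range */
        lk.session.opal_key.key_type = OPAL_KEYRING; /* key comes from keyring */
        lk.l_state = OPAL_RW;
        ret = ioctl(fd, IOC_OPAL_LOCK_UNLOCK, &lk);
        close(fd);
        return ret;
    }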
Signed-off-by: Greg Joyce Reviewed-by: Jonathan Derrick Acked-by: Jarkko Sakkinen Link: https://lore.kernel.org/r/20230721211534.3437070-4-gjoyce@linux.vnet.ibm.com Signed-off-by: Jens Axboe --- block/Kconfig | 2 + block/sed-opal.c | 174 +++++++++++++++++++++++++++++++++- include/linux/sed-opal.h | 3 + include/uapi/linux/sed-opal.h | 8 +- 4 files changed, 184 insertions(+), 3 deletions(-) diff --git a/block/Kconfig b/block/Kconfig index 1a13ef0b1ca10..f1364d1c0d93e 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -184,6 +184,8 @@ config BLK_DEBUG_FS_ZONED config BLK_SED_OPAL bool "Logic for interfacing with Opal enabled SEDs" + depends on KEYS + select PSERIES_PLPKS if PPC_PSERIES help Builds Logic for interfacing with Opal enabled controllers. Enabling this option enables users to setup/unlock/lock diff --git a/block/sed-opal.c b/block/sed-opal.c index e2aed7f4ebdf6..6d7f25d1711ba 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -20,6 +20,9 @@ #include #include #include +#include +#include +#include #include "opal_proto.h" @@ -29,6 +32,8 @@ /* Number of bytes needed by cmd_finalize. */ #define CMD_FINALIZE_BYTES_NEEDED 7 +static struct key *sed_opal_keyring; + struct opal_step { int (*fn)(struct opal_dev *dev, void *data); void *data; @@ -269,6 +274,101 @@ static void print_buffer(const u8 *ptr, u32 length) #endif } +/* + * Allocate/update a SED Opal key and add it to the SED Opal keyring. + */ +static int update_sed_opal_key(const char *desc, u_char *key_data, int keylen) +{ + key_ref_t kr; + + if (!sed_opal_keyring) + return -ENOKEY; + + kr = key_create_or_update(make_key_ref(sed_opal_keyring, true), "user", + desc, (const void *)key_data, keylen, + KEY_USR_VIEW | KEY_USR_SEARCH | KEY_USR_WRITE, + KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_BUILT_IN | + KEY_ALLOC_BYPASS_RESTRICTION); + if (IS_ERR(kr)) { + pr_err("Error adding SED key (%ld)\n", PTR_ERR(kr)); + return PTR_ERR(kr); + } + + return 0; +} + +/* + * Read a SED Opal key from the SED Opal keyring. 
+ */ +static int read_sed_opal_key(const char *key_name, u_char *buffer, int buflen) +{ + int ret; + key_ref_t kref; + struct key *key; + + if (!sed_opal_keyring) + return -ENOKEY; + + kref = keyring_search(make_key_ref(sed_opal_keyring, true), + &key_type_user, key_name, true); + + if (IS_ERR(kref)) + ret = PTR_ERR(kref); + + key = key_ref_to_ptr(kref); + down_read(&key->sem); + ret = key_validate(key); + if (ret == 0) { + if (buflen > key->datalen) + buflen = key->datalen; + + ret = key->type->read(key, (char *)buffer, buflen); + } + up_read(&key->sem); + + key_ref_put(kref); + + return ret; +} + +static int opal_get_key(struct opal_dev *dev, struct opal_key *key) +{ + int ret = 0; + + switch (key->key_type) { + case OPAL_INCLUDED: + /* the key is ready to use */ + break; + case OPAL_KEYRING: + /* the key is in the keyring */ + ret = read_sed_opal_key(OPAL_AUTH_KEY, key->key, OPAL_KEY_MAX); + if (ret > 0) { + if (ret > U8_MAX) { + ret = -ENOSPC; + goto error; + } + key->key_len = ret; + key->key_type = OPAL_INCLUDED; + } + break; + default: + ret = -EINVAL; + break; + } + if (ret < 0) + goto error; + + /* must have a PEK by now or it's an error */ + if (key->key_type != OPAL_INCLUDED || key->key_len == 0) { + ret = -EINVAL; + goto error; + } + return 0; +error: + pr_debug("Error getting password: %d\n", ret); + return ret; +} + static bool check_tper(const void *data) { const struct d0_tper_features *tper = data; @@ -2459,6 +2559,9 @@ static int opal_secure_erase_locking_range(struct opal_dev *dev, }; int ret; + ret = opal_get_key(dev, &opal_session->opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps)); @@ -2492,6 +2595,9 @@ static int opal_revertlsp(struct opal_dev *dev, struct opal_revert_lsp *rev) }; int ret; + ret = opal_get_key(dev, &rev->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, steps, ARRAY_SIZE(steps)); @@ -2510,6 +2616,9 @@ static int opal_erase_locking_range(struct opal_dev *dev, }; int ret; + ret = opal_get_key(dev, &opal_session->opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps)); @@ -2538,6 +2647,9 @@ static int opal_enable_disable_shadow_mbr(struct opal_dev *dev, opal_mbr->enable_disable != OPAL_MBR_DISABLE) return -EINVAL; + ret = opal_get_key(dev, &opal_mbr->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); @@ -2563,6 +2675,9 @@ static int opal_set_mbr_done(struct opal_dev *dev, mbr_done->done_flag != OPAL_MBR_NOT_DONE) return -EINVAL; + ret = opal_get_key(dev, &mbr_done->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); @@ -2584,6 +2699,9 @@ static int opal_write_shadow_mbr(struct opal_dev *dev, if (info->size == 0) return 0; + ret = opal_get_key(dev, &info->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); @@ -2641,6 +2759,9 @@ static int opal_add_user_to_lr(struct opal_dev *dev, return -EINVAL; } + ret = opal_get_key(dev, &lk_unlk->session.opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, steps, ARRAY_SIZE(steps)); @@ -2663,6 +2784,10 @@ static int opal_reverttper(struct opal_dev *dev, struct 
opal_key *opal, bool psi int ret; + ret = opal_get_key(dev, opal); + + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); if (psid) @@ -2763,6 +2888,9 @@ static int opal_lock_unlock(struct opal_dev *dev, if (lk_unlk->session.who > OPAL_USER9) return -EINVAL; + ret = opal_get_key(dev, &lk_unlk->session.opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); opal_lock_check_for_saved_key(dev, lk_unlk); ret = __opal_lock_unlock(dev, lk_unlk); @@ -2786,6 +2914,9 @@ static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal) if (!dev) return -ENODEV; + ret = opal_get_key(dev, opal); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, owner_steps, ARRAY_SIZE(owner_steps)); @@ -2808,6 +2939,9 @@ static int opal_activate_lsp(struct opal_dev *dev, if (!opal_lr_act->num_lrs || opal_lr_act->num_lrs > OPAL_MAX_LRS) return -EINVAL; + ret = opal_get_key(dev, &opal_lr_act->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, active_steps, ARRAY_SIZE(active_steps)); @@ -2826,6 +2960,9 @@ static int opal_setup_locking_range(struct opal_dev *dev, }; int ret; + ret = opal_get_key(dev, &opal_lrs->session.opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps)); @@ -2879,6 +3016,14 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw) ret = execute_steps(dev, pw_steps, ARRAY_SIZE(pw_steps)); mutex_unlock(&dev->dev_lock); + if (ret) + return ret; + + /* update keyring with new password */ + ret = update_sed_opal_key(OPAL_AUTH_KEY, + opal_pw->new_user_pw.opal_key.key, + opal_pw->new_user_pw.opal_key.key_len); + return ret; } @@ -2899,6 +3044,9 @@ static int opal_activate_user(struct opal_dev *dev, return -EINVAL; } + ret = opal_get_key(dev, &opal_session->opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, act_steps, ARRAY_SIZE(act_steps)); @@ -2985,6 +3133,9 @@ static int opal_generic_read_write_table(struct opal_dev *dev, { int ret, bit_set; + ret = opal_get_key(dev, &rw_tbl->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); @@ -3053,9 +3204,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (!dev) - return -ENOTSUPP; + return -EOPNOTSUPP; if (!(dev->flags & OPAL_FL_SUPPORTED)) - return -ENOTSUPP; + return -EOPNOTSUPP; if (cmd & IOC_IN) { p = memdup_user(arg, _IOC_SIZE(cmd)); @@ -3137,3 +3288,22 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) return ret; } EXPORT_SYMBOL_GPL(sed_ioctl); + +static int __init sed_opal_init(void) +{ + struct key *kr; + + kr = keyring_alloc(".sed_opal", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), + (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | + KEY_USR_READ | KEY_USR_SEARCH | KEY_USR_WRITE, + KEY_ALLOC_NOT_IN_QUOTA, + NULL, NULL); + if (IS_ERR(kr)) + return PTR_ERR(kr); + + sed_opal_keyring = kr; + + return 0; +} +late_initcall(sed_opal_init); diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 2f189546e1338..2ac50822554e8 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -25,6 +25,9 @@ bool opal_unlock_from_suspend(struct opal_dev *dev); struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv); int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *ioctl_ptr); 
+#define OPAL_AUTH_KEY "opal-boot-pin" +#define OPAL_AUTH_KEY_PREV "opal-boot-pin-prev" + static inline bool is_sed_ioctl(unsigned int cmd) { switch (cmd) { diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 4e10675751b48..d3994b7716bc6 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -49,10 +49,16 @@ enum opal_lock_flags { OPAL_SAVE_FOR_LOCK = 0x01, }; +enum opal_key_type { + OPAL_INCLUDED = 0, /* key[] is the key */ + OPAL_KEYRING, /* key is in keyring */ +}; + struct opal_key { __u8 lr; __u8 key_len; - __u8 __align[6]; + __u8 key_type; + __u8 __align[5]; __u8 key[OPAL_KEY_MAX]; }; From 146afeb235ccec10c17ad8ea26327c0c79dbd968 Mon Sep 17 00:00:00 2001 From: Xu Panda Date: Sat, 3 Dec 2022 14:22:58 +0800 Subject: [PATCH 113/113] block: use strscpy() instead of strncpy() The implementation of strscpy() is more robust and safer, and it is now the recommended way to copy NUL-terminated strings. Signed-off-by: Xu Panda Signed-off-by: Yang Yang Reviewed-by: Justin Stitt Link: https://lore.kernel.org/r/202212031422587503771@zte.com.cn Signed-off-by: Jens Axboe --- block/partitions/cmdline.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c index 1af610f0ba8c6..c03bc105e5753 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c @@ -81,8 +81,7 @@ static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) length = min_t(int, next - partdef, sizeof(new_subpart->name) - 1); - strncpy(new_subpart->name, partdef, length); - new_subpart->name[length] = '\0'; + strscpy(new_subpart->name, partdef, length); partdef = ++next; } else @@ -140,8 +139,7 @@ static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) } length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); - strncpy(newparts->name, bdevdef, length); - newparts->name[length] = '\0'; + strscpy(newparts->name, bdevdef, length); newparts->nr_subparts = 0; next_subpart = &newparts->subpart; @@ -153,8 +151,7 @@ static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) length = (!next) ? (sizeof(buf) - 1) : min_t(int, next - bdevdef, sizeof(buf) - 1); - strncpy(buf, bdevdef, length); - buf[length] = '\0'; + strscpy(buf, bdevdef, length); ret = parse_subpart(next_subpart, buf); if (ret) @@ -267,8 +264,7 @@ static int add_part(int slot, struct cmdline_subpart *subpart, label_min = min_t(int, sizeof(info->volname) - 1, sizeof(subpart->name)); - strncpy(info->volname, subpart->name, label_min); - info->volname[label_min] = '\0'; + strscpy(info->volname, subpart->name, label_min); snprintf(tmp, sizeof(tmp), "(%s)", info->volname); strlcat(state->pp_buf, tmp, PAGE_SIZE);