From deed1904512c7e44f449a522af6676a8640e5989 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 28 Mar 2025 08:25:08 -0600 Subject: [PATCH 01/42] nvme-loop: avoid -Wflex-array-member-not-at-end warning -Wflex-array-member-not-at-end was introduced in GCC-14, and we are getting ready to enable it, globally. Move the conflicting declaration to the end of the structure. Notice that `struct nvme_loop_iod` is a flexible structure --a structure that contains a flexible-array member. Fix the following warning: drivers/nvme/target/loop.c:36:33: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] Signed-off-by: Gustavo A. R. Silva Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/loop.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index a5c41144667c..d02b80803278 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -33,10 +33,12 @@ struct nvme_loop_ctrl { struct list_head list; struct blk_mq_tag_set tag_set; - struct nvme_loop_iod async_event_iod; struct nvme_ctrl ctrl; struct nvmet_port *port; + + /* Must be last --ends in a flexible-array member. */ + struct nvme_loop_iod async_event_iod; }; static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl) From 73becfd6d80307c35999ced6f213a8f16af3b01e Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 15 Apr 2025 08:00:21 +0200 Subject: [PATCH 02/42] nvme-tcp: remove redundant check to ctrl->opts When checking for secure concatenation we have already validated that 'ctrl->opts' is set, so we can remove this check. Reported-by: Dan Carpenter Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index aba365f97cf6..c5dae87bc1c9 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2385,7 +2385,7 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) if (ret) return ret; - if (ctrl->opts && ctrl->opts->concat && !ctrl->tls_pskid) { + if (ctrl->opts->concat && !ctrl->tls_pskid) { /* See comments for nvme_tcp_key_revoke_needed() */ dev_dbg(ctrl->device, "restart admin queue for secure concatenation\n"); nvme_stop_keep_alive(ctrl); From 674f872b7cefab4564ec8b34c799a4a653e34513 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 3 Apr 2025 08:55:20 +0200 Subject: [PATCH 03/42] nvme-tcp: open-code nvme_tcp_queue_request() for R2T When handling an R2T PDU we short-circuit nvme_tcp_queue_request() as we should not attempt to send consecutive PDUs. So open-code nvme_tcp_queue_request() for R2T and drop the last argument. 
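In sketch form, the R2T handler now simply defers to the per-queue io_work instead of going through the generic helper, which may try to send inline from the caller's context:

    /*
     * Sketch of the open-coded R2T path: only queue the request and
     * kick io_work; never attempt a direct send of a consecutive PDU.
     */
    llist_add(&req->lentry, &queue->req_list);
    queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
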
Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg --- drivers/nvme/host/tcp.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index c5dae87bc1c9..853bc67d045c 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -403,7 +403,7 @@ static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue) } static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, - bool sync, bool last) + bool last) { struct nvme_tcp_queue *queue = req->queue; bool empty; @@ -417,7 +417,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, * are on the same cpu, so we don't introduce contention. */ if (queue->io_cpu == raw_smp_processor_id() && - sync && empty && mutex_trylock(&queue->send_mutex)) { + empty && mutex_trylock(&queue->send_mutex)) { nvme_tcp_send_all(queue); mutex_unlock(&queue->send_mutex); } @@ -770,7 +770,9 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, req->ttag = pdu->ttag; nvme_tcp_setup_h2c_data_pdu(req); - nvme_tcp_queue_request(req, false, true); + + llist_add(&req->lentry, &queue->req_list); + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); return 0; } @@ -2637,7 +2639,7 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg) ctrl->async_req.curr_bio = NULL; ctrl->async_req.data_len = 0; - nvme_tcp_queue_request(&ctrl->async_req, true, true); + nvme_tcp_queue_request(&ctrl->async_req, true); } static void nvme_tcp_complete_timed_out(struct request *rq) @@ -2789,7 +2791,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, nvme_start_request(rq); - nvme_tcp_queue_request(req, true, bd->last); + nvme_tcp_queue_request(req, bd->last); return BLK_STS_OK; } From 5df496e9ef12e85dcc21488069bbf1b0285aca00 Mon Sep 17 00:00:00 2001 From: Marcelo Moreira Date: Thu, 3 Apr 2025 20:23:56 -0300 Subject: [PATCH 04/42] nvmet: replace strncpy with strscpy The strncpy() function is deprecated for NUL-terminated strings as explained in the "strncpy() on NUL-terminated strings" section of Documentation/process/deprecated.rst. 
The key issues are: - strncpy() fails to guarantee NULL-termination when source > destination - it unnecessarily zero-pads short strings, causing performance overhead strscpy() is the proper replacement because: - it guarantees NULL-termination - it avoids redundant zero-padding - it aligns with current kernel string-copying best practice memcpy() was rejected because: - NQN buffers (subsysnqn/hostnqn) are treated as NULL-terminated strings: - strcmp() usage in nvmet_host_allowed() (discovery.c) - strscpy() to copy subsysnqn in nvmet_execute_disc_identify() seq_buf wasn't used because: - this is a simple fixed-size buffer copy - there is no need for progressive string construction features Signed-off-by: Marcelo Moreira Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/discovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index df7207640506..c06f3e04296c 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -119,7 +119,7 @@ static void nvmet_format_discovery_entry(struct nvmf_disc_rsp_page_hdr *hdr, memcpy(e->trsvcid, port->disc_addr.trsvcid, NVMF_TRSVCID_SIZE); memcpy(e->traddr, traddr, NVMF_TRADDR_SIZE); memcpy(e->tsas.common, port->disc_addr.tsas.common, NVMF_TSAS_SIZE); - strncpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE); + strscpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE); } /* From 6b868deaa1c3ad5737e59074985efccf39e7fea2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 25 Feb 2025 22:28:40 -0800 Subject: [PATCH 05/42] nvmet-tcp: switch to using the crc32c library Now that the crc32c() library function directly takes advantage of architecture-specific optimizations, it is unnecessary to go through the crypto API. Just use crc32c(). This is much simpler, and it improves performance due to eliminating the crypto API overhead. Signed-off-by: Eric Biggers Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/target/tcp.c | 92 +++++++++++---------------------------- 1 file changed, 26 insertions(+), 66 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 12a5cb8641ca..e6997ce61027 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -17,7 +18,6 @@ #include #include #include -#include #include #include "nvmet.h" @@ -172,8 +172,6 @@ struct nvmet_tcp_queue { /* digest state */ bool hdr_digest; bool data_digest; - struct ahash_request *snd_hash; - struct ahash_request *rcv_hash; /* TLS state */ key_serial_t tls_pskid; @@ -294,14 +292,9 @@ static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue) return queue->data_digest ? 
NVME_TCP_DIGEST_LENGTH : 0; } -static inline void nvmet_tcp_hdgst(struct ahash_request *hash, - void *pdu, size_t len) +static inline void nvmet_tcp_hdgst(void *pdu, size_t len) { - struct scatterlist sg; - - sg_init_one(&sg, pdu, len); - ahash_request_set_crypt(hash, &sg, pdu + len, len); - crypto_ahash_digest(hash); + put_unaligned_le32(~crc32c(~0, pdu, len), pdu + len); } static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue, @@ -318,7 +311,7 @@ static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue, } recv_digest = *(__le32 *)(pdu + hdr->hlen); - nvmet_tcp_hdgst(queue->rcv_hash, pdu, len); + nvmet_tcp_hdgst(pdu, len); exp_digest = *(__le32 *)(pdu + hdr->hlen); if (recv_digest != exp_digest) { pr_err("queue %d: header digest error: recv %#x expected %#x\n", @@ -441,12 +434,24 @@ static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd) return NVME_SC_INTERNAL; } -static void nvmet_tcp_calc_ddgst(struct ahash_request *hash, - struct nvmet_tcp_cmd *cmd) +static void nvmet_tcp_calc_ddgst(struct nvmet_tcp_cmd *cmd) { - ahash_request_set_crypt(hash, cmd->req.sg, - (void *)&cmd->exp_ddgst, cmd->req.transfer_len); - crypto_ahash_digest(hash); + size_t total_len = cmd->req.transfer_len; + struct scatterlist *sg = cmd->req.sg; + u32 crc = ~0; + + while (total_len) { + size_t len = min_t(size_t, total_len, sg->length); + + /* + * Note that the scatterlist does not contain any highmem pages, + * as it was allocated by sgl_alloc() with GFP_KERNEL. + */ + crc = crc32c(crc, sg_virt(sg), len); + total_len -= len; + sg = sg_next(sg); + } + cmd->exp_ddgst = cpu_to_le32(~crc); } static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd) @@ -473,19 +478,18 @@ static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd) if (queue->data_digest) { pdu->hdr.flags |= NVME_TCP_F_DDGST; - nvmet_tcp_calc_ddgst(queue->snd_hash, cmd); + nvmet_tcp_calc_ddgst(cmd); } if (cmd->queue->hdr_digest) { pdu->hdr.flags |= NVME_TCP_F_HDGST; - nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + nvmet_tcp_hdgst(pdu, sizeof(*pdu)); } } static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd) { struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu; - struct nvmet_tcp_queue *queue = cmd->queue; u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); cmd->offset = 0; @@ -503,14 +507,13 @@ static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd) pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done); if (cmd->queue->hdr_digest) { pdu->hdr.flags |= NVME_TCP_F_HDGST; - nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + nvmet_tcp_hdgst(pdu, sizeof(*pdu)); } } static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd) { struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu; - struct nvmet_tcp_queue *queue = cmd->queue; u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); cmd->offset = 0; @@ -523,7 +526,7 @@ static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd) pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst); if (cmd->queue->hdr_digest) { pdu->hdr.flags |= NVME_TCP_F_HDGST; - nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + nvmet_tcp_hdgst(pdu, sizeof(*pdu)); } } @@ -857,42 +860,6 @@ static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue) smp_store_release(&queue->rcv_state, NVMET_TCP_RECV_PDU); } -static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue) -{ - struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash); - - ahash_request_free(queue->rcv_hash); - ahash_request_free(queue->snd_hash); - crypto_free_ahash(tfm); -} - -static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue) -{ - 
struct crypto_ahash *tfm; - - tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL); - if (!queue->snd_hash) - goto free_tfm; - ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL); - - queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL); - if (!queue->rcv_hash) - goto free_snd_hash; - ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL); - - return 0; -free_snd_hash: - ahash_request_free(queue->snd_hash); -free_tfm: - crypto_free_ahash(tfm); - return -ENOMEM; -} - - static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) { struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq; @@ -921,11 +888,6 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE); queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE); - if (queue->hdr_digest || queue->data_digest) { - ret = nvmet_tcp_alloc_crypto(queue); - if (ret) - return ret; - } memset(icresp, 0, sizeof(*icresp)); icresp->hdr.type = nvme_tcp_icresp; @@ -1247,7 +1209,7 @@ static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd) { struct nvmet_tcp_queue *queue = cmd->queue; - nvmet_tcp_calc_ddgst(queue->rcv_hash, cmd); + nvmet_tcp_calc_ddgst(cmd); queue->offset = 0; queue->left = NVME_TCP_DIGEST_LENGTH; queue->rcv_state = NVMET_TCP_RECV_DDGST; @@ -1620,8 +1582,6 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w) /* ->sock will be released by fput() */ fput(queue->sock->file); nvmet_tcp_free_cmds(queue); - if (queue->hdr_digest || queue->data_digest) - nvmet_tcp_free_crypto(queue); ida_free(&nvmet_tcp_queue_ida, queue->idx); page_frag_cache_drain(&queue->pf_cache); kfree(queue); From f791252b649653fe0477da79aa40c82d5bc407de Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 22 Apr 2025 11:15:56 +0200 Subject: [PATCH 06/42] nvme-auth: do not re-authenticate queues with no prior authentication When sending 'connect' the queues can figure out from the return code whether authentication is required or not. But reauthentication doesn't disconnect the queues, so this check is not available. Rather we need to check whether the queue had been authenticated initially to figure out if we need to reauthenticate. 
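Roughly, each per-queue DH-HMAC-CHAP context gains an 'authenticated' flag, and the re-authentication loop skips queues that never authenticated. Simplified sketch of the loop added below:

    for (q = 1; q < ctrl->queue_count; q++) {
        struct nvme_dhchap_queue_context *chap = &ctrl->dhchap_ctxs[q];

        /* Skip queues that were never authenticated initially. */
        if (!chap->authenticated)
            continue;
        cancel_work_sync(&chap->auth_work);
        queue_work(nvme_auth_wq, &chap->auth_work);
    }
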
Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 6115fef74c1e..f6ddbe553289 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -31,6 +31,7 @@ struct nvme_dhchap_queue_context { u32 s1; u32 s2; bool bi_directional; + bool authenticated; u16 transaction; u8 status; u8 dhgroup_id; @@ -682,6 +683,7 @@ static void nvme_auth_reset_dhchap(struct nvme_dhchap_queue_context *chap) static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap) { nvme_auth_reset_dhchap(chap); + chap->authenticated = false; if (chap->shash_tfm) crypto_free_shash(chap->shash_tfm); if (chap->dh_tfm) @@ -930,12 +932,14 @@ static void nvme_queue_auth_work(struct work_struct *work) } if (!ret) { chap->error = 0; + chap->authenticated = true; if (ctrl->opts->concat && (ret = nvme_auth_secure_concat(ctrl, chap))) { dev_warn(ctrl->device, "%s: qid %d failed to enable secure concatenation\n", __func__, chap->qid); chap->error = ret; + chap->authenticated = false; } return; } @@ -1023,13 +1027,16 @@ static void nvme_ctrl_auth_work(struct work_struct *work) return; for (q = 1; q < ctrl->queue_count; q++) { - ret = nvme_auth_negotiate(ctrl, q); - if (ret) { - dev_warn(ctrl->device, - "qid %d: error %d setting up authentication\n", - q, ret); - break; - } + struct nvme_dhchap_queue_context *chap = + &ctrl->dhchap_ctxs[q]; + /* + * Skip re-authentication if the queue had + * not been authenticated initially. + */ + if (!chap->authenticated) + continue; + cancel_work_sync(&chap->auth_work); + queue_work(nvme_auth_wq, &chap->auth_work); } /* @@ -1037,7 +1044,13 @@ static void nvme_ctrl_auth_work(struct work_struct *work) * the controller terminates the connection. */ for (q = 1; q < ctrl->queue_count; q++) { - ret = nvme_auth_wait(ctrl, q); + struct nvme_dhchap_queue_context *chap = + &ctrl->dhchap_ctxs[q]; + if (!chap->authenticated) + continue; + flush_work(&chap->auth_work); + ret = chap->error; + nvme_auth_reset_dhchap(chap); if (ret) dev_warn(ctrl->device, "qid %d: authentication failed\n", q); @@ -1076,6 +1089,7 @@ int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) chap = &ctrl->dhchap_ctxs[i]; chap->qid = i; chap->ctrl = ctrl; + chap->authenticated = false; INIT_WORK(&chap->auth_work, nvme_queue_auth_work); } From c91a20129185d5153cd845c857b4f9fce61e28d1 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 22 Apr 2025 11:15:55 +0200 Subject: [PATCH 07/42] nvmet-auth: authenticate on admin queue only Do not start authentication on I/O queues as it doesn't really add value, and secure concatenation disallows it anyway. Authentication commands on I/O queues are not aborted, so the host may still run the authentication protocol on I/O queues. 
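In sketch form, the status check now only enforces authentication on the admin queue (qid 0); I/O queues are reported as authenticated:

    /* Sketch of the resulting check. */
    if (req->sq->ctrl->host_key) {
        if (req->sq->qid > 0)
            return true;            /* I/O queue: not enforced */
        if (!req->sq->authenticated)
            return false;           /* admin queue still pending */
    }
    return true;
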
Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/auth.c | 9 ++++++--- drivers/nvme/target/fabrics-cmd.c | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index 9429b8218408..111dfaaa14a7 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -280,9 +280,12 @@ void nvmet_destroy_auth(struct nvmet_ctrl *ctrl) bool nvmet_check_auth_status(struct nvmet_req *req) { - if (req->sq->ctrl->host_key && - !req->sq->authenticated) - return false; + if (req->sq->ctrl->host_key) { + if (req->sq->qid > 0) + return true; + if (!req->sq->authenticated) + return false; + } return true; } diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index f012bdf89850..14f55192367e 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -239,8 +239,8 @@ static u32 nvmet_connect_result(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq) bool needs_auth = nvmet_has_auth(ctrl, sq); key_serial_t keyid = nvmet_queue_tls_keyid(sq); - /* Do not authenticate I/O queues for secure concatenation */ - if (ctrl->concat && sq->qid) + /* Do not authenticate I/O queues */ + if (sq->qid) needs_auth = false; if (keyid) From b3649f829a842f4f09a0912781936fc2f2d02253 Mon Sep 17 00:00:00 2001 From: Wilfred Mallawa Date: Thu, 24 Apr 2025 15:13:49 +1000 Subject: [PATCH 08/42] nvmet: add a helper function for cqid checking This patch adds a new helper function nvmet_check_io_cqid(). It is to be used when parsing host commands for IO CQ creation/deletion and IO SQ creation to ensure that the specified IO completion queue identifier (CQID) is not 0 (Admin queue ID). This is a check that already occurs in the nvmet_execute_x() functions prior to nvmet_check_cqid. With the addition of this helper function, the CQ ID checks in the nvmet_execute_x() function can be removed, and instead simply call nvmet_check_io_cqid() in place of nvmet_check_cqid(). 
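The helper itself is a thin wrapper; roughly:

    u16 nvmet_check_io_cqid(struct nvmet_ctrl *ctrl, u16 cqid)
    {
        /* CQID 0 is the admin queue and never valid for an I/O queue. */
        if (!cqid)
            return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
        return nvmet_check_cqid(ctrl, cqid);
    }
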
Signed-off-by: Wilfred Mallawa Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 14 ++------------ drivers/nvme/target/core.c | 7 +++++++ drivers/nvme/target/nvmet.h | 1 + 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index acc138bbf8f2..753166fbb133 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -96,12 +96,7 @@ static void nvmet_execute_delete_cq(struct nvmet_req *req) goto complete; } - if (!cqid) { - status = NVME_SC_QID_INVALID | NVME_STATUS_DNR; - goto complete; - } - - status = nvmet_check_cqid(ctrl, cqid); + status = nvmet_check_io_cqid(ctrl, cqid); if (status != NVME_SC_SUCCESS) goto complete; @@ -127,12 +122,7 @@ static void nvmet_execute_create_cq(struct nvmet_req *req) goto complete; } - if (!cqid) { - status = NVME_SC_QID_INVALID | NVME_STATUS_DNR; - goto complete; - } - - status = nvmet_check_cqid(ctrl, cqid); + status = nvmet_check_io_cqid(ctrl, cqid); if (status != NVME_SC_SUCCESS) goto complete; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 245475c43127..3fc2b548b36f 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -859,6 +859,13 @@ u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid) return NVME_SC_SUCCESS; } +u16 nvmet_check_io_cqid(struct nvmet_ctrl *ctrl, u16 cqid) +{ + if (!cqid) + return NVME_SC_QID_INVALID | NVME_STATUS_DNR; + return nvmet_check_cqid(ctrl, cqid); +} + u16 nvmet_cq_create(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, u16 size) { diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index b6db8b74dc4a..2f70db1284c9 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -572,6 +572,7 @@ void nvmet_execute_get_features(struct nvmet_req *req); void nvmet_execute_keep_alive(struct nvmet_req *req); u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid); +u16 nvmet_check_io_cqid(struct nvmet_ctrl *ctrl, u16 cqid); void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, u16 size); u16 nvmet_cq_create(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, From cbc5acdbbcf7dc11b64ab09efd21f6bd02d77d02 Mon Sep 17 00:00:00 2001 From: Wilfred Mallawa Date: Thu, 24 Apr 2025 15:13:50 +1000 Subject: [PATCH 09/42] nvmet: cq: prepare for completion queue sharing For the PCI transport, the NVMe specification allows submission queues to share completion queues, however, this is not supported in the current NVMe target implementation. This is a preparatory patch to allow for completion queue (CQ) sharing between different submission queues (SQ). To support queue sharing, reference counting completion queues is required. This patch adds the refcount_t field ref to struct nvmet_cq coupled with respective nvmet_cq_init(), nvmet_cq_get(), nvmet_cq_put(), nvmet_cq_is_deletable() and nvmet_cq_destroy() functions. A CQ reference count is initialized with nvmet_cq_init() when a CQ is created. Using nvmet_cq_get(), a reference to a CQ is taken when an SQ is created that uses the respective CQ. Similarly. when an SQ is destroyed, the reference count to the respective CQ from the SQ being destroyed is decremented with nvmet_cq_put(). The last reference to a CQ is dropped on a CQ deletion using nvmet_cq_put(), which invokes nvmet_cq_destroy() to fully cleanup after the CQ. 
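The lifetime rules above follow the usual refcount_t pattern; a simplified sketch of the helpers introduced below:

    void nvmet_cq_init(struct nvmet_cq *cq)
    {
        refcount_set(&cq->ref, 1);          /* reference held by the CQ itself */
    }

    bool nvmet_cq_get(struct nvmet_cq *cq)
    {
        return refcount_inc_not_zero(&cq->ref);   /* one reference per SQ */
    }

    void nvmet_cq_put(struct nvmet_cq *cq)
    {
        if (refcount_dec_and_test(&cq->ref))
            nvmet_cq_destroy(cq);           /* last user dropped the CQ */
    }
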
The helper function nvmet_cq_in_use() is used to determine if any SQs are still using the CQ pending deletion. In which case, the CQ must not be deleted. This should protect scenarios where a bad host may attempt to delete a CQ without first having deleted SQ(s) using that CQ. Additionally, this patch adds an array of struct nvmet_cq to the nvmet_ctrl structure. This allows for the controller to keep track of CQs as they are created and destroyed, similar to the current tracking done for SQs. The memory for this array is freed when the controller is freed. A struct nvmet_ctrl reference is also added to the nvmet_cq structure to allow for CQs to be removed from the controller whilst keeping the new API similar to the existing API for SQs. Sample callchain with CQ refcounting for the PCI endpoint target (pci-epf): i. nvmet_execute_create_cq -> nvmet_pci_epf_create_cq -> nvmet_cq_create -> nvmet_cq_init [cq refcount=1] ii. nvmet_execute_create_sq -> nvmet_pci_epf_create_sq -> nvmet_sq_create -> nvmet_sq_init -> nvmet_cq_get [cq refcount=2] iii. nvmet_execute_delete_sq - > nvmet_pci_epf_delete_sq -> -> nvmet_sq_destroy -> nvmet_cq_put [cq refcount 1] iv. nvmet_execute_delete_cq -> nvmet_pci_epf_delete_cq -> nvmet_cq_put [cq refcount 0] Signed-off-by: Wilfred Mallawa Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 4 +- drivers/nvme/target/core.c | 68 +++++++++++++++++++++++++-------- drivers/nvme/target/nvmet.h | 12 +++++- drivers/nvme/target/pci-epf.c | 1 + 4 files changed, 66 insertions(+), 19 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 753166fbb133..5e3699973d56 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -96,7 +96,7 @@ static void nvmet_execute_delete_cq(struct nvmet_req *req) goto complete; } - status = nvmet_check_io_cqid(ctrl, cqid); + status = nvmet_check_io_cqid(ctrl, cqid, false); if (status != NVME_SC_SUCCESS) goto complete; @@ -122,7 +122,7 @@ static void nvmet_execute_create_cq(struct nvmet_req *req) goto complete; } - status = nvmet_check_io_cqid(ctrl, cqid); + status = nvmet_check_io_cqid(ctrl, cqid, true); if (status != NVME_SC_SUCCESS) goto complete; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 3fc2b548b36f..bfaba79f971d 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -813,11 +813,43 @@ void nvmet_req_complete(struct nvmet_req *req, u16 status) } EXPORT_SYMBOL_GPL(nvmet_req_complete); +void nvmet_cq_init(struct nvmet_cq *cq) +{ + refcount_set(&cq->ref, 1); +} +EXPORT_SYMBOL_GPL(nvmet_cq_init); + +bool nvmet_cq_get(struct nvmet_cq *cq) +{ + return refcount_inc_not_zero(&cq->ref); +} +EXPORT_SYMBOL_GPL(nvmet_cq_get); + +void nvmet_cq_put(struct nvmet_cq *cq) +{ + if (refcount_dec_and_test(&cq->ref)) + nvmet_cq_destroy(cq); +} +EXPORT_SYMBOL_GPL(nvmet_cq_put); + void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, u16 size) { cq->qid = qid; cq->size = size; + + ctrl->cqs[qid] = cq; +} + +void nvmet_cq_destroy(struct nvmet_cq *cq) +{ + struct nvmet_ctrl *ctrl = cq->ctrl; + + if (ctrl) { + ctrl->cqs[cq->qid] = NULL; + nvmet_ctrl_put(cq->ctrl); + cq->ctrl = NULL; + } } void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, @@ -837,41 +869,39 @@ static void nvmet_confirm_sq(struct percpu_ref *ref) complete(&sq->confirm_done); } -u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid) +u16 nvmet_check_cqid(struct 
nvmet_ctrl *ctrl, u16 cqid, bool create) { - if (!ctrl->sqs) + if (!ctrl->cqs) return NVME_SC_INTERNAL | NVME_STATUS_DNR; if (cqid > ctrl->subsys->max_qid) return NVME_SC_QID_INVALID | NVME_STATUS_DNR; - /* - * Note: For PCI controllers, the NVMe specifications allows multiple - * SQs to share a single CQ. However, we do not support this yet, so - * check that there is no SQ defined for a CQ. If one exist, then the - * CQ ID is invalid for creation as well as when the CQ is being - * deleted (as that would mean that the SQ was not deleted before the - * CQ). - */ - if (ctrl->sqs[cqid]) + if ((create && ctrl->cqs[cqid]) || (!create && !ctrl->cqs[cqid])) return NVME_SC_QID_INVALID | NVME_STATUS_DNR; return NVME_SC_SUCCESS; } -u16 nvmet_check_io_cqid(struct nvmet_ctrl *ctrl, u16 cqid) +u16 nvmet_check_io_cqid(struct nvmet_ctrl *ctrl, u16 cqid, bool create) { if (!cqid) return NVME_SC_QID_INVALID | NVME_STATUS_DNR; - return nvmet_check_cqid(ctrl, cqid); + return nvmet_check_cqid(ctrl, cqid, create); } +bool nvmet_cq_in_use(struct nvmet_cq *cq) +{ + return refcount_read(&cq->ref) > 1; +} +EXPORT_SYMBOL_GPL(nvmet_cq_in_use); + u16 nvmet_cq_create(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, u16 size) { u16 status; - status = nvmet_check_cqid(ctrl, qid); + status = nvmet_check_cqid(ctrl, qid, true); if (status != NVME_SC_SUCCESS) return status; @@ -1619,12 +1649,17 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args) if (!ctrl->sqs) goto out_free_changed_ns_list; + ctrl->cqs = kcalloc(subsys->max_qid + 1, sizeof(struct nvmet_cq *), + GFP_KERNEL); + if (!ctrl->cqs) + goto out_free_sqs; + ret = ida_alloc_range(&cntlid_ida, subsys->cntlid_min, subsys->cntlid_max, GFP_KERNEL); if (ret < 0) { args->status = NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR; - goto out_free_sqs; + goto out_free_cqs; } ctrl->cntlid = ret; @@ -1683,6 +1718,8 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args) mutex_unlock(&subsys->lock); nvmet_stop_keep_alive_timer(ctrl); ida_free(&cntlid_ida, ctrl->cntlid); +out_free_cqs: + kfree(ctrl->cqs); out_free_sqs: kfree(ctrl->sqs); out_free_changed_ns_list: @@ -1719,6 +1756,7 @@ static void nvmet_ctrl_free(struct kref *ref) nvmet_async_events_free(ctrl); kfree(ctrl->sqs); + kfree(ctrl->cqs); kfree(ctrl->changed_ns_list); kfree(ctrl); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 2f70db1284c9..c87f6fb458e8 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -141,8 +141,10 @@ static inline struct device *nvmet_ns_dev(struct nvmet_ns *ns) } struct nvmet_cq { + struct nvmet_ctrl *ctrl; u16 qid; u16 size; + refcount_t ref; }; struct nvmet_sq { @@ -247,6 +249,7 @@ struct nvmet_pr_log_mgr { struct nvmet_ctrl { struct nvmet_subsys *subsys; struct nvmet_sq **sqs; + struct nvmet_cq **cqs; void *drvdata; @@ -571,12 +574,17 @@ void nvmet_execute_set_features(struct nvmet_req *req); void nvmet_execute_get_features(struct nvmet_req *req); void nvmet_execute_keep_alive(struct nvmet_req *req); -u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid); -u16 nvmet_check_io_cqid(struct nvmet_ctrl *ctrl, u16 cqid); +u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid, bool create); +u16 nvmet_check_io_cqid(struct nvmet_ctrl *ctrl, u16 cqid, bool create); +void nvmet_cq_init(struct nvmet_cq *cq); void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, u16 size); u16 nvmet_cq_create(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, u16 size); +void 
nvmet_cq_destroy(struct nvmet_cq *cq); +bool nvmet_cq_get(struct nvmet_cq *cq); +void nvmet_cq_put(struct nvmet_cq *cq); +bool nvmet_cq_in_use(struct nvmet_cq *cq); u16 nvmet_check_sqid(struct nvmet_ctrl *ctrl, u16 sqid, bool create); void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid, u16 size); diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c index 7fab7f3d79b7..7dda4156d86c 100644 --- a/drivers/nvme/target/pci-epf.c +++ b/drivers/nvme/target/pci-epf.c @@ -1346,6 +1346,7 @@ static u16 nvmet_pci_epf_delete_cq(struct nvmet_ctrl *tctrl, u16 cqid) nvmet_pci_epf_drain_queue(cq); nvmet_pci_epf_remove_irq_vector(ctrl, cq->vector); nvmet_pci_epf_mem_unmap(ctrl->nvme_epf, &cq->pci_map); + tctrl->cqs[cqid] = NULL; return NVME_SC_SUCCESS; } From bb78836b3a7cad311ea40106de8891b18a318620 Mon Sep 17 00:00:00 2001 From: Wilfred Mallawa Date: Thu, 24 Apr 2025 15:13:51 +1000 Subject: [PATCH 10/42] nvmet: fabrics: add CQ init and destroy With struct nvmet_cq now having a reference count, this patch amends the target fabrics call chain to initialize and destroy/put a completion queue. Signed-off-by: Wilfred Mallawa Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fabrics-cmd.c | 8 ++++++++ drivers/nvme/target/fc.c | 3 +++ drivers/nvme/target/loop.c | 13 +++++++++++-- drivers/nvme/target/rdma.c | 3 +++ drivers/nvme/target/tcp.c | 3 +++ 5 files changed, 28 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 14f55192367e..7b8d8b397802 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -208,6 +208,14 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) return NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR; } + kref_get(&ctrl->ref); + old = cmpxchg(&req->cq->ctrl, NULL, ctrl); + if (old) { + pr_warn("queue already connected!\n"); + req->error_loc = offsetof(struct nvmf_connect_command, opcode); + return NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR; + } + /* note: convert queue size from 0's-based value to 1's-based value */ nvmet_cq_setup(ctrl, req->cq, qid, sqsize + 1); nvmet_sq_setup(ctrl, req->sq, qid, sqsize + 1); diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 7b50130f10f6..7c2a4e2eb315 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -816,6 +816,7 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc, nvmet_fc_prep_fcp_iodlist(assoc->tgtport, queue); + nvmet_cq_init(&queue->nvme_cq); ret = nvmet_sq_init(&queue->nvme_sq); if (ret) goto out_fail_iodlist; @@ -826,6 +827,7 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc, return queue; out_fail_iodlist: + nvmet_cq_put(&queue->nvme_cq); nvmet_fc_destroy_fcp_iodlist(assoc->tgtport, queue); destroy_workqueue(queue->work_q); out_free_queue: @@ -934,6 +936,7 @@ nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue) flush_workqueue(queue->work_q); nvmet_sq_destroy(&queue->nvme_sq); + nvmet_cq_put(&queue->nvme_cq); nvmet_fc_tgt_q_put(queue); } diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index d02b80803278..bbb3699c8686 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -275,6 +275,7 @@ static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl) nvme_unquiesce_admin_queue(&ctrl->ctrl); nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); + nvmet_cq_put(&ctrl->queues[0].nvme_cq); 
nvme_remove_admin_tag_set(&ctrl->ctrl); } @@ -304,6 +305,7 @@ static void nvme_loop_destroy_io_queues(struct nvme_loop_ctrl *ctrl) for (i = 1; i < ctrl->ctrl.queue_count; i++) { clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[i].flags); nvmet_sq_destroy(&ctrl->queues[i].nvme_sq); + nvmet_cq_put(&ctrl->queues[i].nvme_cq); } ctrl->ctrl.queue_count = 1; /* @@ -329,9 +331,12 @@ static int nvme_loop_init_io_queues(struct nvme_loop_ctrl *ctrl) for (i = 1; i <= nr_io_queues; i++) { ctrl->queues[i].ctrl = ctrl; + nvmet_cq_init(&ctrl->queues[i].nvme_cq); ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq); - if (ret) + if (ret) { + nvmet_cq_put(&ctrl->queues[i].nvme_cq); goto out_destroy_queues; + } ctrl->ctrl.queue_count++; } @@ -362,9 +367,12 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) int error; ctrl->queues[0].ctrl = ctrl; + nvmet_cq_init(&ctrl->queues[0].nvme_cq); error = nvmet_sq_init(&ctrl->queues[0].nvme_sq); - if (error) + if (error) { + nvmet_cq_put(&ctrl->queues[0].nvme_cq); return error; + } ctrl->ctrl.queue_count = 1; error = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set, @@ -403,6 +411,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) nvme_remove_admin_tag_set(&ctrl->ctrl); out_free_sq: nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); + nvmet_cq_put(&ctrl->queues[0].nvme_cq); return error; } diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 2a4536ef6184..3ad9b4d1fad2 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1353,6 +1353,7 @@ static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue) pr_debug("freeing queue %d\n", queue->idx); nvmet_sq_destroy(&queue->nvme_sq); + nvmet_cq_put(&queue->nvme_cq); nvmet_rdma_destroy_queue_ib(queue); if (!queue->nsrq) { @@ -1436,6 +1437,7 @@ nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev, goto out_reject; } + nvmet_cq_init(&queue->nvme_cq); ret = nvmet_sq_init(&queue->nvme_sq); if (ret) { ret = NVME_RDMA_CM_NO_RSC; @@ -1517,6 +1519,7 @@ nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev, out_destroy_sq: nvmet_sq_destroy(&queue->nvme_sq); out_free_queue: + nvmet_cq_put(&queue->nvme_cq); kfree(queue); out_reject: nvmet_rdma_cm_reject(cm_id, ret); diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index e6997ce61027..4dacb6b40fd1 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1577,6 +1577,7 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w) nvmet_sq_put_tls_key(&queue->nvme_sq); nvmet_tcp_uninit_data_in_cmds(queue); nvmet_sq_destroy(&queue->nvme_sq); + nvmet_cq_put(&queue->nvme_cq); cancel_work_sync(&queue->io_work); nvmet_tcp_free_cmd_data_in_buffers(queue); /* ->sock will be released by fput() */ @@ -1910,6 +1911,7 @@ static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, if (ret) goto out_ida_remove; + nvmet_cq_init(&queue->nvme_cq); ret = nvmet_sq_init(&queue->nvme_sq); if (ret) goto out_free_connect; @@ -1953,6 +1955,7 @@ static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, mutex_unlock(&nvmet_tcp_queue_mutex); nvmet_sq_destroy(&queue->nvme_sq); out_free_connect: + nvmet_cq_put(&queue->nvme_cq); nvmet_tcp_free_cmd(&queue->connect); out_ida_remove: ida_free(&nvmet_tcp_queue_ida, queue->idx); From 94ee8708c91f6640d968e3064ee806fe94f30463 Mon Sep 17 00:00:00 2001 From: Wilfred Mallawa Date: Thu, 24 Apr 2025 15:13:52 +1000 Subject: [PATCH 11/42] nvmet: support completion queue sharing The NVMe PCI transport specification allows for completion queues to 
be shared by different submission queues. This patch allows a submission queue to keep track of the completion queue it is using with reference counting. As such, it can be ensured that a completion queue is not deleted while a submission queue is actively using it. This patch enables completion queue sharing in the pci-epf target driver. For fabrics drivers, completion queue sharing is not enabled as it is not possible as per the fabrics specification. However, this patch modifies the fabrics drivers to correctly integrate the new API that supports completion queue sharing. Signed-off-by: Wilfred Mallawa Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 19 ++++++++++--------- drivers/nvme/target/core.c | 17 ++++++++++++++--- drivers/nvme/target/fc.c | 2 +- drivers/nvme/target/loop.c | 6 ++++-- drivers/nvme/target/nvmet.h | 9 +++++---- drivers/nvme/target/pci-epf.c | 12 +++++++----- drivers/nvme/target/rdma.c | 2 +- drivers/nvme/target/tcp.c | 2 +- 8 files changed, 43 insertions(+), 26 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 5e3699973d56..c7317299078d 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -63,14 +63,9 @@ static void nvmet_execute_create_sq(struct nvmet_req *req) if (status != NVME_SC_SUCCESS) goto complete; - /* - * Note: The NVMe specification allows multiple SQs to use the same CQ. - * However, the target code does not really support that. So for now, - * prevent this and fail the command if sqid and cqid are different. - */ - if (!cqid || cqid != sqid) { - pr_err("SQ %u: Unsupported CQID %u\n", sqid, cqid); - status = NVME_SC_CQ_INVALID | NVME_STATUS_DNR; + status = nvmet_check_io_cqid(ctrl, cqid, false); + if (status != NVME_SC_SUCCESS) { + pr_err("SQ %u: Invalid CQID %u\n", sqid, cqid); goto complete; } @@ -79,7 +74,7 @@ static void nvmet_execute_create_sq(struct nvmet_req *req) goto complete; } - status = ctrl->ops->create_sq(ctrl, sqid, sq_flags, qsize, prp1); + status = ctrl->ops->create_sq(ctrl, sqid, cqid, sq_flags, qsize, prp1); complete: nvmet_req_complete(req, status); @@ -100,6 +95,12 @@ static void nvmet_execute_delete_cq(struct nvmet_req *req) if (status != NVME_SC_SUCCESS) goto complete; + if (!ctrl->cqs[cqid] || nvmet_cq_in_use(ctrl->cqs[cqid])) { + /* Some SQs are still using this CQ */ + status = NVME_SC_QID_INVALID | NVME_STATUS_DNR; + goto complete; + } + status = ctrl->ops->delete_cq(ctrl, cqid); complete: diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index bfaba79f971d..2b02b2f939a5 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -905,6 +905,11 @@ u16 nvmet_cq_create(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, if (status != NVME_SC_SUCCESS) return status; + if (!kref_get_unless_zero(&ctrl->ref)) + return NVME_SC_INTERNAL | NVME_STATUS_DNR; + cq->ctrl = ctrl; + + nvmet_cq_init(cq); nvmet_cq_setup(ctrl, cq, qid, size); return NVME_SC_SUCCESS; @@ -928,7 +933,7 @@ u16 nvmet_check_sqid(struct nvmet_ctrl *ctrl, u16 sqid, } u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, - u16 sqid, u16 size) + struct nvmet_cq *cq, u16 sqid, u16 size) { u16 status; int ret; @@ -940,7 +945,7 @@ u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, if (status != NVME_SC_SUCCESS) return status; - ret = nvmet_sq_init(sq); + ret = nvmet_sq_init(sq, cq); if (ret) { status = NVME_SC_INTERNAL | NVME_STATUS_DNR; goto ctrl_put; @@ 
-972,6 +977,7 @@ void nvmet_sq_destroy(struct nvmet_sq *sq) wait_for_completion(&sq->free_done); percpu_ref_exit(&sq->ref); nvmet_auth_sq_free(sq); + nvmet_cq_put(sq->cq); /* * we must reference the ctrl again after waiting for inflight IO @@ -1004,18 +1010,23 @@ static void nvmet_sq_free(struct percpu_ref *ref) complete(&sq->free_done); } -int nvmet_sq_init(struct nvmet_sq *sq) +int nvmet_sq_init(struct nvmet_sq *sq, struct nvmet_cq *cq) { int ret; + if (!nvmet_cq_get(cq)) + return -EINVAL; + ret = percpu_ref_init(&sq->ref, nvmet_sq_free, 0, GFP_KERNEL); if (ret) { pr_err("percpu_ref init failed!\n"); + nvmet_cq_put(cq); return ret; } init_completion(&sq->free_done); init_completion(&sq->confirm_done); nvmet_auth_sq_init(sq); + sq->cq = cq; return 0; } diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 7c2a4e2eb315..2e813e51549c 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -817,7 +817,7 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc, nvmet_fc_prep_fcp_iodlist(assoc->tgtport, queue); nvmet_cq_init(&queue->nvme_cq); - ret = nvmet_sq_init(&queue->nvme_sq); + ret = nvmet_sq_init(&queue->nvme_sq, &queue->nvme_cq); if (ret) goto out_fail_iodlist; diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index bbb3699c8686..c9ec13eff879 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -332,7 +332,8 @@ static int nvme_loop_init_io_queues(struct nvme_loop_ctrl *ctrl) for (i = 1; i <= nr_io_queues; i++) { ctrl->queues[i].ctrl = ctrl; nvmet_cq_init(&ctrl->queues[i].nvme_cq); - ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq); + ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq, + &ctrl->queues[i].nvme_cq); if (ret) { nvmet_cq_put(&ctrl->queues[i].nvme_cq); goto out_destroy_queues; @@ -368,7 +369,8 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) ctrl->queues[0].ctrl = ctrl; nvmet_cq_init(&ctrl->queues[0].nvme_cq); - error = nvmet_sq_init(&ctrl->queues[0].nvme_sq); + error = nvmet_sq_init(&ctrl->queues[0].nvme_sq, + &ctrl->queues[0].nvme_cq); if (error) { nvmet_cq_put(&ctrl->queues[0].nvme_cq); return error; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index c87f6fb458e8..d3795b09fcc4 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -150,6 +150,7 @@ struct nvmet_cq { struct nvmet_sq { struct nvmet_ctrl *ctrl; struct percpu_ref ref; + struct nvmet_cq *cq; u16 qid; u16 size; u32 sqhd; @@ -427,7 +428,7 @@ struct nvmet_fabrics_ops { u16 (*get_max_queue_size)(const struct nvmet_ctrl *ctrl); /* Operations mandatory for PCI target controllers */ - u16 (*create_sq)(struct nvmet_ctrl *ctrl, u16 sqid, u16 flags, + u16 (*create_sq)(struct nvmet_ctrl *ctrl, u16 sqid, u16 cqid, u16 flags, u16 qsize, u64 prp1); u16 (*delete_sq)(struct nvmet_ctrl *ctrl, u16 sqid); u16 (*create_cq)(struct nvmet_ctrl *ctrl, u16 cqid, u16 flags, @@ -588,10 +589,10 @@ bool nvmet_cq_in_use(struct nvmet_cq *cq); u16 nvmet_check_sqid(struct nvmet_ctrl *ctrl, u16 sqid, bool create); void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid, u16 size); -u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid, - u16 size); +u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, + struct nvmet_cq *cq, u16 qid, u16 size); void nvmet_sq_destroy(struct nvmet_sq *sq); -int nvmet_sq_init(struct nvmet_sq *sq); +int nvmet_sq_init(struct nvmet_sq *sq, struct nvmet_cq *cq); void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl); diff 
--git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c index 7dda4156d86c..ebe2fc355fbd 100644 --- a/drivers/nvme/target/pci-epf.c +++ b/drivers/nvme/target/pci-epf.c @@ -1346,16 +1346,17 @@ static u16 nvmet_pci_epf_delete_cq(struct nvmet_ctrl *tctrl, u16 cqid) nvmet_pci_epf_drain_queue(cq); nvmet_pci_epf_remove_irq_vector(ctrl, cq->vector); nvmet_pci_epf_mem_unmap(ctrl->nvme_epf, &cq->pci_map); - tctrl->cqs[cqid] = NULL; + nvmet_cq_put(&cq->nvme_cq); return NVME_SC_SUCCESS; } static u16 nvmet_pci_epf_create_sq(struct nvmet_ctrl *tctrl, - u16 sqid, u16 flags, u16 qsize, u64 pci_addr) + u16 sqid, u16 cqid, u16 flags, u16 qsize, u64 pci_addr) { struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata; struct nvmet_pci_epf_queue *sq = &ctrl->sq[sqid]; + struct nvmet_pci_epf_queue *cq = &ctrl->cq[cqid]; u16 status; if (test_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags)) @@ -1378,7 +1379,8 @@ static u16 nvmet_pci_epf_create_sq(struct nvmet_ctrl *tctrl, sq->qes = ctrl->io_sqes; sq->pci_size = sq->qes * sq->depth; - status = nvmet_sq_create(tctrl, &sq->nvme_sq, sqid, sq->depth); + status = nvmet_sq_create(tctrl, &sq->nvme_sq, &cq->nvme_cq, sqid, + sq->depth); if (status != NVME_SC_SUCCESS) return status; @@ -1873,8 +1875,8 @@ static int nvmet_pci_epf_enable_ctrl(struct nvmet_pci_epf_ctrl *ctrl) qsize = aqa & 0x00000fff; pci_addr = asq & GENMASK_ULL(63, 12); - status = nvmet_pci_epf_create_sq(ctrl->tctrl, 0, NVME_QUEUE_PHYS_CONTIG, - qsize, pci_addr); + status = nvmet_pci_epf_create_sq(ctrl->tctrl, 0, 0, + NVME_QUEUE_PHYS_CONTIG, qsize, pci_addr); if (status != NVME_SC_SUCCESS) { dev_err(ctrl->dev, "Failed to create admin submission queue\n"); nvmet_pci_epf_delete_cq(ctrl->tctrl, 0); diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 3ad9b4d1fad2..2e5c32298818 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1438,7 +1438,7 @@ nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev, } nvmet_cq_init(&queue->nvme_cq); - ret = nvmet_sq_init(&queue->nvme_sq); + ret = nvmet_sq_init(&queue->nvme_sq, &queue->nvme_cq); if (ret) { ret = NVME_RDMA_CM_NO_RSC; goto out_free_queue; diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 4dacb6b40fd1..5811246dbe42 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1912,7 +1912,7 @@ static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, goto out_ida_remove; nvmet_cq_init(&queue->nvme_cq); - ret = nvmet_sq_init(&queue->nvme_sq); + ret = nvmet_sq_init(&queue->nvme_sq, &queue->nvme_cq); if (ret) goto out_free_connect; From 87b4d5ec0dca44e316c37ca84cd00c31cc8e8e14 Mon Sep 17 00:00:00 2001 From: Wilfred Mallawa Date: Thu, 24 Apr 2025 15:13:53 +1000 Subject: [PATCH 12/42] nvmet: simplify the nvmet_req_init() interface Now that a submission queue holds a reference to its completion queue, there is no need to pass the cq argument to nvmet_req_init(), so remove it. 
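In sketch form, nvmet_req_init() now derives the CQ from the SQ, and transport drivers drop the extra argument:

    /* Inside nvmet_req_init(): the CQ comes from the SQ. */
    req->cq = sq->cq;
    req->sq = sq;

    /* Callers, e.g. in the TCP transport, become: */
    if (!nvmet_req_init(&cmd->req, &queue->nvme_sq, &nvmet_tcp_ops))
        return;
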
Signed-off-by: Wilfred Mallawa Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 6 +++--- drivers/nvme/target/fc.c | 6 ++---- drivers/nvme/target/loop.c | 6 ++---- drivers/nvme/target/nvmet.h | 4 ++-- drivers/nvme/target/pci-epf.c | 3 +-- drivers/nvme/target/rdma.c | 3 +-- drivers/nvme/target/tcp.c | 3 +-- 7 files changed, 12 insertions(+), 19 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 2b02b2f939a5..db7b17d1094e 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1156,13 +1156,13 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req) return ret; } -bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, - struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops) +bool nvmet_req_init(struct nvmet_req *req, struct nvmet_sq *sq, + const struct nvmet_fabrics_ops *ops) { u8 flags = req->cmd->common.flags; u16 status; - req->cq = cq; + req->cq = sq->cq; req->sq = sq; req->ops = ops; req->sg = NULL; diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 2e813e51549c..a82cff9a8064 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -2534,10 +2534,8 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, fod->data_sg = NULL; fod->data_sg_cnt = 0; - ret = nvmet_req_init(&fod->req, - &fod->queue->nvme_cq, - &fod->queue->nvme_sq, - &nvmet_fc_tgt_fcp_ops); + ret = nvmet_req_init(&fod->req, &fod->queue->nvme_sq, + &nvmet_fc_tgt_fcp_ops); if (!ret) { /* bad SQE content or invalid ctrl state */ /* nvmet layer has already called op done to send rsp. */ diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index c9ec13eff879..f85a8441bcc6 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -150,8 +150,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, nvme_start_request(req); iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; iod->req.port = queue->ctrl->port; - if (!nvmet_req_init(&iod->req, &queue->nvme_cq, - &queue->nvme_sq, &nvme_loop_ops)) + if (!nvmet_req_init(&iod->req, &queue->nvme_sq, &nvme_loop_ops)) return BLK_STS_OK; if (blk_rq_nr_phys_segments(req)) { @@ -183,8 +182,7 @@ static void nvme_loop_submit_async_event(struct nvme_ctrl *arg) iod->cmd.common.command_id = NVME_AQ_BLK_MQ_DEPTH; iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; - if (!nvmet_req_init(&iod->req, &queue->nvme_cq, &queue->nvme_sq, - &nvme_loop_ops)) { + if (!nvmet_req_init(&iod->req, &queue->nvme_sq, &nvme_loop_ops)) { dev_err(ctrl->ctrl.device, "failed async event work\n"); return; } diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index d3795b09fcc4..df69a9dee71c 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -561,8 +561,8 @@ u32 nvmet_fabrics_admin_cmd_data_len(struct nvmet_req *req); u16 nvmet_parse_fabrics_io_cmd(struct nvmet_req *req); u32 nvmet_fabrics_io_cmd_data_len(struct nvmet_req *req); -bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, - struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops); +bool nvmet_req_init(struct nvmet_req *req, struct nvmet_sq *sq, + const struct nvmet_fabrics_ops *ops); void nvmet_req_uninit(struct nvmet_req *req); size_t nvmet_req_transfer_len(struct nvmet_req *req); bool nvmet_check_transfer_len(struct nvmet_req *req, size_t len); diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c index ebe2fc355fbd..ec529305bc75 100644 --- 
a/drivers/nvme/target/pci-epf.c +++ b/drivers/nvme/target/pci-epf.c @@ -1597,8 +1597,7 @@ static void nvmet_pci_epf_exec_iod_work(struct work_struct *work) goto complete; } - if (!nvmet_req_init(req, &iod->cq->nvme_cq, &iod->sq->nvme_sq, - &nvmet_pci_epf_fabrics_ops)) + if (!nvmet_req_init(req, &iod->sq->nvme_sq, &nvmet_pci_epf_fabrics_ops)) goto complete; iod->data_len = nvmet_req_transfer_len(req); diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 2e5c32298818..432bdf7cd49e 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -976,8 +976,7 @@ static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue, cmd->send_sge.addr, cmd->send_sge.length, DMA_TO_DEVICE); - if (!nvmet_req_init(&cmd->req, &queue->nvme_cq, - &queue->nvme_sq, &nvmet_rdma_ops)) + if (!nvmet_req_init(&cmd->req, &queue->nvme_sq, &nvmet_rdma_ops)) return; status = nvmet_rdma_map_sgl(cmd); diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 5811246dbe42..c6603bd9c95e 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1039,8 +1039,7 @@ static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue) req = &queue->cmd->req; memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd)); - if (unlikely(!nvmet_req_init(req, &queue->nvme_cq, - &queue->nvme_sq, &nvmet_tcp_ops))) { + if (unlikely(!nvmet_req_init(req, &queue->nvme_sq, &nvmet_tcp_ops))) { pr_err("failed cmd %p id %d opcode %d, data_len: %d, status: %04x\n", req->cmd, req->cmd->common.command_id, req->cmd->common.opcode, From fee45888a3e445999dec66301797c768f7d16028 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 7 May 2025 10:28:17 +0200 Subject: [PATCH 13/42] nvme-auth: use SHASH_DESC_ON_STACK Use SHASH_DESC_ON_STACK to avoid explicit allocation. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Christoph Hellwig --- drivers/nvme/common/auth.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index 2c092ec8c0a9..3b6d759bcdf2 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -242,7 +242,7 @@ struct nvme_dhchap_key *nvme_auth_transform_key( { const char *hmac_name; struct crypto_shash *key_tfm; - struct shash_desc *shash; + SHASH_DESC_ON_STACK(shash, key_tfm); struct nvme_dhchap_key *transformed_key; int ret, key_len; @@ -267,19 +267,11 @@ struct nvme_dhchap_key *nvme_auth_transform_key( if (IS_ERR(key_tfm)) return ERR_CAST(key_tfm); - shash = kmalloc(sizeof(struct shash_desc) + - crypto_shash_descsize(key_tfm), - GFP_KERNEL); - if (!shash) { - ret = -ENOMEM; - goto out_free_key; - } - key_len = crypto_shash_digestsize(key_tfm); transformed_key = nvme_auth_alloc_key(key_len, key->hash); if (!transformed_key) { ret = -ENOMEM; - goto out_free_shash; + goto out_free_key; } shash->tfm = key_tfm; @@ -299,15 +291,12 @@ struct nvme_dhchap_key *nvme_auth_transform_key( if (ret < 0) goto out_free_transformed_key; - kfree(shash); crypto_free_shash(key_tfm); return transformed_key; out_free_transformed_key: nvme_auth_free_key(transformed_key); -out_free_shash: - kfree(shash); out_free_key: crypto_free_shash(key_tfm); From 6b262697dafeb8d558f9ddb2207159ec770e213e Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 7 May 2025 10:28:18 +0200 Subject: [PATCH 14/42] nvmet-auth: use SHASH_DESC_ON_STACK Use SHASH_DESC_ON_STACK to avoid explicit allocation. 
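The pattern is the same as in the host-side patch: the descriptor lives on the stack, so the kmalloc()/kfree() pair goes away. Generic usage sketch (tfm, data, len and digest are placeholders):

    #include <crypto/hash.h>

    SHASH_DESC_ON_STACK(shash, tfm);    /* stack-allocated descriptor */

    shash->tfm = tfm;
    ret = crypto_shash_init(shash);
    if (!ret)
        ret = crypto_shash_update(shash, data, len);
    if (!ret)
        ret = crypto_shash_final(shash, digest);
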
Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Christoph Hellwig --- drivers/nvme/target/auth.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index 111dfaaa14a7..b340380f3892 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -293,7 +293,7 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, unsigned int shash_len) { struct crypto_shash *shash_tfm; - struct shash_desc *shash; + SHASH_DESC_ON_STACK(shash, shash_tfm); struct nvmet_ctrl *ctrl = req->sq->ctrl; const char *hash_name; u8 *challenge = req->sq->dhchap_c1; @@ -345,19 +345,13 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, req->sq->dhchap_c1, challenge, shash_len); if (ret) - goto out_free_challenge; + goto out; } pr_debug("ctrl %d qid %d host response seq %u transaction %d\n", ctrl->cntlid, req->sq->qid, req->sq->dhchap_s1, req->sq->dhchap_tid); - shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(shash_tfm), - GFP_KERNEL); - if (!shash) { - ret = -ENOMEM; - goto out_free_challenge; - } shash->tfm = shash_tfm; ret = crypto_shash_init(shash); if (ret) @@ -392,8 +386,6 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, goto out; ret = crypto_shash_final(shash, response); out: - kfree(shash); -out_free_challenge: if (challenge != req->sq->dhchap_c1) kfree(challenge); out_free_response: From d6c40d87e7fed4b10f2fe93c90487145e2622ebf Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 7 May 2025 14:22:57 +0200 Subject: [PATCH 15/42] nvmet-fcloop: track ref counts for nports A nport object is always used in association with targerport, remoteport, tport and rport objects. Add explicit references for any of the associated object. This ensures that nport is not removed too early on shutdown sequences. 
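The lookup helpers added here take the reference while still holding fcloop_lock, which protects the nport list; roughly:

    /* Sketch: find an nport and take a reference under fcloop_lock. */
    spin_lock_irqsave(&fcloop_lock, flags);
    list_for_each_entry(nport, &fcloop_nports, nport_list) {
        if (nport->node_name != node_name ||
            nport->port_name != port_name)
            continue;
        if (fcloop_nport_get(nport))
            found = nport;
        break;
    }
    spin_unlock_irqrestore(&fcloop_lock, flags);
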
Signed-off-by: Daniel Wagner Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fcloop.c | 135 ++++++++++++++++++++++++----------- 1 file changed, 92 insertions(+), 43 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 641201e62c1b..a0828626db07 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -1047,8 +1047,14 @@ static void fcloop_remoteport_delete(struct nvme_fc_remote_port *remoteport) { struct fcloop_rport *rport = remoteport->private; + unsigned long flags; flush_work(&rport->ls_work); + + spin_lock_irqsave(&fcloop_lock, flags); + rport->nport->rport = NULL; + spin_unlock_irqrestore(&fcloop_lock, flags); + fcloop_nport_put(rport->nport); } @@ -1056,8 +1062,14 @@ static void fcloop_targetport_delete(struct nvmet_fc_target_port *targetport) { struct fcloop_tport *tport = targetport->private; + unsigned long flags; flush_work(&tport->ls_work); + + spin_lock_irqsave(&fcloop_lock, flags); + tport->nport->tport = NULL; + spin_unlock_irqrestore(&fcloop_lock, flags); + fcloop_nport_put(tport->nport); } @@ -1184,6 +1196,37 @@ __wait_localport_unreg(struct fcloop_lport *lport) return ret; } +static struct fcloop_nport * +__fcloop_nport_lookup(u64 node_name, u64 port_name) +{ + struct fcloop_nport *nport; + + list_for_each_entry(nport, &fcloop_nports, nport_list) { + if (nport->node_name != node_name || + nport->port_name != port_name) + continue; + + if (fcloop_nport_get(nport)) + return nport; + + break; + } + + return NULL; +} + +static struct fcloop_nport * +fcloop_nport_lookup(u64 node_name, u64 port_name) +{ + struct fcloop_nport *nport; + unsigned long flags; + + spin_lock_irqsave(&fcloop_lock, flags); + nport = __fcloop_nport_lookup(node_name, port_name); + spin_unlock_irqrestore(&fcloop_lock, flags); + + return nport; +} static ssize_t fcloop_delete_local_port(struct device *dev, struct device_attribute *attr, @@ -1365,6 +1408,8 @@ __unlink_remote_port(struct fcloop_nport *nport) { struct fcloop_rport *rport = nport->rport; + lockdep_assert_held(&fcloop_lock); + if (rport && nport->tport) nport->tport->remoteport = NULL; nport->rport = NULL; @@ -1377,9 +1422,6 @@ __unlink_remote_port(struct fcloop_nport *nport) static int __remoteport_unreg(struct fcloop_nport *nport, struct fcloop_rport *rport) { - if (!rport) - return -EALREADY; - return nvme_fc_unregister_remoteport(rport->remoteport); } @@ -1387,8 +1429,8 @@ static ssize_t fcloop_delete_remote_port(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct fcloop_nport *nport = NULL, *tmpport; - static struct fcloop_rport *rport; + struct fcloop_nport *nport; + struct fcloop_rport *rport; u64 nodename, portname; unsigned long flags; int ret; @@ -1397,24 +1439,24 @@ fcloop_delete_remote_port(struct device *dev, struct device_attribute *attr, if (ret) return ret; - spin_lock_irqsave(&fcloop_lock, flags); - - list_for_each_entry(tmpport, &fcloop_nports, nport_list) { - if (tmpport->node_name == nodename && - tmpport->port_name == portname && tmpport->rport) { - nport = tmpport; - rport = __unlink_remote_port(nport); - break; - } - } + nport = fcloop_nport_lookup(nodename, portname); + if (!nport) + return -ENOENT; + spin_lock_irqsave(&fcloop_lock, flags); + rport = __unlink_remote_port(nport); spin_unlock_irqrestore(&fcloop_lock, flags); - if (!nport) - return -ENOENT; + if (!rport) { + ret = -ENOENT; + goto out_nport_put; + } ret = __remoteport_unreg(nport, rport); +out_nport_put: + fcloop_nport_put(nport); + return ret ? 
ret : count; } @@ -1465,6 +1507,8 @@ __unlink_target_port(struct fcloop_nport *nport) { struct fcloop_tport *tport = nport->tport; + lockdep_assert_held(&fcloop_lock); + if (tport && nport->rport) nport->rport->targetport = NULL; nport->tport = NULL; @@ -1475,9 +1519,6 @@ __unlink_target_port(struct fcloop_nport *nport) static int __targetport_unreg(struct fcloop_nport *nport, struct fcloop_tport *tport) { - if (!tport) - return -EALREADY; - return nvmet_fc_unregister_targetport(tport->targetport); } @@ -1485,8 +1526,8 @@ static ssize_t fcloop_delete_target_port(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct fcloop_nport *nport = NULL, *tmpport; - struct fcloop_tport *tport = NULL; + struct fcloop_nport *nport; + struct fcloop_tport *tport; u64 nodename, portname; unsigned long flags; int ret; @@ -1495,24 +1536,24 @@ fcloop_delete_target_port(struct device *dev, struct device_attribute *attr, if (ret) return ret; - spin_lock_irqsave(&fcloop_lock, flags); - - list_for_each_entry(tmpport, &fcloop_nports, nport_list) { - if (tmpport->node_name == nodename && - tmpport->port_name == portname && tmpport->tport) { - nport = tmpport; - tport = __unlink_target_port(nport); - break; - } - } + nport = fcloop_nport_lookup(nodename, portname); + if (!nport) + return -ENOENT; + spin_lock_irqsave(&fcloop_lock, flags); + tport = __unlink_target_port(nport); spin_unlock_irqrestore(&fcloop_lock, flags); - if (!nport) - return -ENOENT; + if (!tport) { + ret = -ENOENT; + goto out_nport_put; + } ret = __targetport_unreg(nport, tport); +out_nport_put: + fcloop_nport_put(nport); + return ret ? ret : count; } @@ -1609,8 +1650,8 @@ static int __init fcloop_init(void) static void __exit fcloop_exit(void) { - struct fcloop_lport *lport = NULL; - struct fcloop_nport *nport = NULL; + struct fcloop_lport *lport; + struct fcloop_nport *nport; struct fcloop_tport *tport; struct fcloop_rport *rport; unsigned long flags; @@ -1621,7 +1662,7 @@ static void __exit fcloop_exit(void) for (;;) { nport = list_first_entry_or_null(&fcloop_nports, typeof(*nport), nport_list); - if (!nport) + if (!nport || !fcloop_nport_get(nport)) break; tport = __unlink_target_port(nport); @@ -1629,13 +1670,21 @@ static void __exit fcloop_exit(void) spin_unlock_irqrestore(&fcloop_lock, flags); - ret = __targetport_unreg(nport, tport); - if (ret) - pr_warn("%s: Failed deleting target port\n", __func__); + if (tport) { + ret = __targetport_unreg(nport, tport); + if (ret) + pr_warn("%s: Failed deleting target port\n", + __func__); + } - ret = __remoteport_unreg(nport, rport); - if (ret) - pr_warn("%s: Failed deleting remote port\n", __func__); + if (rport) { + ret = __remoteport_unreg(nport, rport); + if (ret) + pr_warn("%s: Failed deleting remote port\n", + __func__); + } + + fcloop_nport_put(nport); spin_lock_irqsave(&fcloop_lock, flags); } From b999efc8cf41420f9d9f8b08c111ad80e49b274e Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 7 May 2025 14:22:58 +0200 Subject: [PATCH 16/42] nvmet-fcloop: remove nport from list on last user The nport object has an association with the rport and lport object, that means we can only remove an nport object from the global nport_list after the last user of an rport or lport is gone. 
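For readability, here is the resulting put helper assembled from the diff that follows (fcloop_lock and the nport_list member are existing globals/fields in fcloop.c). Unlinking from the global list now happens only on the final put, so any lookup that still holds a reference stays valid until it drops it:

static void
fcloop_nport_put(struct fcloop_nport *nport)
{
	unsigned long flags;

	if (!refcount_dec_and_test(&nport->ref))
		return;

	/* last user: only now is the nport unlinked from the global list */
	spin_lock_irqsave(&fcloop_lock, flags);
	list_del(&nport->nport_list);
	spin_unlock_irqrestore(&fcloop_lock, flags);

	kfree(nport);
}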
Signed-off-by: Daniel Wagner Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fcloop.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index a0828626db07..7763b8bf098d 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -1019,9 +1019,15 @@ fcloop_lport_get(struct fcloop_lport *lport) static void fcloop_nport_put(struct fcloop_nport *nport) { + unsigned long flags; + if (!refcount_dec_and_test(&nport->ref)) return; + spin_lock_irqsave(&fcloop_lock, flags); + list_del(&nport->nport_list); + spin_unlock_irqrestore(&fcloop_lock, flags); + kfree(nport); } @@ -1414,8 +1420,6 @@ __unlink_remote_port(struct fcloop_nport *nport) nport->tport->remoteport = NULL; nport->rport = NULL; - list_del(&nport->nport_list); - return rport; } From d54a9d7f6d74ee385d496f0efd282ce9d4fbfcbc Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 7 May 2025 14:22:59 +0200 Subject: [PATCH 17/42] nvmet-fcloop: refactor fcloop_nport_alloc and track lport The checks for a valid input values are mixed with the logic to insert a newly allocated nport. Refactor the function so that first the checks are done. This allows to untangle the setup steps into a more linear form which reduces the complexity of the functions. Also start tracking lport when a lport is assigned to a nport. This ensures, that the lport is not going away as long it is still referenced by a nport. Signed-off-by: Daniel Wagner Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fcloop.c | 107 +++++++++++++++++++++-------------- 1 file changed, 64 insertions(+), 43 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 7763b8bf098d..a576ad9e1dea 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -1028,6 +1028,9 @@ fcloop_nport_put(struct fcloop_nport *nport) list_del(&nport->nport_list); spin_unlock_irqrestore(&fcloop_lock, flags); + if (nport->lport) + fcloop_lport_put(nport->lport); + kfree(nport); } @@ -1234,6 +1237,25 @@ fcloop_nport_lookup(u64 node_name, u64 port_name) return nport; } +static struct fcloop_lport * +__fcloop_lport_lookup(u64 node_name, u64 port_name) +{ + struct fcloop_lport *lport; + + list_for_each_entry(lport, &fcloop_lports, lport_list) { + if (lport->localport->node_name != node_name || + lport->localport->port_name != port_name) + continue; + + if (fcloop_lport_get(lport)) + return lport; + + break; + } + + return NULL; +} + static ssize_t fcloop_delete_local_port(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -1272,8 +1294,8 @@ fcloop_delete_local_port(struct device *dev, struct device_attribute *attr, static struct fcloop_nport * fcloop_alloc_nport(const char *buf, size_t count, bool remoteport) { - struct fcloop_nport *newnport, *nport = NULL; - struct fcloop_lport *tmplport, *lport = NULL; + struct fcloop_nport *newnport, *nport; + struct fcloop_lport *lport; struct fcloop_ctrl_options *opts; unsigned long flags; u32 opts_mask = (remoteport) ? RPORT_OPTS : TGTPORT_OPTS; @@ -1288,10 +1310,8 @@ fcloop_alloc_nport(const char *buf, size_t count, bool remoteport) goto out_free_opts; /* everything there ? 
*/ - if ((opts->mask & opts_mask) != opts_mask) { - ret = -EINVAL; + if ((opts->mask & opts_mask) != opts_mask) goto out_free_opts; - } newnport = kzalloc(sizeof(*newnport), GFP_KERNEL); if (!newnport) @@ -1307,60 +1327,61 @@ fcloop_alloc_nport(const char *buf, size_t count, bool remoteport) refcount_set(&newnport->ref, 1); spin_lock_irqsave(&fcloop_lock, flags); - - list_for_each_entry(tmplport, &fcloop_lports, lport_list) { - if (tmplport->localport->node_name == opts->wwnn && - tmplport->localport->port_name == opts->wwpn) - goto out_invalid_opts; - - if (tmplport->localport->node_name == opts->lpwwnn && - tmplport->localport->port_name == opts->lpwwpn) - lport = tmplport; + lport = __fcloop_lport_lookup(opts->wwnn, opts->wwpn); + if (lport) { + /* invalid configuration */ + fcloop_lport_put(lport); + goto out_free_newnport; } if (remoteport) { - if (!lport) - goto out_invalid_opts; - newnport->lport = lport; - } - - list_for_each_entry(nport, &fcloop_nports, nport_list) { - if (nport->node_name == opts->wwnn && - nport->port_name == opts->wwpn) { - if ((remoteport && nport->rport) || - (!remoteport && nport->tport)) { - nport = NULL; - goto out_invalid_opts; - } - - fcloop_nport_get(nport); - - spin_unlock_irqrestore(&fcloop_lock, flags); - - if (remoteport) - nport->lport = lport; - if (opts->mask & NVMF_OPT_ROLES) - nport->port_role = opts->roles; - if (opts->mask & NVMF_OPT_FCADDR) - nport->port_id = opts->fcaddr; + lport = __fcloop_lport_lookup(opts->lpwwnn, opts->lpwwpn); + if (!lport) { + /* invalid configuration */ goto out_free_newnport; } } - list_add_tail(&newnport->nport_list, &fcloop_nports); + nport = __fcloop_nport_lookup(opts->wwnn, opts->wwpn); + if (nport) { + if ((remoteport && nport->rport) || + (!remoteport && nport->tport)) { + /* invalid configuration */ + goto out_put_nport; + } + /* found existing nport, discard the new nport */ + kfree(newnport); + } else { + list_add_tail(&newnport->nport_list, &fcloop_nports); + nport = newnport; + } + + if (opts->mask & NVMF_OPT_ROLES) + nport->port_role = opts->roles; + if (opts->mask & NVMF_OPT_FCADDR) + nport->port_id = opts->fcaddr; + if (lport) { + if (!nport->lport) + nport->lport = lport; + else + fcloop_lport_put(lport); + } spin_unlock_irqrestore(&fcloop_lock, flags); kfree(opts); - return newnport; + return nport; -out_invalid_opts: - spin_unlock_irqrestore(&fcloop_lock, flags); +out_put_nport: + if (lport) + fcloop_lport_put(lport); + fcloop_nport_put(nport); out_free_newnport: + spin_unlock_irqrestore(&fcloop_lock, flags); kfree(newnport); out_free_opts: kfree(opts); - return nport; + return NULL; } static ssize_t From fbaed6a810a3c4aa68fe3d486608469d15c9d4e8 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 7 May 2025 14:23:00 +0200 Subject: [PATCH 18/42] nvmet-fcloop: refactor fcloop_delete_local_port Use the newly introduced fcloop_lport_lookup instead of the open coded version. 
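Since fcloop_lport_lookup() hands back a referenced lport (or NULL), every caller follows the same lookup/use/put shape. Assembled from the diff that follows, the delete path now reads roughly:

static ssize_t
fcloop_delete_local_port(struct device *dev, struct device_attribute *attr,
		const char *buf, size_t count)
{
	struct fcloop_lport *lport;
	u64 nodename, portname;
	int ret;

	ret = fcloop_parse_nm_options(dev, &nodename, &portname, buf);
	if (ret)
		return ret;

	/* returns a referenced lport, or NULL if no such port exists */
	lport = fcloop_lport_lookup(nodename, portname);
	if (!lport)
		return -ENOENT;

	ret = __wait_localport_unreg(lport);
	fcloop_lport_put(lport);	/* drop the lookup reference */

	return ret ? ret : count;
}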
Signed-off-by: Daniel Wagner Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fcloop.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index a576ad9e1dea..a1ceeca264be 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -1256,32 +1256,32 @@ __fcloop_lport_lookup(u64 node_name, u64 port_name) return NULL; } +static struct fcloop_lport * +fcloop_lport_lookup(u64 node_name, u64 port_name) +{ + struct fcloop_lport *lport; + unsigned long flags; + + spin_lock_irqsave(&fcloop_lock, flags); + lport = __fcloop_lport_lookup(node_name, port_name); + spin_unlock_irqrestore(&fcloop_lock, flags); + + return lport; +} + static ssize_t fcloop_delete_local_port(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct fcloop_lport *tlport, *lport = NULL; + struct fcloop_lport *lport; u64 nodename, portname; - unsigned long flags; int ret; ret = fcloop_parse_nm_options(dev, &nodename, &portname, buf); if (ret) return ret; - spin_lock_irqsave(&fcloop_lock, flags); - - list_for_each_entry(tlport, &fcloop_lports, lport_list) { - if (tlport->localport->node_name == nodename && - tlport->localport->port_name == portname) { - if (!fcloop_lport_get(tlport)) - break; - lport = tlport; - break; - } - } - spin_unlock_irqrestore(&fcloop_lock, flags); - + lport = fcloop_lport_lookup(nodename, portname); if (!lport) return -ENOENT; From 88ea8f814d8d006e78a4986a8c9e910788c8a5ec Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 7 May 2025 14:23:01 +0200 Subject: [PATCH 19/42] nvmet-fcloop: update refs on tfcp_req Track the lifetime of the in-flight tfcp_req to ensure the object is not freed too early. Signed-off-by: Daniel Wagner Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fcloop.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index a1ceeca264be..8af809ce963a 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -566,7 +566,8 @@ fcloop_call_host_done(struct nvmefc_fcp_req *fcpreq, } /* release original io reference on tgt struct */ - fcloop_tfcp_req_put(tfcp_req); + if (tfcp_req) + fcloop_tfcp_req_put(tfcp_req); } static bool drop_fabric_opcode; @@ -671,6 +672,7 @@ fcloop_fcp_abort_recv_work(struct work_struct *work) break; default: spin_unlock_irqrestore(&tfcp_req->reqlock, flags); + fcloop_tfcp_req_put(tfcp_req); WARN_ON(1); return; } @@ -958,8 +960,10 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport, spin_lock(&inireq->inilock); tfcp_req = inireq->tfcp_req; - if (tfcp_req) - fcloop_tfcp_req_get(tfcp_req); + if (tfcp_req) { + if (!fcloop_tfcp_req_get(tfcp_req)) + tfcp_req = NULL; + } spin_unlock(&inireq->inilock); if (!tfcp_req) From 47a827cd7929d0550c3496d70b417fcb5649b27b Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 7 May 2025 14:23:03 +0200 Subject: [PATCH 20/42] nvmet-fcloop: access fcpreq only when holding reqlock The abort handling logic expects that the state and the fcpreq are only accessed when holding the reqlock lock. While at it, only handle the aborts in the abort handler. 
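The invariant is that tfcp_req->fcpreq is only read, and now also cleared, while reqlock is held, so once the lock is dropped the work item owns the pointer outright. Condensed from the abort-receive path as it reads after this patch:

	spin_lock_irqsave(&tfcp_req->reqlock, flags);
	switch (tfcp_req->inistate) {
	case INI_IO_ABORTED:
		/* claim the request while still holding reqlock */
		fcpreq = tfcp_req->fcpreq;
		tfcp_req->fcpreq = NULL;
		break;
	case INI_IO_COMPLETED:
		completed = true;
		break;
	default:
		spin_unlock_irqrestore(&tfcp_req->reqlock, flags);
		fcloop_tfcp_req_put(tfcp_req);
		WARN_ON(1);
		return;
	}
	spin_unlock_irqrestore(&tfcp_req->reqlock, flags);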
Signed-off-by: Daniel Wagner Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fcloop.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 8af809ce963a..f0c0d95b8dde 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -619,12 +619,13 @@ fcloop_fcp_recv_work(struct work_struct *work) { struct fcloop_fcpreq *tfcp_req = container_of(work, struct fcloop_fcpreq, fcp_rcv_work); - struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq; + struct nvmefc_fcp_req *fcpreq; unsigned long flags; int ret = 0; bool aborted = false; spin_lock_irqsave(&tfcp_req->reqlock, flags); + fcpreq = tfcp_req->fcpreq; switch (tfcp_req->inistate) { case INI_IO_START: tfcp_req->inistate = INI_IO_ACTIVE; @@ -639,16 +640,19 @@ fcloop_fcp_recv_work(struct work_struct *work) } spin_unlock_irqrestore(&tfcp_req->reqlock, flags); - if (unlikely(aborted)) - ret = -ECANCELED; - else { - if (likely(!check_for_drop(tfcp_req))) - ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport, - &tfcp_req->tgt_fcp_req, - fcpreq->cmdaddr, fcpreq->cmdlen); - else - pr_info("%s: dropped command ********\n", __func__); + if (unlikely(aborted)) { + /* the abort handler will call fcloop_call_host_done */ + return; + } + + if (unlikely(check_for_drop(tfcp_req))) { + pr_info("%s: dropped command ********\n", __func__); + return; } + + ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport, + &tfcp_req->tgt_fcp_req, + fcpreq->cmdaddr, fcpreq->cmdlen); if (ret) fcloop_call_host_done(fcpreq, tfcp_req, ret); } @@ -663,9 +667,10 @@ fcloop_fcp_abort_recv_work(struct work_struct *work) unsigned long flags; spin_lock_irqsave(&tfcp_req->reqlock, flags); - fcpreq = tfcp_req->fcpreq; switch (tfcp_req->inistate) { case INI_IO_ABORTED: + fcpreq = tfcp_req->fcpreq; + tfcp_req->fcpreq = NULL; break; case INI_IO_COMPLETED: completed = true; @@ -688,10 +693,6 @@ fcloop_fcp_abort_recv_work(struct work_struct *work) nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport, &tfcp_req->tgt_fcp_req); - spin_lock_irqsave(&tfcp_req->reqlock, flags); - tfcp_req->fcpreq = NULL; - spin_unlock_irqrestore(&tfcp_req->reqlock, flags); - fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED); /* call_host_done releases reference for abort downcall */ } From 2b559a3eb56b6e1a51ca6f4a17778a1b4e14a591 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 7 May 2025 14:23:04 +0200 Subject: [PATCH 21/42] nvmet-fcloop: prevent double port deletion The delete callback can be called either via the unregister function or from the transport directly. Thus it is necessary ensure resources are not freed multiple times. 
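The guard is the usual once-only test_and_set_bit() pattern: whichever path runs first (the explicit unregister or the transport's delete callback) wins the right to drop the nport reference, and the other becomes a no-op. A minimal sketch of the idiom; the helper name is made up for illustration, the driver open-codes the test in both delete callbacks:

#include <linux/bitops.h>

#define PORT_DELETED	0	/* bit number in the port's flags word */

/* Returns true exactly once, no matter how many paths race to delete. */
static bool port_claim_delete(unsigned long *flags)
{
	return !test_and_set_bit(PORT_DELETED, flags);
}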
Signed-off-by: Daniel Wagner Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fcloop.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index f0c0d95b8dde..52f0061920a8 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -215,6 +215,9 @@ struct fcloop_lport_priv { struct fcloop_lport *lport; }; +/* The port is already being removed, avoid double free */ +#define PORT_DELETED 0 + struct fcloop_rport { struct nvme_fc_remote_port *remoteport; struct nvmet_fc_target_port *targetport; @@ -223,6 +226,7 @@ struct fcloop_rport { spinlock_t lock; struct list_head ls_list; struct work_struct ls_work; + unsigned long flags; }; struct fcloop_tport { @@ -233,6 +237,7 @@ struct fcloop_tport { spinlock_t lock; struct list_head ls_list; struct work_struct ls_work; + unsigned long flags; }; struct fcloop_nport { @@ -1061,30 +1066,38 @@ static void fcloop_remoteport_delete(struct nvme_fc_remote_port *remoteport) { struct fcloop_rport *rport = remoteport->private; + bool put_port = false; unsigned long flags; flush_work(&rport->ls_work); spin_lock_irqsave(&fcloop_lock, flags); + if (!test_and_set_bit(PORT_DELETED, &rport->flags)) + put_port = true; rport->nport->rport = NULL; spin_unlock_irqrestore(&fcloop_lock, flags); - fcloop_nport_put(rport->nport); + if (put_port) + fcloop_nport_put(rport->nport); } static void fcloop_targetport_delete(struct nvmet_fc_target_port *targetport) { struct fcloop_tport *tport = targetport->private; + bool put_port = false; unsigned long flags; flush_work(&tport->ls_work); spin_lock_irqsave(&fcloop_lock, flags); + if (!test_and_set_bit(PORT_DELETED, &tport->flags)) + put_port = true; tport->nport->tport = NULL; spin_unlock_irqrestore(&fcloop_lock, flags); - fcloop_nport_put(tport->nport); + if (put_port) + fcloop_nport_put(tport->nport); } #define FCLOOP_HW_QUEUES 4 @@ -1427,6 +1440,7 @@ fcloop_create_remote_port(struct device *dev, struct device_attribute *attr, rport->nport = nport; rport->lport = nport->lport; nport->rport = rport; + rport->flags = 0; spin_lock_init(&rport->lock); INIT_WORK(&rport->ls_work, fcloop_rport_lsrqst_work); INIT_LIST_HEAD(&rport->ls_list); @@ -1524,6 +1538,7 @@ fcloop_create_target_port(struct device *dev, struct device_attribute *attr, tport->nport = nport; tport->lport = nport->lport; nport->tport = tport; + tport->flags = 0; spin_lock_init(&tport->lock); INIT_WORK(&tport->ls_work, fcloop_tport_lsrqst_work); INIT_LIST_HEAD(&tport->ls_list); From 772042dd38eeb9caaed1476a873cd8359e893775 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 7 May 2025 14:23:05 +0200 Subject: [PATCH 22/42] nvmet-fcloop: allocate/free fcloop_lsreq directly fcloop depends on the host or the target to allocate the fcloop_lsreq object. This means that the lifetime of the fcloop_lsreq is tied to either the host or the target. Consequently, the host or the target must cooperate during shutdown. Unfortunately, this approach does not work well when the target forces a shutdown, as there are dependencies that are difficult to resolve in a clean way. The simplest solution is to decouple the lifetime of the fcloop_lsreq object by managing them directly within fcloop. Since this is not a performance-critical path and only a small number of LS objects are used during setup and cleanup, it does not significantly impact performance to allocate them during normal operation. 
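Decoupling the lifetime means fcloop now owns these objects outright, using the standard kmem_cache pattern: create the cache at module init, destroy it at exit, and allocate/free one fcloop_lsreq per LS exchange. A condensed sketch of the plumbing the patch adds (the helper functions are named here only for the sketch; the patch open-codes the calls at each site):

#include <linux/init.h>
#include <linux/slab.h>

static struct kmem_cache *lsreq_cache;

static int __init fcloop_lsreq_cache_init(void)
{
	lsreq_cache = kmem_cache_create("lsreq_cache",
			sizeof(struct fcloop_lsreq), 0, 0, NULL);
	if (!lsreq_cache)
		return -ENOMEM;
	return 0;
}

static struct fcloop_lsreq *fcloop_lsreq_alloc(void)
{
	/* one object per LS request, freed once the response is processed */
	return kmem_cache_alloc(lsreq_cache, GFP_KERNEL);
}

static void fcloop_lsreq_free(struct fcloop_lsreq *tls_req)
{
	kmem_cache_free(lsreq_cache, tls_req);
}

static void fcloop_lsreq_cache_exit(void)
{
	kmem_cache_destroy(lsreq_cache);
}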
Reviewed-by: Hannes Reinecke Signed-off-by: Daniel Wagner Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fcloop.c | 63 +++++++++++++++++++++++++----------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 52f0061920a8..b952672f38cb 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -293,6 +293,9 @@ struct fcloop_ini_fcpreq { spinlock_t inilock; }; +/* SLAB cache for fcloop_lsreq structures */ +static struct kmem_cache *lsreq_cache; + static inline struct fcloop_lsreq * ls_rsp_to_lsreq(struct nvmefc_ls_rsp *lsrsp) { @@ -343,6 +346,7 @@ fcloop_rport_lsrqst_work(struct work_struct *work) * callee may free memory containing tls_req. * do not reference lsreq after this. */ + kmem_cache_free(lsreq_cache, tls_req); spin_lock(&rport->lock); } @@ -354,10 +358,13 @@ fcloop_h2t_ls_req(struct nvme_fc_local_port *localport, struct nvme_fc_remote_port *remoteport, struct nvmefc_ls_req *lsreq) { - struct fcloop_lsreq *tls_req = lsreq->private; struct fcloop_rport *rport = remoteport->private; + struct fcloop_lsreq *tls_req; int ret = 0; + tls_req = kmem_cache_alloc(lsreq_cache, GFP_KERNEL); + if (!tls_req) + return -ENOMEM; tls_req->lsreq = lsreq; INIT_LIST_HEAD(&tls_req->ls_list); @@ -394,14 +401,17 @@ fcloop_h2t_xmt_ls_rsp(struct nvmet_fc_target_port *targetport, lsrsp->done(lsrsp); - if (remoteport) { - rport = remoteport->private; - spin_lock(&rport->lock); - list_add_tail(&tls_req->ls_list, &rport->ls_list); - spin_unlock(&rport->lock); - queue_work(nvmet_wq, &rport->ls_work); + if (!remoteport) { + kmem_cache_free(lsreq_cache, tls_req); + return 0; } + rport = remoteport->private; + spin_lock(&rport->lock); + list_add_tail(&tls_req->ls_list, &rport->ls_list); + spin_unlock(&rport->lock); + queue_work(nvmet_wq, &rport->ls_work); + return 0; } @@ -427,6 +437,7 @@ fcloop_tport_lsrqst_work(struct work_struct *work) * callee may free memory containing tls_req. * do not reference lsreq after this. 
*/ + kmem_cache_free(lsreq_cache, tls_req); spin_lock(&tport->lock); } @@ -437,8 +448,8 @@ static int fcloop_t2h_ls_req(struct nvmet_fc_target_port *targetport, void *hosthandle, struct nvmefc_ls_req *lsreq) { - struct fcloop_lsreq *tls_req = lsreq->private; struct fcloop_tport *tport = targetport->private; + struct fcloop_lsreq *tls_req; int ret = 0; /* @@ -446,6 +457,10 @@ fcloop_t2h_ls_req(struct nvmet_fc_target_port *targetport, void *hosthandle, * hosthandle ignored as fcloop currently is * 1:1 tgtport vs remoteport */ + + tls_req = kmem_cache_alloc(lsreq_cache, GFP_KERNEL); + if (!tls_req) + return -ENOMEM; tls_req->lsreq = lsreq; INIT_LIST_HEAD(&tls_req->ls_list); @@ -462,6 +477,9 @@ fcloop_t2h_ls_req(struct nvmet_fc_target_port *targetport, void *hosthandle, ret = nvme_fc_rcv_ls_req(tport->remoteport, &tls_req->ls_rsp, lsreq->rqstaddr, lsreq->rqstlen); + if (ret) + kmem_cache_free(lsreq_cache, tls_req); + return ret; } @@ -481,14 +499,17 @@ fcloop_t2h_xmt_ls_rsp(struct nvme_fc_local_port *localport, lsreq->rsplen : lsrsp->rsplen)); lsrsp->done(lsrsp); - if (targetport) { - tport = targetport->private; - spin_lock(&tport->lock); - list_add_tail(&tls_req->ls_list, &tport->ls_list); - spin_unlock(&tport->lock); - queue_work(nvmet_wq, &tport->ls_work); + if (!targetport) { + kmem_cache_free(lsreq_cache, tls_req); + return 0; } + tport = targetport->private; + spin_lock(&tport->lock); + list_add_tail(&tls_req->ls_list, &tport->ls_list); + spin_unlock(&tport->lock); + queue_work(nvmet_wq, &tport->ls_work); + return 0; } @@ -1121,7 +1142,6 @@ static struct nvme_fc_port_template fctemplate = { /* sizes of additional private data for data structures */ .local_priv_sz = sizeof(struct fcloop_lport_priv), .remote_priv_sz = sizeof(struct fcloop_rport), - .lsrqst_priv_sz = sizeof(struct fcloop_lsreq), .fcprqst_priv_sz = sizeof(struct fcloop_ini_fcpreq), }; @@ -1144,7 +1164,6 @@ static struct nvmet_fc_target_template tgttemplate = { .target_features = 0, /* sizes of additional private data for data structures */ .target_priv_sz = sizeof(struct fcloop_tport), - .lsrqst_priv_sz = sizeof(struct fcloop_lsreq), }; static ssize_t @@ -1664,15 +1683,20 @@ static const struct class fcloop_class = { }; static struct device *fcloop_device; - static int __init fcloop_init(void) { int ret; + lsreq_cache = kmem_cache_create("lsreq_cache", + sizeof(struct fcloop_lsreq), 0, + 0, NULL); + if (!lsreq_cache) + return -ENOMEM; + ret = class_register(&fcloop_class); if (ret) { pr_err("couldn't register class fcloop\n"); - return ret; + goto out_destroy_cache; } fcloop_device = device_create_with_groups( @@ -1690,6 +1714,8 @@ static int __init fcloop_init(void) out_destroy_class: class_unregister(&fcloop_class); +out_destroy_cache: + kmem_cache_destroy(lsreq_cache); return ret; } @@ -1757,6 +1783,7 @@ static void __exit fcloop_exit(void) device_destroy(&fcloop_class, MKDEV(0, 0)); class_unregister(&fcloop_class); + kmem_cache_destroy(lsreq_cache); } module_init(fcloop_init); From 84eedced1c5b84fe4f9740e4594b2dc99b569388 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 7 May 2025 14:23:07 +0200 Subject: [PATCH 23/42] nvmet-fcloop: drop response if targetport is gone When the target port is gone, the lsrsp pointer is invalid. Thus don't call the done function anymore instead just drop the response. This happens when the target sends a disconnect association. After this the target starts tearing down all resources and doesn't expect any response. 
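In code the rule is a simple early-out at the top of the response path: if the targetport pointer is already NULL, free the fcloop-owned fcloop_lsreq and return, without touching the response buffer or calling ->done(). From the diff that follows:

	if (!targetport) {
		/*
		 * Target side is gone: lsrsp and its ->done() are no longer
		 * valid, so only release what fcloop itself owns.
		 */
		kmem_cache_free(lsreq_cache, tls_req);
		return 0;
	}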
Signed-off-by: Daniel Wagner Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fcloop.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index b952672f38cb..4c60eaf0b653 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -494,16 +494,25 @@ fcloop_t2h_xmt_ls_rsp(struct nvme_fc_local_port *localport, struct nvmet_fc_target_port *targetport = rport->targetport; struct fcloop_tport *tport; - memcpy(lsreq->rspaddr, lsrsp->rspbuf, - ((lsreq->rsplen < lsrsp->rsplen) ? - lsreq->rsplen : lsrsp->rsplen)); - lsrsp->done(lsrsp); - if (!targetport) { + /* + * The target port is gone. The target doesn't expect any + * response anymore and the ->done call is not valid + * because the resources have been freed by + * nvmet_fc_free_pending_reqs. + * + * We end up here from delete association exchange: + * nvmet_fc_xmt_disconnect_assoc sends an async request. + */ kmem_cache_free(lsreq_cache, tls_req); return 0; } + memcpy(lsreq->rspaddr, lsrsp->rspbuf, + ((lsreq->rsplen < lsrsp->rsplen) ? + lsreq->rsplen : lsrsp->rsplen)); + lsrsp->done(lsrsp); + tport = targetport->private; spin_lock(&tport->lock); list_add_tail(&tls_req->ls_list, &tport->ls_list); From bbccbf791e6ffea86877eafb7489a63c6f7aef6d Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 7 May 2025 14:23:08 +0200 Subject: [PATCH 24/42] nvmet-fc: free pending reqs on tgtport unregister When nvmet_fc_unregister_targetport is called by the LLDD, it's not possible to communicate with the host, thus all pending request will not be process. Thus explicitly free them. Signed-off-by: Daniel Wagner Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fc.c | 41 +++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index a82cff9a8064..b8985559d1f1 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1583,6 +1583,39 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl) spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); } +static void +nvmet_fc_free_pending_reqs(struct nvmet_fc_tgtport *tgtport) +{ + struct nvmet_fc_ls_req_op *lsop; + struct nvmefc_ls_req *lsreq; + struct nvmet_fc_ls_iod *iod; + int i; + + iod = tgtport->iod; + for (i = 0; i < NVMET_LS_CTX_COUNT; iod++, i++) + cancel_work(&iod->work); + + /* + * After this point the connection is lost and thus any pending + * request can't be processed by the normal completion path. This + * is likely a request from nvmet_fc_send_ls_req_async. + */ + while ((lsop = list_first_entry_or_null(&tgtport->ls_req_list, + struct nvmet_fc_ls_req_op, lsreq_list))) { + list_del(&lsop->lsreq_list); + + if (!lsop->req_queued) + continue; + + lsreq = &lsop->ls_req; + fc_dma_unmap_single(tgtport->dev, lsreq->rqstdma, + (lsreq->rqstlen + lsreq->rsplen), + DMA_BIDIRECTIONAL); + nvmet_fc_tgtport_put(tgtport); + kfree(lsop); + } +} + /** * nvmet_fc_unregister_targetport - transport entry point called by an * LLDD to deregister/remove a previously @@ -1611,13 +1644,7 @@ nvmet_fc_unregister_targetport(struct nvmet_fc_target_port *target_port) flush_workqueue(nvmet_wq); - /* - * should terminate LS's as well. However, LS's will be generated - * at the tail end of association termination, so they likely don't - * exist yet. 
And even if they did, it's worthwhile to just let - * them finish and targetport ref counting will clean things up. - */ - + nvmet_fc_free_pending_reqs(tgtport); nvmet_fc_tgtport_put(tgtport); return 0; From 596cba55adb473f26bfd3f6dd78f169810069531 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 7 May 2025 14:23:09 +0200 Subject: [PATCH 25/42] nvmet-fc: take tgtport refs for portentry Ensure that the tgtport is not going away as long portentry has a pointer on it. Signed-off-by: Daniel Wagner Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fc.c | 44 +++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index b8985559d1f1..254537b93e63 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1257,6 +1257,7 @@ nvmet_fc_portentry_bind(struct nvmet_fc_tgtport *tgtport, { lockdep_assert_held(&nvmet_fc_tgtlock); + nvmet_fc_tgtport_get(tgtport); pe->tgtport = tgtport; tgtport->pe = pe; @@ -1276,8 +1277,10 @@ nvmet_fc_portentry_unbind(struct nvmet_fc_port_entry *pe) unsigned long flags; spin_lock_irqsave(&nvmet_fc_tgtlock, flags); - if (pe->tgtport) + if (pe->tgtport) { + nvmet_fc_tgtport_put(pe->tgtport); pe->tgtport->pe = NULL; + } list_del(&pe->pe_list); spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); } @@ -1295,8 +1298,10 @@ nvmet_fc_portentry_unbind_tgt(struct nvmet_fc_tgtport *tgtport) spin_lock_irqsave(&nvmet_fc_tgtlock, flags); pe = tgtport->pe; - if (pe) + if (pe) { + nvmet_fc_tgtport_put(pe->tgtport); pe->tgtport = NULL; + } tgtport->pe = NULL; spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); } @@ -1319,6 +1324,9 @@ nvmet_fc_portentry_rebind_tgt(struct nvmet_fc_tgtport *tgtport) list_for_each_entry(pe, &nvmet_fc_portentry_list, pe_list) { if (tgtport->fc_target_port.node_name == pe->node_name && tgtport->fc_target_port.port_name == pe->port_name) { + if (!nvmet_fc_tgtport_get(tgtport)) + continue; + WARN_ON(pe->tgtport); tgtport->pe = pe; pe->tgtport = tgtport; @@ -2888,12 +2896,17 @@ nvmet_fc_add_port(struct nvmet_port *port) list_for_each_entry(tgtport, &nvmet_fc_target_list, tgt_list) { if ((tgtport->fc_target_port.node_name == traddr.nn) && (tgtport->fc_target_port.port_name == traddr.pn)) { + if (!nvmet_fc_tgtport_get(tgtport)) + continue; + /* a FC port can only be 1 nvmet port id */ if (!tgtport->pe) { nvmet_fc_portentry_bind(tgtport, pe, port); ret = 0; } else ret = -EALREADY; + + nvmet_fc_tgtport_put(tgtport); break; } } @@ -2909,11 +2922,21 @@ static void nvmet_fc_remove_port(struct nvmet_port *port) { struct nvmet_fc_port_entry *pe = port->priv; + struct nvmet_fc_tgtport *tgtport = NULL; + unsigned long flags; + + spin_lock_irqsave(&nvmet_fc_tgtlock, flags); + if (pe->tgtport && nvmet_fc_tgtport_get(pe->tgtport)) + tgtport = pe->tgtport; + spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); nvmet_fc_portentry_unbind(pe); - /* terminate any outstanding associations */ - __nvmet_fc_free_assocs(pe->tgtport); + if (tgtport) { + /* terminate any outstanding associations */ + __nvmet_fc_free_assocs(tgtport); + nvmet_fc_tgtport_put(tgtport); + } kfree(pe); } @@ -2922,10 +2945,21 @@ static void nvmet_fc_discovery_chg(struct nvmet_port *port) { struct nvmet_fc_port_entry *pe = port->priv; - struct nvmet_fc_tgtport *tgtport = pe->tgtport; + struct nvmet_fc_tgtport *tgtport = NULL; + unsigned long flags; + + spin_lock_irqsave(&nvmet_fc_tgtlock, flags); + if (pe->tgtport && nvmet_fc_tgtport_get(pe->tgtport)) + tgtport = 
pe->tgtport; + spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); + + if (!tgtport) + return; if (tgtport && tgtport->ops->discovery_event) tgtport->ops->discovery_event(&tgtport->fc_target_port); + + nvmet_fc_tgtport_put(tgtport); } static ssize_t From d7f7c6eb809ae36466a503b20689956b86e13827 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 7 May 2025 14:23:02 +0200 Subject: [PATCH 26/42] nvmet-fcloop: add missing fcloop_callback_host_done Add the missing fcloop_call_host_done calls so that the caller frees resources when something goes wrong. Signed-off-by: Daniel Wagner Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fcloop.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 4c60eaf0b653..bac60c6de6cf 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -1002,9 +1002,10 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport, } spin_unlock(&inireq->inilock); - if (!tfcp_req) + if (!tfcp_req) { /* abort has already been called */ - return; + goto out_host_done; + } /* break initiator/target relationship for io */ spin_lock_irqsave(&tfcp_req->reqlock, flags); @@ -1019,7 +1020,7 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport, default: spin_unlock_irqrestore(&tfcp_req->reqlock, flags); WARN_ON(1); - return; + goto out_host_done; } spin_unlock_irqrestore(&tfcp_req->reqlock, flags); @@ -1033,6 +1034,11 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport, */ fcloop_tfcp_req_put(tfcp_req); } + + return; + +out_host_done: + fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED); } static void From 3466b7a6b713071190888526d3b9c58cda60b55f Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 7 May 2025 14:23:06 +0200 Subject: [PATCH 27/42] nvmet-fcloop: don't wait for lport cleanup The lifetime of the fcloop_lsreq is not tight to the lifetime of the host or target port, thus there is no need anymore to synchronize the cleanup path anymore. 
Signed-off-by: Daniel Wagner Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fcloop.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index bac60c6de6cf..257b497d515a 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -207,7 +207,6 @@ static LIST_HEAD(fcloop_nports); struct fcloop_lport { struct nvme_fc_local_port *localport; struct list_head lport_list; - struct completion unreg_done; refcount_t ref; }; @@ -1092,9 +1091,6 @@ fcloop_localport_delete(struct nvme_fc_local_port *localport) struct fcloop_lport_priv *lport_priv = localport->private; struct fcloop_lport *lport = lport_priv->lport; - /* release any threads waiting for the unreg to complete */ - complete(&lport->unreg_done); - fcloop_lport_put(lport); } @@ -1243,18 +1239,9 @@ fcloop_create_local_port(struct device *dev, struct device_attribute *attr, } static int -__wait_localport_unreg(struct fcloop_lport *lport) +__localport_unreg(struct fcloop_lport *lport) { - int ret; - - init_completion(&lport->unreg_done); - - ret = nvme_fc_unregister_localport(lport->localport); - - if (!ret) - wait_for_completion(&lport->unreg_done); - - return ret; + return nvme_fc_unregister_localport(lport->localport); } static struct fcloop_nport * @@ -1337,7 +1324,7 @@ fcloop_delete_local_port(struct device *dev, struct device_attribute *attr, if (!lport) return -ENOENT; - ret = __wait_localport_unreg(lport); + ret = __localport_unreg(lport); fcloop_lport_put(lport); return ret ? ret : count; @@ -1783,7 +1770,7 @@ static void __exit fcloop_exit(void) spin_unlock_irqrestore(&fcloop_lock, flags); - ret = __wait_localport_unreg(lport); + ret = __localport_unreg(lport); if (ret) pr_warn("%s: Failed deleting local port\n", __func__); From 0164d1350a651fd208a8c7138443dc4af82e0fa5 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 7 May 2025 14:23:10 +0200 Subject: [PATCH 28/42] nvme-fc: do not reference lsrsp after failure The lsrsp object is maintained by the LLDD. The lifetime of the lsrsp object is implicit. Because there is no explicit cleanup/free call into the LLDD, it is not safe to assume after xml_rsp_fails, that the lsrsp is still valid. The LLDD could have freed the object already. With the recent changes how fcloop tracks the resources, this is the case. Thus don't access lsrsp after xml_rsp_fails. 
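The resulting ownership rule: once ->xmt_ls_rsp() fails, lsrsp must be treated as gone, and the error path may only release the state the host side itself still holds (the lsop). The error leg, excerpted from the diff that follows:

	if (ret) {
		dev_warn(lport->dev,
			"LLDD rejected LS RSP xmt: LS %d status %d\n",
			w0->ls_cmd, ret);
		/* lsrsp may already have been freed by the LLDD */
		nvme_fc_xmt_ls_rsp_free(lsop);
		return;
	}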
Signed-off-by: Daniel Wagner Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 2257c3c96dd2..fdafa3e9e66f 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1410,9 +1410,8 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl) } static void -nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp) +nvme_fc_xmt_ls_rsp_free(struct nvmefc_ls_rcv_op *lsop) { - struct nvmefc_ls_rcv_op *lsop = lsrsp->nvme_fc_private; struct nvme_fc_rport *rport = lsop->rport; struct nvme_fc_lport *lport = rport->lport; unsigned long flags; @@ -1433,6 +1432,14 @@ nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp) nvme_fc_rport_put(rport); } +static void +nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp) +{ + struct nvmefc_ls_rcv_op *lsop = lsrsp->nvme_fc_private; + + nvme_fc_xmt_ls_rsp_free(lsop); +} + static void nvme_fc_xmt_ls_rsp(struct nvmefc_ls_rcv_op *lsop) { @@ -1450,7 +1457,7 @@ nvme_fc_xmt_ls_rsp(struct nvmefc_ls_rcv_op *lsop) dev_warn(lport->dev, "LLDD rejected LS RSP xmt: LS %d status %d\n", w0->ls_cmd, ret); - nvme_fc_xmt_ls_rsp_done(lsop->lsrsp); + nvme_fc_xmt_ls_rsp_free(lsop); return; } } From 1c9a93bf1d01729c54e9acbaa538db81f16563b2 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 25 Apr 2025 20:06:34 -0600 Subject: [PATCH 29/42] dmapool: add NUMA affinity support Introduce dma_pool_create_node(), like dma_pool_create() but taking an additional NUMA node argument. Allocate struct dma_pool on the desired node, and store the node on dma_pool for allocating struct dma_page. Make dma_pool_create() an alias for dma_pool_create_node() with node set to NUMA_NO_NODE. 
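As a usage note, the new entry point is a drop-in for dma_pool_create() with one extra argument, and dma_pool_create() itself becomes a wrapper that passes NUMA_NO_NODE. A hedged usage sketch (the device pointer, pool name and sizes below are placeholders, not taken from any real driver):

#include <linux/dmapool.h>
#include <linux/gfp.h>

static int example_use_pool(struct device *mydev, int node)
{
	struct dma_pool *pool;
	dma_addr_t handle;
	void *vaddr;

	/* 256-byte blocks, 256-byte alignment, no boundary restriction */
	pool = dma_pool_create_node("example pool", mydev, 256, 256, 0, node);
	if (!pool)
		return -ENOMEM;

	vaddr = dma_pool_alloc(pool, GFP_KERNEL, &handle);
	if (!vaddr) {
		dma_pool_destroy(pool);
		return -ENOMEM;
	}

	dma_pool_free(pool, vaddr, handle);
	dma_pool_destroy(pool);
	return 0;
}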
Signed-off-by: Keith Busch Signed-off-by: Caleb Sander Mateos Reviewed-by: Jens Axboe Reviewed-by: John Garry Reviewed-by: Sagi Grimberg Reviewed-by: Kanchan Joshi Signed-off-by: Christoph Hellwig --- include/linux/dmapool.h | 21 ++++++++++++++++----- mm/dmapool.c | 15 +++++++++------ 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/include/linux/dmapool.h b/include/linux/dmapool.h index f632ecfb4238..06c4de602b2f 100644 --- a/include/linux/dmapool.h +++ b/include/linux/dmapool.h @@ -11,6 +11,7 @@ #ifndef LINUX_DMAPOOL_H #define LINUX_DMAPOOL_H +#include #include #include @@ -18,8 +19,8 @@ struct device; #ifdef CONFIG_HAS_DMA -struct dma_pool *dma_pool_create(const char *name, struct device *dev, - size_t size, size_t align, size_t allocation); +struct dma_pool *dma_pool_create_node(const char *name, struct device *dev, + size_t size, size_t align, size_t boundary, int node); void dma_pool_destroy(struct dma_pool *pool); @@ -35,9 +36,12 @@ struct dma_pool *dmam_pool_create(const char *name, struct device *dev, void dmam_pool_destroy(struct dma_pool *pool); #else /* !CONFIG_HAS_DMA */ -static inline struct dma_pool *dma_pool_create(const char *name, - struct device *dev, size_t size, size_t align, size_t allocation) -{ return NULL; } +static inline struct dma_pool *dma_pool_create_node(const char *name, + struct device *dev, size_t size, size_t align, size_t boundary, + int node) +{ + return NULL; +} static inline void dma_pool_destroy(struct dma_pool *pool) { } static inline void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, dma_addr_t *handle) { return NULL; } @@ -49,6 +53,13 @@ static inline struct dma_pool *dmam_pool_create(const char *name, static inline void dmam_pool_destroy(struct dma_pool *pool) { } #endif /* !CONFIG_HAS_DMA */ +static inline struct dma_pool *dma_pool_create(const char *name, + struct device *dev, size_t size, size_t align, size_t boundary) +{ + return dma_pool_create_node(name, dev, size, align, boundary, + NUMA_NO_NODE); +} + static inline void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags, dma_addr_t *handle) { diff --git a/mm/dmapool.c b/mm/dmapool.c index f0bfc6c490f4..5be8cc1c6529 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -56,6 +56,7 @@ struct dma_pool { /* the pool */ unsigned int size; unsigned int allocation; unsigned int boundary; + int node; char name[32]; struct list_head pools; }; @@ -199,12 +200,13 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block, /** - * dma_pool_create - Creates a pool of consistent memory blocks, for dma. + * dma_pool_create_node - Creates a pool of consistent memory blocks, for dma. * @name: name of pool, for diagnostics * @dev: device that will be doing the DMA * @size: size of the blocks in this pool. * @align: alignment requirement for blocks; must be a power of two * @boundary: returned blocks won't cross this power of two boundary + * @node: optional NUMA node to allocate structs 'dma_pool' and 'dma_page' on * Context: not in_interrupt() * * Given one of these pools, dma_pool_alloc() @@ -221,8 +223,8 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block, * Return: a dma allocation pool with the requested characteristics, or * %NULL if one can't be created. 
*/ -struct dma_pool *dma_pool_create(const char *name, struct device *dev, - size_t size, size_t align, size_t boundary) +struct dma_pool *dma_pool_create_node(const char *name, struct device *dev, + size_t size, size_t align, size_t boundary, int node) { struct dma_pool *retval; size_t allocation; @@ -251,7 +253,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, boundary = min(boundary, allocation); - retval = kzalloc(sizeof(*retval), GFP_KERNEL); + retval = kzalloc_node(sizeof(*retval), GFP_KERNEL, node); if (!retval) return retval; @@ -264,6 +266,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, retval->size = size; retval->boundary = boundary; retval->allocation = allocation; + retval->node = node; INIT_LIST_HEAD(&retval->pools); /* @@ -295,7 +298,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, mutex_unlock(&pools_reg_lock); return retval; } -EXPORT_SYMBOL(dma_pool_create); +EXPORT_SYMBOL(dma_pool_create_node); static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page) { @@ -335,7 +338,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) { struct dma_page *page; - page = kmalloc(sizeof(*page), mem_flags); + page = kmalloc_node(sizeof(*page), mem_flags, pool->node); if (!page) return NULL; From b9d1ec530cdb631a3298de97c7b2ccc6145e1798 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 25 Apr 2025 20:06:35 -0600 Subject: [PATCH 30/42] nvme-pci: factor out a nvme_init_hctx_common() helper nvme_init_hctx() and nvme_admin_init_hctx() are very similar. In preparation for adding more logic, factor out a nvme_init_hctx-common() helper. Signed-off-by: Caleb Sander Mateos Reviewed-by: Jens Axboe Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Kanchan Joshi Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 2e30e9be7408..30abf134c886 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -397,28 +397,30 @@ static int nvme_pci_npages_prp(void) return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8); } -static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, - unsigned int hctx_idx) +static int nvme_init_hctx_common(struct blk_mq_hw_ctx *hctx, void *data, + unsigned qid) { struct nvme_dev *dev = to_nvme_dev(data); - struct nvme_queue *nvmeq = &dev->queues[0]; - - WARN_ON(hctx_idx != 0); - WARN_ON(dev->admin_tagset.tags[0] != hctx->tags); + struct nvme_queue *nvmeq = &dev->queues[qid]; + struct blk_mq_tags *tags; + tags = qid ? 
dev->tagset.tags[qid - 1] : dev->admin_tagset.tags[0]; + WARN_ON(tags != hctx->tags); hctx->driver_data = nvmeq; return 0; } -static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, - unsigned int hctx_idx) +static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) { - struct nvme_dev *dev = to_nvme_dev(data); - struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1]; + WARN_ON(hctx_idx != 0); + return nvme_init_hctx_common(hctx, data, 0); +} - WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags); - hctx->driver_data = nvmeq; - return 0; +static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + return nvme_init_hctx_common(hctx, data, hctx_idx + 1); } static int nvme_pci_init_request(struct blk_mq_tag_set *set, From d977506f8863807129d7a11f4057dfb1b38085ea Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Mon, 12 May 2025 15:50:40 +0200 Subject: [PATCH 31/42] nvme-pci: make PRP list DMA pools per-NUMA-node NVMe commands with over 8 KB of discontiguous data allocate PRP list pages from the per-nvme_device dma_pool prp_page_pool or prp_small_pool. Each call to dma_pool_alloc() and dma_pool_free() takes the per-dma_pool spinlock. These device-global spinlocks are a significant source of contention when many CPUs are submitting to the same NVMe devices. On a workload issuing 32 KB reads from 16 CPUs (8 hypertwin pairs) across 2 NUMA nodes to 23 NVMe devices, we observed 2.4% of CPU time spent in _raw_spin_lock_irqsave called from dma_pool_alloc and dma_pool_free. Ideally, the dma_pools would be per-hctx to minimize contention. But that could impose considerable resource costs in a system with many NVMe devices and CPUs. As a compromise, allocate per-NUMA-node PRP list DMA pools. Map each nvme_queue to the set of DMA pools corresponding to its device and its hctx's NUMA node. This reduces the _raw_spin_lock_irqsave overhead by about half, to 1.2%. Preventing the sharing of PRP list pages across NUMA nodes also makes them cheaper to initialize. Link: https://lore.kernel.org/linux-nvme/CADUfDZqa=OOTtTTznXRDmBQo1WrFcDw1hBA7XwM7hzJ-hpckcA@mail.gmail.com/T/#u Signed-off-by: Caleb Sander Mateos Reviewed-by: Jens Axboe Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Kanchan Joshi Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 144 +++++++++++++++++++++++----------------- 1 file changed, 84 insertions(+), 60 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 30abf134c886..e6b6b6ee0878 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -112,6 +113,11 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); static void nvme_delete_io_queues(struct nvme_dev *dev); static void nvme_update_attrs(struct nvme_dev *dev); +struct nvme_prp_dma_pools { + struct dma_pool *large; + struct dma_pool *small; +}; + /* * Represents an NVM Express device. Each nvme_dev is a PCI function. 
*/ @@ -121,8 +127,6 @@ struct nvme_dev { struct blk_mq_tag_set admin_tagset; u32 __iomem *dbs; struct device *dev; - struct dma_pool *prp_page_pool; - struct dma_pool *prp_small_pool; unsigned online_queues; unsigned max_qid; unsigned io_queues[HCTX_MAX_TYPES]; @@ -162,6 +166,7 @@ struct nvme_dev { unsigned int nr_allocated_queues; unsigned int nr_write_queues; unsigned int nr_poll_queues; + struct nvme_prp_dma_pools prp_pools[]; }; static int io_queue_depth_set(const char *val, const struct kernel_param *kp) @@ -191,6 +196,7 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl) */ struct nvme_queue { struct nvme_dev *dev; + struct nvme_prp_dma_pools prp_pools; spinlock_t sq_lock; void *sq_cmds; /* only used for poll queues: */ @@ -397,15 +403,64 @@ static int nvme_pci_npages_prp(void) return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8); } +static struct nvme_prp_dma_pools * +nvme_setup_prp_pools(struct nvme_dev *dev, unsigned numa_node) +{ + struct nvme_prp_dma_pools *prp_pools = &dev->prp_pools[numa_node]; + size_t small_align = 256; + + if (prp_pools->small) + return prp_pools; /* already initialized */ + + prp_pools->large = dma_pool_create_node("prp list page", dev->dev, + NVME_CTRL_PAGE_SIZE, + NVME_CTRL_PAGE_SIZE, 0, + numa_node); + if (!prp_pools->large) + return ERR_PTR(-ENOMEM); + + if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512) + small_align = 512; + + /* Optimisation for I/Os between 4k and 128k */ + prp_pools->small = dma_pool_create_node("prp list 256", dev->dev, + 256, small_align, 0, numa_node); + if (!prp_pools->small) { + dma_pool_destroy(prp_pools->large); + prp_pools->large = NULL; + return ERR_PTR(-ENOMEM); + } + + return prp_pools; +} + +static void nvme_release_prp_pools(struct nvme_dev *dev) +{ + unsigned i; + + for (i = 0; i < nr_node_ids; i++) { + struct nvme_prp_dma_pools *prp_pools = &dev->prp_pools[i]; + + dma_pool_destroy(prp_pools->large); + dma_pool_destroy(prp_pools->small); + } +} + static int nvme_init_hctx_common(struct blk_mq_hw_ctx *hctx, void *data, unsigned qid) { struct nvme_dev *dev = to_nvme_dev(data); struct nvme_queue *nvmeq = &dev->queues[qid]; + struct nvme_prp_dma_pools *prp_pools; struct blk_mq_tags *tags; tags = qid ? 
dev->tagset.tags[qid - 1] : dev->admin_tagset.tags[0]; WARN_ON(tags != hctx->tags); + prp_pools = nvme_setup_prp_pools(dev, hctx->numa_node); + if (IS_ERR(prp_pools)) + return PTR_ERR(prp_pools); + + nvmeq->prp_pools = *prp_pools; hctx->driver_data = nvmeq; return 0; } @@ -539,7 +594,7 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req, return true; } -static void nvme_free_prps(struct nvme_dev *dev, struct request *req) +static void nvme_free_prps(struct nvme_queue *nvmeq, struct request *req) { const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -550,12 +605,13 @@ static void nvme_free_prps(struct nvme_dev *dev, struct request *req) __le64 *prp_list = iod->list[i].prp_list; dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]); - dma_pool_free(dev->prp_page_pool, prp_list, dma_addr); + dma_pool_free(nvmeq->prp_pools.large, prp_list, dma_addr); dma_addr = next_dma_addr; } } -static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) +static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_queue *nvmeq, + struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -570,13 +626,13 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0); if (iod->nr_allocations == 0) - dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list, + dma_pool_free(nvmeq->prp_pools.small, iod->list[0].sg_list, iod->first_dma); else if (iod->nr_allocations == 1) - dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list, + dma_pool_free(nvmeq->prp_pools.large, iod->list[0].sg_list, iod->first_dma); else - nvme_free_prps(dev, req); + nvme_free_prps(nvmeq, req); mempool_free(iod->sgt.sgl, dev->iod_mempool); } @@ -594,7 +650,7 @@ static void nvme_print_sgl(struct scatterlist *sgl, int nents) } } -static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, +static blk_status_t nvme_pci_setup_prps(struct nvme_queue *nvmeq, struct request *req, struct nvme_rw_command *cmnd) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -630,10 +686,10 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE); if (nprps <= (256 / 8)) { - pool = dev->prp_small_pool; + pool = nvmeq->prp_pools.small; iod->nr_allocations = 0; } else { - pool = dev->prp_page_pool; + pool = nvmeq->prp_pools.large; iod->nr_allocations = 1; } @@ -675,7 +731,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma); return BLK_STS_OK; free_prps: - nvme_free_prps(dev, req); + nvme_free_prps(nvmeq, req); return BLK_STS_RESOURCE; bad_sgl: WARN(DO_ONCE(nvme_print_sgl, iod->sgt.sgl, iod->sgt.nents), @@ -700,7 +756,7 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge, sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4; } -static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, +static blk_status_t nvme_pci_setup_sgls(struct nvme_queue *nvmeq, struct request *req, struct nvme_rw_command *cmd) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -720,10 +776,10 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, } if (entries <= (256 / sizeof(struct nvme_sgl_desc))) { - pool = dev->prp_small_pool; + pool = nvmeq->prp_pools.small; iod->nr_allocations = 0; } else { - pool = dev->prp_page_pool; + pool = nvmeq->prp_pools.large; iod->nr_allocations = 1; } @@ -787,12 +843,12 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev, 
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, struct nvme_command *cmnd) { + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); blk_status_t ret = BLK_STS_RESOURCE; int rc; if (blk_rq_nr_phys_segments(req) == 1) { - struct nvme_queue *nvmeq = req->mq_hctx->driver_data; struct bio_vec bv = req_bvec(req); if (!is_pci_p2pdma_page(bv.bv_page)) { @@ -827,9 +883,9 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, } if (nvme_pci_use_sgls(dev, req, iod->sgt.nents)) - ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw); + ret = nvme_pci_setup_sgls(nvmeq, req, &cmnd->rw); else - ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); + ret = nvme_pci_setup_prps(nvmeq, req, &cmnd->rw); if (ret != BLK_STS_OK) goto out_unmap_sg; return BLK_STS_OK; @@ -844,6 +900,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev, struct request *req) { + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_rw_command *cmnd = &iod->cmd.rw; struct nvme_sgl_desc *sg_list; @@ -867,7 +924,7 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev, if (rc) goto out_free_sg; - sg_list = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, &sgl_dma); + sg_list = dma_pool_alloc(nvmeq->prp_pools.small, GFP_ATOMIC, &sgl_dma); if (!sg_list) goto out_unmap_sg; @@ -949,7 +1006,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) return BLK_STS_OK; out_unmap_data: if (blk_rq_nr_phys_segments(req)) - nvme_unmap_data(dev, req); + nvme_unmap_data(dev, req->mq_hctx->driver_data, req); out_free_cmd: nvme_cleanup_cmd(req); return ret; @@ -1039,6 +1096,7 @@ static void nvme_queue_rqs(struct rq_list *rqlist) } static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev, + struct nvme_queue *nvmeq, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -1050,7 +1108,7 @@ static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev, return; } - dma_pool_free(dev->prp_small_pool, iod->meta_list.sg_list, + dma_pool_free(nvmeq->prp_pools.small, iod->meta_list.sg_list, iod->meta_dma); dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0); mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool); @@ -1062,10 +1120,10 @@ static __always_inline void nvme_pci_unmap_rq(struct request *req) struct nvme_dev *dev = nvmeq->dev; if (blk_integrity_rq(req)) - nvme_unmap_metadata(dev, req); + nvme_unmap_metadata(dev, nvmeq, req); if (blk_rq_nr_phys_segments(req)) - nvme_unmap_data(dev, req); + nvme_unmap_data(dev, nvmeq, req); } static void nvme_pci_complete_rq(struct request *req) @@ -2842,35 +2900,6 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown) return 0; } -static int nvme_setup_prp_pools(struct nvme_dev *dev) -{ - size_t small_align = 256; - - dev->prp_page_pool = dma_pool_create("prp list page", dev->dev, - NVME_CTRL_PAGE_SIZE, - NVME_CTRL_PAGE_SIZE, 0); - if (!dev->prp_page_pool) - return -ENOMEM; - - if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512) - small_align = 512; - - /* Optimisation for I/Os between 4k and 128k */ - dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev, - 256, small_align, 0); - if (!dev->prp_small_pool) { - dma_pool_destroy(dev->prp_page_pool); - return -ENOMEM; - } - return 0; -} - -static void nvme_release_prp_pools(struct nvme_dev *dev) -{ - 
dma_pool_destroy(dev->prp_page_pool); - dma_pool_destroy(dev->prp_small_pool); -} - static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) { size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1); @@ -3185,7 +3214,8 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, struct nvme_dev *dev; int ret = -ENOMEM; - dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); + dev = kzalloc_node(sizeof(*dev) + nr_node_ids * sizeof(*dev->prp_pools), + GFP_KERNEL, node); if (!dev) return ERR_PTR(-ENOMEM); INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); @@ -3260,13 +3290,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto out_uninit_ctrl; - result = nvme_setup_prp_pools(dev); - if (result) - goto out_dev_unmap; - result = nvme_pci_alloc_iod_mempool(dev); if (result) - goto out_release_prp_pools; + goto out_dev_unmap; dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); @@ -3342,8 +3368,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) out_release_iod_mempool: mempool_destroy(dev->iod_mempool); mempool_destroy(dev->iod_meta_mempool); -out_release_prp_pools: - nvme_release_prp_pools(dev); out_dev_unmap: nvme_dev_unmap(dev); out_uninit_ctrl: From a40c20a605ed48159ea3d9a2bd4532c167995aa3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 11 May 2025 06:04:27 +0200 Subject: [PATCH 32/42] nvme-pci: don't try to use SGLs for metadata on the admin queue No admin command defined in an NVMe specification supports metadata, but to protect against vendor specific commands using metadata ensure that we don't try to use SGLs for metadata on the admin queue, as NVMe does not support SGLs on the admin queue for the PCI transport. Do this by checking if the data transfer has been setup using SGLs as that is required for using SGLs for metadata. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Leon Romanovsky --- drivers/nvme/host/pci.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e6b6b6ee0878..24f5292a349b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -971,7 +971,10 @@ static blk_status_t nvme_pci_setup_meta_mptr(struct nvme_dev *dev, static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req) { - if (nvme_pci_metadata_use_sgls(dev, req)) + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + + if ((iod->cmd.common.flags & NVME_CMD_SGL_METABUF) && + nvme_pci_metadata_use_sgls(dev, req)) return nvme_pci_setup_meta_sgls(dev, req); return nvme_pci_setup_meta_mptr(dev, req); } From 906573c3bfe382d326952c1a72ee9a06448c1db1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sat, 10 May 2025 05:47:03 +0200 Subject: [PATCH 33/42] nvme-pci: store aborted state in flags variable Instead of keeping dedicated "bool aborted" variable, switch to a flags flags that can be used for other flags as well. 
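The switch from a dedicated bool to a small flags word is the usual bit-flag idiom, leaving room for the additional per-command state the commit message anticipates. A minimal sketch mirroring the patch (the struct and helper names here are illustrative only):

#include <linux/types.h>

/* bits for iod->flags */
enum nvme_iod_flags {
	IOD_ABORTED = 1U << 0,	/* command aborted by the timeout handler */
};

struct example_iod {
	u8 flags;	/* replaces the old "bool aborted" */
};

static void example_iod_mark_aborted(struct example_iod *iod)
{
	iod->flags |= IOD_ABORTED;
}

static bool example_iod_aborted(struct example_iod *iod)
{
	return iod->flags & IOD_ABORTED;
}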
Signed-off-by: Leon Romanovsky Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Kanchan Joshi Reviewed-by: Caleb Sander Mateos --- drivers/nvme/host/pci.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 24f5292a349b..51430f5e6a66 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -230,6 +230,12 @@ union nvme_descriptor { __le64 *prp_list; }; +/* bits for iod->flags */ +enum nvme_iod_flags { + /* this command has been aborted by the timeout handler */ + IOD_ABORTED = 1U << 0, +}; + /* * The nvme_iod describes the data in an I/O. * @@ -239,7 +245,7 @@ union nvme_descriptor { struct nvme_iod { struct nvme_request req; struct nvme_command cmd; - bool aborted; + u8 flags; s8 nr_allocations; /* PRP list pool allocations. 0 means small pool in use */ unsigned int dma_len; /* length of single DMA segment mapping */ @@ -984,7 +990,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) struct nvme_iod *iod = blk_mq_rq_to_pdu(req); blk_status_t ret; - iod->aborted = false; + iod->flags = 0; iod->nr_allocations = -1; iod->sgt.nents = 0; iod->meta_sgt.nents = 0; @@ -1551,7 +1557,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) * returned to the driver, or if this is the admin queue. */ opcode = nvme_req(req)->cmd->common.opcode; - if (!nvmeq->qid || iod->aborted) { + if (!nvmeq->qid || (iod->flags & IOD_ABORTED)) { dev_warn(dev->ctrl.device, "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n", req->tag, nvme_cid(req), opcode, @@ -1564,7 +1570,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) atomic_inc(&dev->ctrl.abort_limit); return BLK_EH_RESET_TIMER; } - iod->aborted = true; + iod->flags |= IOD_ABORTED; cmd.abort.opcode = nvme_admin_abort_cmd; cmd.abort.cid = nvme_cid(req); From 1755b32516bb42123b0030080226d6079999621d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 10 May 2025 05:49:41 +0200 Subject: [PATCH 34/42] nvme-pci: remove struct nvme_descriptor There is no real point in having a union of two pointer types here, just use a void pointer as we mix and match types between the arms of the union between the allocation and freeing side already. Also rename the nr_allocations field to nr_descriptors to better describe what it does. Signed-off-by: Christoph Hellwig [leon: ported forward to include metadata SGL support] Signed-off-by: Leon Romanovsky Reviewed-by: Keith Busch Reviewed-by: Kanchan Joshi --- drivers/nvme/host/pci.c | 57 +++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 33 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 51430f5e6a66..f332b4ee7809 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -44,7 +44,7 @@ #define NVME_MAX_KB_SZ 8192 #define NVME_MAX_SEGS 128 #define NVME_MAX_META_SEGS 15 -#define NVME_MAX_NR_ALLOCATIONS 5 +#define NVME_MAX_NR_DESCRIPTORS 5 static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0444); @@ -225,11 +225,6 @@ struct nvme_queue { struct completion delete_done; }; -union nvme_descriptor { - struct nvme_sgl_desc *sg_list; - __le64 *prp_list; -}; - /* bits for iod->flags */ enum nvme_iod_flags { /* this command has been aborted by the timeout handler */ @@ -238,23 +233,19 @@ enum nvme_iod_flags { /* * The nvme_iod describes the data in an I/O. 
- * - * The sg pointer contains the list of PRP/SGL chunk allocations in addition - * to the actual struct scatterlist. */ struct nvme_iod { struct nvme_request req; struct nvme_command cmd; u8 flags; - s8 nr_allocations; /* PRP list pool allocations. 0 means small - pool in use */ + s8 nr_descriptors; unsigned int dma_len; /* length of single DMA segment mapping */ dma_addr_t first_dma; dma_addr_t meta_dma; struct sg_table sgt; struct sg_table meta_sgt; - union nvme_descriptor meta_list; - union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS]; + struct nvme_sgl_desc *meta_descriptor; + void *descriptors[NVME_MAX_NR_DESCRIPTORS]; }; static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev) @@ -607,8 +598,8 @@ static void nvme_free_prps(struct nvme_queue *nvmeq, struct request *req) dma_addr_t dma_addr = iod->first_dma; int i; - for (i = 0; i < iod->nr_allocations; i++) { - __le64 *prp_list = iod->list[i].prp_list; + for (i = 0; i < iod->nr_descriptors; i++) { + __le64 *prp_list = iod->descriptors[i]; dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]); dma_pool_free(nvmeq->prp_pools.large, prp_list, dma_addr); @@ -631,11 +622,11 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_queue *nvmeq, dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0); - if (iod->nr_allocations == 0) - dma_pool_free(nvmeq->prp_pools.small, iod->list[0].sg_list, + if (iod->nr_descriptors == 0) + dma_pool_free(nvmeq->prp_pools.small, iod->descriptors[0], iod->first_dma); - else if (iod->nr_allocations == 1) - dma_pool_free(nvmeq->prp_pools.large, iod->list[0].sg_list, + else if (iod->nr_descriptors == 1) + dma_pool_free(nvmeq->prp_pools.large, iod->descriptors[0], iod->first_dma); else nvme_free_prps(nvmeq, req); @@ -693,18 +684,18 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_queue *nvmeq, nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE); if (nprps <= (256 / 8)) { pool = nvmeq->prp_pools.small; - iod->nr_allocations = 0; + iod->nr_descriptors = 0; } else { pool = nvmeq->prp_pools.large; - iod->nr_allocations = 1; + iod->nr_descriptors = 1; } prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); if (!prp_list) { - iod->nr_allocations = -1; + iod->nr_descriptors = -1; return BLK_STS_RESOURCE; } - iod->list[0].prp_list = prp_list; + iod->descriptors[0] = prp_list; iod->first_dma = prp_dma; i = 0; for (;;) { @@ -713,7 +704,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_queue *nvmeq, prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); if (!prp_list) goto free_prps; - iod->list[iod->nr_allocations++].prp_list = prp_list; + iod->descriptors[iod->nr_descriptors++] = prp_list; prp_list[0] = old_prp_list[i - 1]; old_prp_list[i - 1] = cpu_to_le64(prp_dma); i = 1; @@ -783,19 +774,19 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_queue *nvmeq, if (entries <= (256 / sizeof(struct nvme_sgl_desc))) { pool = nvmeq->prp_pools.small; - iod->nr_allocations = 0; + iod->nr_descriptors = 0; } else { pool = nvmeq->prp_pools.large; - iod->nr_allocations = 1; + iod->nr_descriptors = 1; } sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); if (!sg_list) { - iod->nr_allocations = -1; + iod->nr_descriptors = -1; return BLK_STS_RESOURCE; } - iod->list[0].sg_list = sg_list; + iod->descriptors[0] = sg_list; iod->first_dma = sgl_dma; nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries); @@ -935,7 +926,7 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev, goto out_unmap_sg; entries = iod->meta_sgt.nents; - iod->meta_list.sg_list = sg_list; + iod->meta_descriptor 
= sg_list; iod->meta_dma = sgl_dma; cmnd->flags = NVME_CMD_SGL_METASEG; @@ -991,7 +982,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) blk_status_t ret; iod->flags = 0; - iod->nr_allocations = -1; + iod->nr_descriptors = -1; iod->sgt.nents = 0; iod->meta_sgt.nents = 0; @@ -1117,8 +1108,8 @@ static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev, return; } - dma_pool_free(nvmeq->prp_pools.small, iod->meta_list.sg_list, - iod->meta_dma); + dma_pool_free(nvmeq->prp_pools.small, iod->meta_descriptor, + iod->meta_dma); dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0); mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool); } @@ -3842,7 +3833,7 @@ static int __init nvme_init(void) BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2); BUILD_BUG_ON(NVME_MAX_SEGS > SGES_PER_PAGE); BUILD_BUG_ON(sizeof(struct scatterlist) * NVME_MAX_SEGS > PAGE_SIZE); - BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_ALLOCATIONS); + BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_DESCRIPTORS); return pci_register_driver(&nvme_driver); } From 357b536b3633afc174905f791cc00fd9fd2932b0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 May 2025 17:13:33 +0200 Subject: [PATCH 35/42] nvme-pci: rename the descriptor pools They are used for both PRPs and SGLs, and we use descriptor elsewhere when referring to their allocations, so use that name here as well. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Kanchan Joshi Reviewed-by: Caleb Sander Mateos Reviewed-by: Leon Romanovsky --- drivers/nvme/host/pci.c | 84 ++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f332b4ee7809..b880168ab520 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -113,7 +113,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); static void nvme_delete_io_queues(struct nvme_dev *dev); static void nvme_update_attrs(struct nvme_dev *dev); -struct nvme_prp_dma_pools { +struct nvme_descriptor_pools { struct dma_pool *large; struct dma_pool *small; }; @@ -166,7 +166,7 @@ struct nvme_dev { unsigned int nr_allocated_queues; unsigned int nr_write_queues; unsigned int nr_poll_queues; - struct nvme_prp_dma_pools prp_pools[]; + struct nvme_descriptor_pools descriptor_pools[]; }; static int io_queue_depth_set(const char *val, const struct kernel_param *kp) @@ -196,7 +196,7 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl) */ struct nvme_queue { struct nvme_dev *dev; - struct nvme_prp_dma_pools prp_pools; + struct nvme_descriptor_pools descriptor_pools; spinlock_t sq_lock; void *sq_cmds; /* only used for poll queues: */ @@ -400,46 +400,44 @@ static int nvme_pci_npages_prp(void) return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8); } -static struct nvme_prp_dma_pools * -nvme_setup_prp_pools(struct nvme_dev *dev, unsigned numa_node) +static struct nvme_descriptor_pools * +nvme_setup_descriptor_pools(struct nvme_dev *dev, unsigned numa_node) { - struct nvme_prp_dma_pools *prp_pools = &dev->prp_pools[numa_node]; + struct nvme_descriptor_pools *pools = &dev->descriptor_pools[numa_node]; size_t small_align = 256; - if (prp_pools->small) - return prp_pools; /* already initialized */ + if (pools->small) + return pools; /* already initialized */ - prp_pools->large = dma_pool_create_node("prp list page", dev->dev, - NVME_CTRL_PAGE_SIZE, - NVME_CTRL_PAGE_SIZE, 0, - numa_node); - if (!prp_pools->large) + pools->large = 
dma_pool_create_node("nvme descriptor page", dev->dev, + NVME_CTRL_PAGE_SIZE, NVME_CTRL_PAGE_SIZE, 0, numa_node); + if (!pools->large) return ERR_PTR(-ENOMEM); if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512) small_align = 512; /* Optimisation for I/Os between 4k and 128k */ - prp_pools->small = dma_pool_create_node("prp list 256", dev->dev, - 256, small_align, 0, numa_node); - if (!prp_pools->small) { - dma_pool_destroy(prp_pools->large); - prp_pools->large = NULL; + pools->small = dma_pool_create_node("nvme descriptor 256", dev->dev, + 256, small_align, 0, numa_node); + if (!pools->small) { + dma_pool_destroy(pools->large); + pools->large = NULL; return ERR_PTR(-ENOMEM); } - return prp_pools; + return pools; } -static void nvme_release_prp_pools(struct nvme_dev *dev) +static void nvme_release_descriptor_pools(struct nvme_dev *dev) { unsigned i; for (i = 0; i < nr_node_ids; i++) { - struct nvme_prp_dma_pools *prp_pools = &dev->prp_pools[i]; + struct nvme_descriptor_pools *pools = &dev->descriptor_pools[i]; - dma_pool_destroy(prp_pools->large); - dma_pool_destroy(prp_pools->small); + dma_pool_destroy(pools->large); + dma_pool_destroy(pools->small); } } @@ -448,16 +446,16 @@ static int nvme_init_hctx_common(struct blk_mq_hw_ctx *hctx, void *data, { struct nvme_dev *dev = to_nvme_dev(data); struct nvme_queue *nvmeq = &dev->queues[qid]; - struct nvme_prp_dma_pools *prp_pools; + struct nvme_descriptor_pools *pools; struct blk_mq_tags *tags; tags = qid ? dev->tagset.tags[qid - 1] : dev->admin_tagset.tags[0]; WARN_ON(tags != hctx->tags); - prp_pools = nvme_setup_prp_pools(dev, hctx->numa_node); - if (IS_ERR(prp_pools)) - return PTR_ERR(prp_pools); + pools = nvme_setup_descriptor_pools(dev, hctx->numa_node); + if (IS_ERR(pools)) + return PTR_ERR(pools); - nvmeq->prp_pools = *prp_pools; + nvmeq->descriptor_pools = *pools; hctx->driver_data = nvmeq; return 0; } @@ -602,7 +600,8 @@ static void nvme_free_prps(struct nvme_queue *nvmeq, struct request *req) __le64 *prp_list = iod->descriptors[i]; dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]); - dma_pool_free(nvmeq->prp_pools.large, prp_list, dma_addr); + dma_pool_free(nvmeq->descriptor_pools.large, prp_list, + dma_addr); dma_addr = next_dma_addr; } } @@ -623,11 +622,11 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_queue *nvmeq, dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0); if (iod->nr_descriptors == 0) - dma_pool_free(nvmeq->prp_pools.small, iod->descriptors[0], - iod->first_dma); + dma_pool_free(nvmeq->descriptor_pools.small, + iod->descriptors[0], iod->first_dma); else if (iod->nr_descriptors == 1) - dma_pool_free(nvmeq->prp_pools.large, iod->descriptors[0], - iod->first_dma); + dma_pool_free(nvmeq->descriptor_pools.large, + iod->descriptors[0], iod->first_dma); else nvme_free_prps(nvmeq, req); mempool_free(iod->sgt.sgl, dev->iod_mempool); @@ -683,10 +682,10 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_queue *nvmeq, nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE); if (nprps <= (256 / 8)) { - pool = nvmeq->prp_pools.small; + pool = nvmeq->descriptor_pools.small; iod->nr_descriptors = 0; } else { - pool = nvmeq->prp_pools.large; + pool = nvmeq->descriptor_pools.large; iod->nr_descriptors = 1; } @@ -773,10 +772,10 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_queue *nvmeq, } if (entries <= (256 / sizeof(struct nvme_sgl_desc))) { - pool = nvmeq->prp_pools.small; + pool = nvmeq->descriptor_pools.small; iod->nr_descriptors = 0; } else { - pool = nvmeq->prp_pools.large; + pool = 
nvmeq->descriptor_pools.large; iod->nr_descriptors = 1; } @@ -921,7 +920,8 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev, if (rc) goto out_free_sg; - sg_list = dma_pool_alloc(nvmeq->prp_pools.small, GFP_ATOMIC, &sgl_dma); + sg_list = dma_pool_alloc(nvmeq->descriptor_pools.small, GFP_ATOMIC, + &sgl_dma); if (!sg_list) goto out_unmap_sg; @@ -1108,7 +1108,7 @@ static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev, return; } - dma_pool_free(nvmeq->prp_pools.small, iod->meta_descriptor, + dma_pool_free(nvmeq->descriptor_pools.small, iod->meta_descriptor, iod->meta_dma); dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0); mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool); @@ -3214,8 +3214,8 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, struct nvme_dev *dev; int ret = -ENOMEM; - dev = kzalloc_node(sizeof(*dev) + nr_node_ids * sizeof(*dev->prp_pools), - GFP_KERNEL, node); + dev = kzalloc_node(sizeof(*dev) + nr_node_ids * + sizeof(*dev->descriptor_pools), GFP_KERNEL, node); if (!dev) return ERR_PTR(-ENOMEM); INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); @@ -3432,7 +3432,7 @@ static void nvme_remove(struct pci_dev *pdev) nvme_free_queues(dev, 0); mempool_destroy(dev->iod_mempool); mempool_destroy(dev->iod_meta_mempool); - nvme_release_prp_pools(dev); + nvme_release_descriptor_pools(dev); nvme_dev_unmap(dev); nvme_uninit_ctrl(&dev->ctrl); } From a43d304f3abea73883f99287396a6e1eb57c3637 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 May 2025 17:16:04 +0200 Subject: [PATCH 36/42] nvme-pci: use a better encoding for small prp pool allocations Add a separate flag to encode that the transfer is using the small page sized pool, and use a normal 0..n count for the number of descriptors. Contains improvements and suggestions from Kanchan Joshi and Leon Romanovsky . 
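A compact userspace model of the resulting encoding follows (plain malloc/free stand in for the DMA pools; only the IOD_SMALL_DESCRIPTOR flag and the plain 0..n descriptor count mirror the patch, the helper names are invented):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define SMALL_POOL_SIZE 256     /* the 256-byte "small" pool */
#define LARGE_POOL_SIZE 4096    /* the page-sized "large" pool */
#define MAX_DESCRIPTORS 5

enum { IOD_SMALL_DESCRIPTOR = 1U << 1 };

struct iod_example {
        uint8_t flags;
        uint8_t nr_descriptors;         /* plain 0..n count */
        void *descriptors[MAX_DESCRIPTORS];
};

/* mirrors nvme_dma_pool(): the flag alone records which pool was used */
static size_t pool_size(const struct iod_example *iod)
{
        return (iod->flags & IOD_SMALL_DESCRIPTOR) ? SMALL_POOL_SIZE : LARGE_POOL_SIZE;
}

static int setup(struct iod_example *iod, size_t entries, size_t entry_size)
{
        void *desc;

        if (entries * entry_size <= SMALL_POOL_SIZE)
                iod->flags |= IOD_SMALL_DESCRIPTOR;

        desc = malloc(pool_size(iod));  /* stands in for dma_pool_alloc() */
        if (!desc)
                return -1;
        iod->descriptors[iod->nr_descriptors++] = desc;
        return 0;
}

static void teardown(struct iod_example *iod)
{
        /* no more -1/0/1 special cases: just free what was counted */
        while (iod->nr_descriptors)
                free(iod->descriptors[--iod->nr_descriptors]);
}

int main(void)
{
        struct iod_example iod = { 0 };

        /* 15 SGL descriptors of 16 bytes each fit the small pool */
        if (setup(&iod, 15, 16) == 0)
                printf("flags=%#x nr_descriptors=%u\n",
                       (unsigned)iod.flags, (unsigned)iod.nr_descriptors);
        teardown(&iod);
        return 0;
}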
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Kanchan Joshi Reviewed-by: Leon Romanovsky --- drivers/nvme/host/pci.c | 82 ++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 43 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index b880168ab520..5017d6c56519 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -229,6 +229,9 @@ struct nvme_queue { enum nvme_iod_flags { /* this command has been aborted by the timeout handler */ IOD_ABORTED = 1U << 0, + + /* uses the small descriptor pool */ + IOD_SMALL_DESCRIPTOR = 1U << 1, }; /* @@ -238,7 +241,7 @@ struct nvme_iod { struct nvme_request req; struct nvme_command cmd; u8 flags; - s8 nr_descriptors; + u8 nr_descriptors; unsigned int dma_len; /* length of single DMA segment mapping */ dma_addr_t first_dma; dma_addr_t meta_dma; @@ -589,13 +592,27 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req, return true; } -static void nvme_free_prps(struct nvme_queue *nvmeq, struct request *req) +static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq, + struct nvme_iod *iod) +{ + if (iod->flags & IOD_SMALL_DESCRIPTOR) + return nvmeq->descriptor_pools.small; + return nvmeq->descriptor_pools.large; +} + +static void nvme_free_descriptors(struct nvme_queue *nvmeq, struct request *req) { const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); dma_addr_t dma_addr = iod->first_dma; int i; + if (iod->nr_descriptors == 1) { + dma_pool_free(nvme_dma_pool(nvmeq, iod), iod->descriptors[0], + dma_addr); + return; + } + for (i = 0; i < iod->nr_descriptors; i++) { __le64 *prp_list = iod->descriptors[i]; dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]); @@ -620,15 +637,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_queue *nvmeq, WARN_ON_ONCE(!iod->sgt.nents); dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0); - - if (iod->nr_descriptors == 0) - dma_pool_free(nvmeq->descriptor_pools.small, - iod->descriptors[0], iod->first_dma); - else if (iod->nr_descriptors == 1) - dma_pool_free(nvmeq->descriptor_pools.large, - iod->descriptors[0], iod->first_dma); - else - nvme_free_prps(nvmeq, req); + nvme_free_descriptors(nvmeq, req); mempool_free(iod->sgt.sgl, dev->iod_mempool); } @@ -650,7 +659,6 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_queue *nvmeq, struct request *req, struct nvme_rw_command *cmnd) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct dma_pool *pool; int length = blk_rq_payload_bytes(req); struct scatterlist *sg = iod->sgt.sgl; int dma_len = sg_dma_len(sg); @@ -658,7 +666,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_queue *nvmeq, int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1); __le64 *prp_list; dma_addr_t prp_dma; - int nprps, i; + int i; length -= (NVME_CTRL_PAGE_SIZE - offset); if (length <= 0) { @@ -680,27 +688,23 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_queue *nvmeq, goto done; } - nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE); - if (nprps <= (256 / 8)) { - pool = nvmeq->descriptor_pools.small; - iod->nr_descriptors = 0; - } else { - pool = nvmeq->descriptor_pools.large; - iod->nr_descriptors = 1; - } + if (DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE) <= + 256 / sizeof(__le64)) + iod->flags |= IOD_SMALL_DESCRIPTOR; - prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); - if (!prp_list) { - iod->nr_descriptors = -1; + prp_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC, + 
&prp_dma); + if (!prp_list) return BLK_STS_RESOURCE; - } - iod->descriptors[0] = prp_list; + iod->descriptors[iod->nr_descriptors++] = prp_list; iod->first_dma = prp_dma; i = 0; for (;;) { if (i == NVME_CTRL_PAGE_SIZE >> 3) { __le64 *old_prp_list = prp_list; + + prp_list = dma_pool_alloc(nvmeq->descriptor_pools.large, + GFP_ATOMIC, &prp_dma); if (!prp_list) goto free_prps; iod->descriptors[iod->nr_descriptors++] = prp_list; @@ -727,7 +731,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_queue *nvmeq, cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma); return BLK_STS_OK; free_prps: - nvme_free_prps(nvmeq, req); + nvme_free_descriptors(nvmeq, req); return BLK_STS_RESOURCE; bad_sgl: WARN(DO_ONCE(nvme_print_sgl, iod->sgt.sgl, iod->sgt.nents), @@ -756,7 +760,6 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_queue *nvmeq, struct request *req, struct nvme_rw_command *cmd) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct dma_pool *pool; struct nvme_sgl_desc *sg_list; struct scatterlist *sg = iod->sgt.sgl; unsigned int entries = iod->sgt.nents; @@ -771,21 +774,14 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_queue *nvmeq, return BLK_STS_OK; } - if (entries <= (256 / sizeof(struct nvme_sgl_desc))) { - pool = nvmeq->descriptor_pools.small; - iod->nr_descriptors = 0; - } else { - pool = nvmeq->descriptor_pools.large; - iod->nr_descriptors = 1; - } + if (entries <= 256 / sizeof(*sg_list)) + iod->flags |= IOD_SMALL_DESCRIPTOR; - sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); - if (!sg_list) { - iod->nr_descriptors = -1; + sg_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC, + &sgl_dma); + if (!sg_list) return BLK_STS_RESOURCE; - } - - iod->descriptors[0] = sg_list; + iod->descriptors[iod->nr_descriptors++] = sg_list; iod->first_dma = sgl_dma; nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries); @@ -982,7 +978,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) blk_status_t ret; iod->flags = 0; - iod->nr_descriptors = -1; + iod->nr_descriptors = 0; iod->sgt.nents = 0; iod->meta_sgt.nents = 0; From f01e389e88b27a55674bc11d5d44dc75f0d83745 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 12 May 2025 17:17:27 +0200 Subject: [PATCH 37/42] nvme-pci: add a symbolic name for the small pool size Open-coding magic numbers in multiple places is never a good idea. Signed-off-by: Leon Romanovsky [hch: split from a larger patch] Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Kanchan Joshi Reviewed-by: Caleb Sander Mateos --- drivers/nvme/host/pci.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 5017d6c56519..2dbe6757e1a3 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -37,6 +37,9 @@ #define SGES_PER_PAGE (NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc)) +/* Optimisation for I/Os between 4k and 128k */ +#define NVME_SMALL_POOL_SIZE 256 + /* * These can be higher, but we need to ensure that any command doesn't * require an sg allocation that needs more than a page of data.
@@ -407,7 +410,7 @@ static struct nvme_descriptor_pools * nvme_setup_descriptor_pools(struct nvme_dev *dev, unsigned numa_node) { struct nvme_descriptor_pools *pools = &dev->descriptor_pools[numa_node]; - size_t small_align = 256; + size_t small_align = NVME_SMALL_POOL_SIZE; if (pools->small) return pools; /* already initialized */ @@ -420,9 +423,8 @@ nvme_setup_descriptor_pools(struct nvme_dev *dev, unsigned numa_node) if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512) small_align = 512; - /* Optimisation for I/Os between 4k and 128k */ - pools->small = dma_pool_create_node("nvme descriptor 256", dev->dev, - 256, small_align, 0, numa_node); + pools->small = dma_pool_create_node("nvme descriptor small", dev->dev, + NVME_SMALL_POOL_SIZE, small_align, 0, numa_node); if (!pools->small) { dma_pool_destroy(pools->large); pools->large = NULL; @@ -689,7 +691,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_queue *nvmeq, } if (DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE) <= - 256 / sizeof(__le64)) + NVME_SMALL_POOL_SIZE / sizeof(__le64)) iod->flags |= IOD_SMALL_DESCRIPTOR; prp_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC, @@ -774,7 +776,7 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_queue *nvmeq, return BLK_STS_OK; } - if (entries <= 256 / sizeof(*sg_list)) + if (entries <= NVME_SMALL_POOL_SIZE / sizeof(*sg_list)) iod->flags |= IOD_SMALL_DESCRIPTOR; sg_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC, From de65e642644a20576af4e9d88fcbd911766c6b57 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 May 2025 17:20:44 +0200 Subject: [PATCH 38/42] nvme-pci: use struct_size for allocation struct nvme_dev This avoids open coding the variable size array arithmetics. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Kanchan Joshi Reviewed-by: Caleb Sander Mateos Reviewed-by: Leon Romanovsky --- drivers/nvme/host/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 2dbe6757e1a3..1098eb17890b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3212,8 +3212,8 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, struct nvme_dev *dev; int ret = -ENOMEM; - dev = kzalloc_node(sizeof(*dev) + nr_node_ids * - sizeof(*dev->descriptor_pools), GFP_KERNEL, node); + dev = kzalloc_node(struct_size(dev, descriptor_pools, nr_node_ids), + GFP_KERNEL, node); if (!dev) return ERR_PTR(-ENOMEM); INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); From 414a4c93f1741b820597a746ff64d9cbc69d9f64 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 May 2025 09:06:49 +0200 Subject: [PATCH 39/42] nvme-pci: derive and better document max segments limits Redefine the max segments and max integrity limits based on the limiting factors. This keeps exactly the same values for 4k PAGE_SIZE systems, but increases the number of segments for larger page size as it properly derives the scatterlist allocation based limit for them instead of assuming a 4k PAGE_SIZE. 
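The arithmetic behind the derived limits, as a standalone sketch: the 16-byte SGL descriptor size follows from the NVMe SGL descriptor format, while the 32-byte struct scatterlist size is an assumption that depends on the kernel configuration.

#include <stdio.h>

#define NVME_CTRL_PAGE_SIZE     4096    /* fixed by the driver, independent of PAGE_SIZE */
#define NVME_SMALL_POOL_SIZE    256
#define SGL_DESC_SIZE           16      /* sizeof(struct nvme_sgl_desc) */
#define SCATTERLIST_SIZE        32      /* sizeof(struct scatterlist): assumed, config dependent */

static unsigned long max_segs(unsigned long page_size)
{
        unsigned long by_sgl_descriptors = NVME_CTRL_PAGE_SIZE / SGL_DESC_SIZE;
        unsigned long by_scatterlist = page_size / SCATTERLIST_SIZE;

        return by_sgl_descriptors < by_scatterlist ? by_sgl_descriptors : by_scatterlist;
}

int main(void)
{
        /* one small-pool descriptor, minus the leading segment descriptor */
        unsigned long max_meta_segs = NVME_SMALL_POOL_SIZE / SGL_DESC_SIZE - 1;

        printf("4k pages:  max segs = %lu\n", max_segs(4096));   /* 128, unchanged */
        printf("64k pages: max segs = %lu\n", max_segs(65536));  /* 256, no longer capped at 128 */
        printf("max meta segs = %lu\n", max_meta_segs);          /* 15 */
        return 0;
}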
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch --- drivers/nvme/host/pci.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 1098eb17890b..94ed13903b1b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -35,8 +35,6 @@ #define SQ_SIZE(q) ((q)->q_depth << (q)->sqes) #define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion)) -#define SGES_PER_PAGE (NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc)) - /* Optimisation for I/Os between 4k and 128k */ #define NVME_SMALL_POOL_SIZE 256 /* * These can be higher, but we need to ensure that any command doesn't * require an sg allocation that needs more than a page of data. */ #define NVME_MAX_KB_SZ 8192 -#define NVME_MAX_SEGS 128 -#define NVME_MAX_META_SEGS 15 #define NVME_MAX_NR_DESCRIPTORS 5 +/* + * For data SGLs we support a single descriptors worth of SGL entries, but for + * now we also limit it to avoid an allocation larger than PAGE_SIZE for the + * scatterlist. + */ +#define NVME_MAX_SEGS \ + min(NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc), \ + (PAGE_SIZE / sizeof(struct scatterlist))) + +/* + * For metadata SGLs, only the small descriptor is supported, and the first + * entry is the segment descriptor, which for the data pointer sits in the SQE. + */ +#define NVME_MAX_META_SEGS \ + ((NVME_SMALL_POOL_SIZE / sizeof(struct nvme_sgl_desc)) - 1) + static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0444); @@ -3829,8 +3841,6 @@ static int __init nvme_init(void) BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2); - BUILD_BUG_ON(NVME_MAX_SEGS > SGES_PER_PAGE); - BUILD_BUG_ON(sizeof(struct scatterlist) * NVME_MAX_SEGS > PAGE_SIZE); BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_DESCRIPTORS); return pci_register_driver(&nvme_driver); From 62188639ec160eb77cb758c3a95947ebc918e76f Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Wed, 14 May 2025 18:33:15 +0530 Subject: [PATCH 40/42] nvme-multipath: introduce delayed removal of the multipath head node Currently, the multipath head node of an NVMe disk is removed immediately as soon as all paths of the disk are removed. However, this can cause issues in scenarios where: - Disk hot-removal followed by re-addition. - Transient PCIe link failures that trigger re-enumeration, temporarily removing and then restoring the disk. In these cases, removing the head node prematurely may lead to a head disk node name change upon re-addition, requiring applications to reopen their handles if they were performing I/O during the failure. To address this, introduce a delayed removal mechanism for the head disk node. During a transient failure, instead of removing the head disk node immediately, the system waits for a configurable timeout, allowing the disk to recover. If an application sends I/O during the transient failure, it is queued instead of being failed immediately. If the disk comes back online within the timeout, the queued I/Os are resubmitted to it, ensuring seamless operation. If the disk does not recover, the queued I/Os are failed and the application receives the error. This way, if the disk comes back online within the configured period, the head node remains unchanged, ensuring uninterrupted workloads without requiring applications to reopen device handles.
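A rough userspace model of the decision taken when the last path disappears (the delayed_removal_secs and queue-if-no-path names mirror the patch, the rest is invented, and the actual timer/workqueue handling is omitted):

#include <stdbool.h>
#include <stdio.h>

struct head_example {
        unsigned int delayed_removal_secs;      /* 0 = remove immediately (default) */
        bool queue_if_no_path;                  /* mirrors NVME_NSHEAD_QUEUE_IF_NO_PATH */
        bool removed;
};

/* called when the last path to the namespace goes away */
static void last_path_gone(struct head_example *head)
{
        if (head->delayed_removal_secs) {
                /* keep the node, requeue I/O, and arm a delayed removal */
                head->queue_if_no_path = true;
                printf("arming removal in %u seconds, queueing I/O meanwhile\n",
                       head->delayed_removal_secs);
        } else {
                head->removed = true;
                printf("no grace period configured: removing head node now\n");
        }
}

/* called if a path shows up again before the delayed removal fires */
static void path_returned(struct head_example *head)
{
        if (!head->removed)
                printf("path came back: cancel removal, flush queued I/O to it\n");
}

int main(void)
{
        struct head_example head = { .delayed_removal_secs = 10 };

        last_path_gone(&head);  /* transient failure starts */
        path_returned(&head);   /* link recovers within the window */
        return 0;
}

Writing a non-zero number of seconds to the new sysfs attribute described below is what arms this behaviour; if nothing returns before the timer fires, the queued I/O is failed and the head node is removed.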
A new sysfs attribute named "delayed_removal_secs" is added under the head disk block device for users who wish to configure the time for the delayed removal of the head disk node. The default value of this attribute is zero seconds, ensuring no behavior change unless explicitly configured. Link: https://lore.kernel.org/linux-nvme/Y9oGTKCFlOscbPc2@infradead.org/ Link: https://lore.kernel.org/linux-nvme/Y+1aKcQgbskA2tra@kbusch-mbp.dhcp.thefacebook.com/ Suggested-by: Keith Busch Suggested-by: Christoph Hellwig [nilay: reworked based on the original idea/POC from Christoph and Keith] Reviewed-by: Hannes Reinecke Signed-off-by: Nilay Shroff Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 8 ++- drivers/nvme/host/multipath.c | 130 +++++++++++++++++++++++++++++++--- drivers/nvme/host/nvme.h | 16 ++++- drivers/nvme/host/sysfs.c | 7 ++ 4 files changed, 147 insertions(+), 14 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index af871d268fcb..6d11f8b19633 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3743,7 +3743,7 @@ static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl, */ if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h)) continue; - if (!list_empty(&h->list) && nvme_tryget_ns_head(h)) + if (nvme_tryget_ns_head(h)) return h; } @@ -3987,7 +3987,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info) } } else { ret = -EINVAL; - if (!info->is_shared || !head->shared) { + if ((!info->is_shared || !head->shared) && + !list_empty(&head->list)) { dev_err(ctrl->device, "Duplicate unshared namespace %d\n", info->nsid); @@ -4191,7 +4192,8 @@ static void nvme_ns_remove(struct nvme_ns *ns) mutex_lock(&ns->ctrl->subsys->lock); list_del_rcu(&ns->siblings); if (list_empty(&ns->head->list)) { - list_del_init(&ns->head->entry); + if (!nvme_mpath_queue_if_no_path(ns->head)) + list_del_init(&ns->head->entry); last_path = true; } mutex_unlock(&ns->ctrl->subsys->lock); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 250f3da67cc9..2db326d6114f 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -442,7 +442,17 @@ static bool nvme_available_path(struct nvme_ns_head *head) break; } } - return false; + + /* + * If "head->delayed_removal_secs" is configured (i.e., non-zero), do + * not immediately fail I/O. Instead, requeue the I/O for the configured + * duration, anticipating that if there's a transient link failure then + * it may recover within this time window. This parameter is exported to + * userspace via sysfs, and its default value is zero. It is internally + * mapped to NVME_NSHEAD_QUEUE_IF_NO_PATH. When delayed_removal_secs is + * non-zero, this flag is set to true. When zero, the flag is cleared. + */ + return nvme_mpath_queue_if_no_path(head); } static void nvme_ns_head_submit_bio(struct bio *bio) @@ -617,6 +627,40 @@ static void nvme_requeue_work(struct work_struct *work) } } +static void nvme_remove_head(struct nvme_ns_head *head) +{ + if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { + /* + * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared + * to allow multipath to fail all I/O. 
+ */ + kblockd_schedule_work(&head->requeue_work); + + nvme_cdev_del(&head->cdev, &head->cdev_device); + synchronize_srcu(&head->srcu); + del_gendisk(head->disk); + nvme_put_ns_head(head); + } +} + +static void nvme_remove_head_work(struct work_struct *work) +{ + struct nvme_ns_head *head = container_of(to_delayed_work(work), + struct nvme_ns_head, remove_work); + bool shutdown = false; + + mutex_lock(&head->subsys->lock); + if (list_empty(&head->list)) { + list_del_init(&head->entry); + shutdown = true; + } + mutex_unlock(&head->subsys->lock); + if (shutdown) + nvme_remove_head(head); + + module_put(THIS_MODULE); +} + int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) { struct queue_limits lim; @@ -626,6 +670,8 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) spin_lock_init(&head->requeue_lock); INIT_WORK(&head->requeue_work, nvme_requeue_work); INIT_WORK(&head->partition_scan_work, nvme_partition_scan_work); + INIT_DELAYED_WORK(&head->remove_work, nvme_remove_head_work); + head->delayed_removal_secs = 0; /* * Add a multipath node if the subsystems supports multiple controllers. @@ -659,6 +705,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state); sprintf(head->disk->disk_name, "nvme%dn%d", ctrl->subsys->instance, head->instance); + nvme_tryget_ns_head(head); return 0; } @@ -1015,6 +1062,49 @@ static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr } DEVICE_ATTR_RO(numa_nodes); +static ssize_t delayed_removal_secs_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + struct nvme_ns_head *head = disk->private_data; + int ret; + + mutex_lock(&head->subsys->lock); + ret = sysfs_emit(buf, "%u\n", head->delayed_removal_secs); + mutex_unlock(&head->subsys->lock); + return ret; +} + +static ssize_t delayed_removal_secs_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + struct nvme_ns_head *head = disk->private_data; + unsigned int sec; + int ret; + + ret = kstrtouint(buf, 0, &sec); + if (ret < 0) + return ret; + + mutex_lock(&head->subsys->lock); + head->delayed_removal_secs = sec; + if (sec) + set_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags); + else + clear_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags); + mutex_unlock(&head->subsys->lock); + /* + * Ensure that update to NVME_NSHEAD_QUEUE_IF_NO_PATH is seen + * by its reader. + */ + synchronize_srcu(&head->srcu); + + return count; +} + +DEVICE_ATTR_RW(delayed_removal_secs); + static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *desc, void *data) { @@ -1138,18 +1228,38 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid) void nvme_mpath_shutdown_disk(struct nvme_ns_head *head) { - if (!head->disk) - return; - if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { - nvme_cdev_del(&head->cdev, &head->cdev_device); + bool shutdown = false; + + mutex_lock(&head->subsys->lock); + /* + * We are called when all paths have been removed, and at that point + * head->list is expected to be empty. However, nvme_remove_ns() and + * nvme_init_ns_head() can run concurrently and so if head->delayed_ + * removal_secs is configured, it is possible that by the time we reach + * this point, head->list may no longer be empty. Therefore, we recheck + * head->list here. 
If it is no longer empty then we skip enqueuing the + * delayed head removal work. + */ + if (!list_empty(&head->list)) + goto out; + + if (head->delayed_removal_secs) { /* - * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared - * to allow multipath to fail all I/O. + * Ensure that no one could remove this module while the head + * remove work is pending. */ - synchronize_srcu(&head->srcu); - kblockd_schedule_work(&head->requeue_work); - del_gendisk(head->disk); + if (!try_module_get(THIS_MODULE)) + goto out; + queue_delayed_work(nvme_wq, &head->remove_work, + head->delayed_removal_secs * HZ); + } else { + list_del_init(&head->entry); + shutdown = true; } +out: + mutex_unlock(&head->subsys->lock); + if (shutdown) + nvme_remove_head(head); } void nvme_mpath_remove_disk(struct nvme_ns_head *head) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 7aad581271c7..f20076f6f06a 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -506,7 +506,10 @@ struct nvme_ns_head { struct work_struct partition_scan_work; struct mutex lock; unsigned long flags; -#define NVME_NSHEAD_DISK_LIVE 0 + struct delayed_work remove_work; + unsigned int delayed_removal_secs; +#define NVME_NSHEAD_DISK_LIVE 0 +#define NVME_NSHEAD_QUEUE_IF_NO_PATH 1 struct nvme_ns __rcu *current_path[]; #endif }; @@ -989,12 +992,19 @@ extern struct device_attribute dev_attr_ana_grpid; extern struct device_attribute dev_attr_ana_state; extern struct device_attribute dev_attr_queue_depth; extern struct device_attribute dev_attr_numa_nodes; +extern struct device_attribute dev_attr_delayed_removal_secs; extern struct device_attribute subsys_attr_iopolicy; static inline bool nvme_disk_is_ns_head(struct gendisk *disk) { return disk->fops == &nvme_ns_head_ops; } +static inline bool nvme_mpath_queue_if_no_path(struct nvme_ns_head *head) +{ + if (test_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags)) + return true; + return false; +} #else #define multipath false static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) @@ -1082,6 +1092,10 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk) { return false; } +static inline bool nvme_mpath_queue_if_no_path(struct nvme_ns_head *head) +{ + return false; +} #endif /* CONFIG_NVME_MULTIPATH */ int nvme_ns_get_unique_id(struct nvme_ns *ns, u8 id[16], diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 6d31226f7a4f..a48d30c31d51 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -260,6 +260,7 @@ static struct attribute *nvme_ns_attrs[] = { &dev_attr_ana_state.attr, &dev_attr_queue_depth.attr, &dev_attr_numa_nodes.attr, + &dev_attr_delayed_removal_secs.attr, #endif &dev_attr_io_passthru_err_log_enabled.attr, NULL, @@ -296,6 +297,12 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, if (nvme_disk_is_ns_head(dev_to_disk(dev))) return 0; } + if (a == &dev_attr_delayed_removal_secs.attr) { + struct gendisk *disk = dev_to_disk(dev); + + if (!nvme_disk_is_ns_head(disk)) + return 0; + } #endif return a->mode; } From 737af5f0011a400c79c7fa7bce2f5bcb69be35d7 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Wed, 14 May 2025 18:33:16 +0530 Subject: [PATCH 41/42] nvme: introduce multipath_always_on module param Currently, a multipath head disk node is not created for single- ported NVMe adapters or private namespaces with non-unique NSID. However, creating a head node in these cases can help transparently handle transient PCIe link failures. 
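The patch below adds an nvme_core.multipath_always_on module parameter for this purpose, described in the following paragraphs. As a rough userspace model of the head-node creation decision it leads to (illustrative only; just the parameter names and the multi-controller/unique-NSID checks mirror the driver logic):

#include <stdbool.h>
#include <stdio.h>

static bool multipath = true;           /* existing nvme_core.multipath */
static bool multipath_always_on;        /* new parameter, off by default */

struct ns_info {
        bool multi_ctrl;        /* subsystem reports multi-controller support (CMIC) */
        bool unique_nsid;       /* NSID is usable as a unique identity */
};

static bool want_head_node(const struct ns_info *info)
{
        if (!multipath_always_on) {
                /* old rule: only multi-controller subsystems with multipath on */
                if (!info->multi_ctrl || !multipath)
                        return false;
        }
        /* in either mode, a non-unique NSID never gets a head node */
        return info->unique_nsid;
}

int main(void)
{
        struct ns_info single_ported = { .multi_ctrl = false, .unique_nsid = true };

        printf("default:   head node? %d\n", want_head_node(&single_ported)); /* 0 */
        multipath_always_on = true;
        multipath = true;       /* setting always_on implies multipath */
        printf("always_on: head node? %d\n", want_head_node(&single_ported)); /* 1 */
        return 0;
}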
Without a head node, features like delayed removal cannot be leveraged, making it difficult to tolerate such link failures. To address this, this commit introduces the nvme_core module parameter multipath_always_on. When multipath_always_on is set to true, it forces the creation of a multipath head node regardless of the NVMe disk or namespace type. This option thus allows the delayed removal of the head node to be used even for single-ported NVMe disks and private namespaces with a unique NSID, helping to transparently handle transient PCIe link failures. By default, multipath_always_on is set to false, preserving the existing behavior. Setting it to true enables improved fault tolerance in PCIe setups. Note that enabling this option also implicitly enables nvme_core.multipath. Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Nilay Shroff Signed-off-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 72 ++++++++++++++++++++++++++++++++--- 1 file changed, 66 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 2db326d6114f..38b420fb63f0 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -10,10 +10,61 @@ #include "nvme.h" bool multipath = true; -module_param(multipath, bool, 0444); +static bool multipath_always_on; + +static int multipath_param_set(const char *val, const struct kernel_param *kp) +{ + int ret; + bool *arg = kp->arg; + + ret = param_set_bool(val, kp); + if (ret) + return ret; + + if (multipath_always_on && !*arg) { + pr_err("Can't disable multipath when multipath_always_on is configured.\n"); + *arg = true; + return -EINVAL; + } + + return 0; +} + +static const struct kernel_param_ops multipath_param_ops = { + .set = multipath_param_set, + .get = param_get_bool, +}; + +module_param_cb(multipath, &multipath_param_ops, &multipath, 0444); MODULE_PARM_DESC(multipath, "turn on native support for multiple controllers per subsystem"); +static int multipath_always_on_set(const char *val, + const struct kernel_param *kp) +{ + int ret; + bool *arg = kp->arg; + + ret = param_set_bool(val, kp); + if (ret < 0) + return ret; + + if (*arg) + multipath = true; + + return 0; +} + +static const struct kernel_param_ops multipath_always_on_ops = { + .set = multipath_always_on_set, + .get = param_get_bool, +}; + +module_param_cb(multipath_always_on, &multipath_always_on_ops, + &multipath_always_on, 0444); +MODULE_PARM_DESC(multipath_always_on, + "create multipath node always except for private namespace with non-unique nsid; note that this also implicitly enables native multipath support"); + static const char *nvme_iopolicy_names[] = { [NVME_IOPOLICY_NUMA] = "numa", [NVME_IOPOLICY_RR] = "round-robin", @@ -674,12 +725,21 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) head->delayed_removal_secs = 0; /* - * Add a multipath node if the subsystems supports multiple controllers. - * We also do this for private namespaces as the namespace sharing flag - * could change after a rescan. + * If "multipath_always_on" is enabled, a multipath node is added + * regardless of whether the disk is single/multi ported, and whether + * the namespace is shared or private. If "multipath_always_on" is not + * enabled, a multipath node is added only if the subsystem supports + * multiple controllers and the "multipath" option is configured. In + * either case, for private namespaces, we ensure that the NSID is + * unique. 
*/ - if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || - !nvme_is_unique_nsid(ctrl, head) || !multipath) + if (!multipath_always_on) { + if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || + !multipath) + return 0; + } + + if (!nvme_is_unique_nsid(ctrl, head)) return 0; blk_set_stacking_limits(&lim); From 9e221d8cf90b8599a6a3d62a1ebb712468f42a35 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Wed, 14 May 2025 18:33:17 +0530 Subject: [PATCH 42/42] nvme: rename nvme_mpath_shutdown_disk to nvme_mpath_remove_disk In the NVMe context, the term "shutdown" has a specific technical meaning. To avoid confusion, this commit renames the nvme_mpath_ shutdown_disk function to nvme_mpath_remove_disk to better reflect its purpose (i.e. removing the disk from the system). However, nvme_mpath_remove_disk was already in use, and its functionality is related to releasing or putting the head node disk. To resolve this naming conflict and improve clarity, the existing nvme_mpath_ remove_disk function is also renamed to nvme_mpath_put_disk. This renaming improves code readability and better aligns function names with their actual roles. Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Nilay Shroff Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 4 ++-- drivers/nvme/host/multipath.c | 16 ++++++++-------- drivers/nvme/host/nvme.h | 8 ++++---- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6d11f8b19633..cdb806b2c89c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -668,7 +668,7 @@ static void nvme_free_ns_head(struct kref *ref) struct nvme_ns_head *head = container_of(ref, struct nvme_ns_head, ref); - nvme_mpath_remove_disk(head); + nvme_mpath_put_disk(head); ida_free(&head->subsys->ns_ida, head->instance); cleanup_srcu_struct(&head->srcu); nvme_put_subsystem(head->subsys); @@ -4214,7 +4214,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) synchronize_srcu(&ns->ctrl->srcu); if (last_path) - nvme_mpath_shutdown_disk(ns->head); + nvme_mpath_remove_disk(ns->head); nvme_put_ns(ns); } diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 38b420fb63f0..3fdbbe1fdbc4 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -698,15 +698,15 @@ static void nvme_remove_head_work(struct work_struct *work) { struct nvme_ns_head *head = container_of(to_delayed_work(work), struct nvme_ns_head, remove_work); - bool shutdown = false; + bool remove = false; mutex_lock(&head->subsys->lock); if (list_empty(&head->list)) { list_del_init(&head->entry); - shutdown = true; + remove = true; } mutex_unlock(&head->subsys->lock); - if (shutdown) + if (remove) nvme_remove_head(head); module_put(THIS_MODULE); @@ -1286,9 +1286,9 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid) #endif } -void nvme_mpath_shutdown_disk(struct nvme_ns_head *head) +void nvme_mpath_remove_disk(struct nvme_ns_head *head) { - bool shutdown = false; + bool remove = false; mutex_lock(&head->subsys->lock); /* @@ -1314,15 +1314,15 @@ void nvme_mpath_shutdown_disk(struct nvme_ns_head *head) head->delayed_removal_secs * HZ); } else { list_del_init(&head->entry); - shutdown = true; + remove = true; } out: mutex_unlock(&head->subsys->lock); - if (shutdown) + if (remove) nvme_remove_head(head); } -void nvme_mpath_remove_disk(struct nvme_ns_head *head) +void nvme_mpath_put_disk(struct nvme_ns_head *head) { if (!head->disk) return; 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index f20076f6f06a..1de1b843afa5 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -966,7 +966,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); void nvme_mpath_add_sysfs_link(struct nvme_ns_head *ns); void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns); void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid); -void nvme_mpath_remove_disk(struct nvme_ns_head *head); +void nvme_mpath_put_disk(struct nvme_ns_head *head); int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id); void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl); void nvme_mpath_update(struct nvme_ctrl *ctrl); @@ -975,7 +975,7 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl); bool nvme_mpath_clear_current_path(struct nvme_ns *ns); void nvme_mpath_revalidate_paths(struct nvme_ns *ns); void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl); -void nvme_mpath_shutdown_disk(struct nvme_ns_head *head); +void nvme_mpath_remove_disk(struct nvme_ns_head *head); void nvme_mpath_start_request(struct request *rq); void nvme_mpath_end_request(struct request *rq); @@ -1025,7 +1025,7 @@ static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, static inline void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid) { } -static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) +static inline void nvme_mpath_put_disk(struct nvme_ns_head *head) { } static inline void nvme_mpath_add_sysfs_link(struct nvme_ns *ns) @@ -1044,7 +1044,7 @@ static inline void nvme_mpath_revalidate_paths(struct nvme_ns *ns) static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) { } -static inline void nvme_mpath_shutdown_disk(struct nvme_ns_head *head) +static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) { } static inline void nvme_trace_bio_complete(struct request *req)