From 25802f3ab7a589d2b5d9e1266acffad263d9893e Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 23 Jan 2024 16:40:25 +0200 Subject: [PATCH 01/31] nvme-rdma: move NVME_RDMA_IP_PORT from common file The correct place for this definition is the nvme rdma header file and not the common nvme header file. Reviewed-by: Christoph Hellwig Reviewed-by: Israel Rukshin Reviewed-by: Sagi Grimberg Signed-off-by: Max Gurtovoy Signed-off-by: Keith Busch --- include/linux/nvme-rdma.h | 2 ++ include/linux/nvme.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h index 4dd7e6fe92fb0..146dd2223a5fe 100644 --- a/include/linux/nvme-rdma.h +++ b/include/linux/nvme-rdma.h @@ -6,6 +6,8 @@ #ifndef _LINUX_NVME_RDMA_H #define _LINUX_NVME_RDMA_H +#define NVME_RDMA_IP_PORT 4420 + #define NVME_RDMA_MAX_QUEUE_SIZE 128 enum nvme_rdma_cm_fmt { diff --git a/include/linux/nvme.h b/include/linux/nvme.h index bc605ec4a3fd0..ce0c1143c7e4f 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -23,8 +23,6 @@ #define NVME_DISC_SUBSYS_NAME "nqn.2014-08.org.nvmexpress.discovery" -#define NVME_RDMA_IP_PORT 4420 - #define NVME_NSID_ALL 0xffffffff enum nvme_subsys_type { From 143667ee9a9cb88b6da7fe6a3d0f32bc33d75d71 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 23 Jan 2024 16:40:26 +0200 Subject: [PATCH 02/31] nvmet: compare mqes and sqsize only for IO SQ According to the NVMe Spec: " MQES: This field indicates the maximum individual queue size that the controller supports. For NVMe over PCIe implementations, this value applies to the I/O Submission Queues and I/O Completion Queues that the host creates. For NVMe over Fabrics implementations, this value applies to only the I/O Submission Queues that the host creates. " Align the target code to compare mqes and sqsize as mentioned in the NVMe Spec. Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/target/fabrics-cmd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index d8da840a1c0ed..4d014c5d0b6a7 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -157,7 +157,8 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; } - if (sqsize > mqes) { + /* for fabrics, this value applies to only the I/O Submission Queues */ + if (qid && sqsize > mqes) { pr_warn("sqsize %u is larger than MQES supported %u cntlid %d\n", sqsize, mqes, ctrl->cntlid); req->error_loc = offsetof(struct nvmf_connect_command, sqsize); From 63e8fd6240f08ddf3cffec73dfb90f9594a3a0c6 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 23 Jan 2024 16:40:27 +0200 Subject: [PATCH 03/31] nvmet: set maxcmd to be per controller This is a preparation for having a dynamic configuration of max queue size for a controller. Make sure that the maxcmd field stays the same as the MQES (+1) value as we do today. Reviewed-by: Christoph Hellwig Reviewed-by: Israel Rukshin Reviewed-by: Sagi Grimberg Signed-off-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/target/admin-cmd.c | 2 +- drivers/nvme/target/discovery.c | 2 +- drivers/nvme/target/nvmet.h | 2 +- drivers/nvme/target/passthru.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 39cb570f833dd..f5b7054a4a05e 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -428,7 +428,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) id->cqes = (0x4 << 4) | 0x4; /* no enforcement soft-limit for maxcmd - pick arbitrary high value */ - id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); + id->maxcmd = cpu_to_le16(NVMET_MAX_CMD(ctrl)); id->nn = cpu_to_le32(NVMET_MAX_NAMESPACES); id->mnan = cpu_to_le32(NVMET_MAX_NAMESPACES); diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 68e82ccc0e4e3..ce54da8c6b366 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -282,7 +282,7 @@ static void nvmet_execute_disc_identify(struct nvmet_req *req) id->lpa = (1 << 2); /* no enforcement soft-limit for maxcmd - pick arbitrary high value */ - id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); + id->maxcmd = cpu_to_le16(NVMET_MAX_CMD(ctrl)); id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ if (ctrl->ops->flags & NVMF_KEYED_SGLS) diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 6c8acebe1a1a6..144aca2fa6ad6 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -545,7 +545,7 @@ void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, #define NVMET_QUEUE_SIZE 1024 #define NVMET_NR_QUEUES 128 -#define NVMET_MAX_CMD NVMET_QUEUE_SIZE +#define NVMET_MAX_CMD(ctrl) (NVME_CAP_MQES(ctrl->cap) + 1) /* * Nice round number that makes a list of nsids fit into a page. diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index f2d963e1fe94e..bb4a69d538fd1 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -132,7 +132,7 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req) id->sqes = min_t(__u8, ((0x6 << 4) | 0x6), id->sqes); id->cqes = min_t(__u8, ((0x4 << 4) | 0x4), id->cqes); - id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); + id->maxcmd = cpu_to_le16(NVMET_MAX_CMD(ctrl)); /* don't support fuse commands */ id->fuses = 0; From c82c370dcabefb60eeeb6d4b364e0b88951c3546 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 23 Jan 2024 16:40:28 +0200 Subject: [PATCH 04/31] nvmet: set ctrl pi_support cap before initializing cap reg This is a preparation for setting the maximal queue size of a controller that supports PI. Reviewed-by: Christoph Hellwig Reviewed-by: Israel Rukshin Reviewed-by: Sagi Grimberg Signed-off-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/target/core.c | 1 + drivers/nvme/target/fabrics-cmd.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 8658e9c08534d..5d50f731c326a 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1411,6 +1411,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, kref_init(&ctrl->ref); ctrl->subsys = subsys; + ctrl->pi_support = ctrl->port->pi_enable && ctrl->subsys->pi_support; nvmet_init_cap(ctrl); WRITE_ONCE(ctrl->aen_enabled, NVMET_AEN_CFG_OPTIONAL); diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 4d014c5d0b6a7..08e9c6b6f5517 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -252,8 +252,6 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) if (status) goto out; - ctrl->pi_support = ctrl->port->pi_enable && ctrl->subsys->pi_support; - uuid_copy(&ctrl->hostid, &d->hostid); ret = nvmet_setup_auth(ctrl); From 36144964062b8676ee64281852de2a2c1b193aca Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 23 Jan 2024 16:40:29 +0200 Subject: [PATCH 05/31] nvme-rdma: introduce NVME_RDMA_MAX_METADATA_QUEUE_SIZE definition This definition will be used by controllers that are configured with metadata support. For now, both regular and metadata controllers have the same maximal queue size but later commit will increase the maximal queue size for regular RDMA controllers to 256. We'll keep the maximal queue size for metadata controllers to be 128 since there are more resources that are needed for metadata operations and 128 is the optimal size found for metadata controllers base on testing. Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Israel Rukshin Signed-off-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/target/rdma.c | 2 ++ include/linux/nvme-rdma.h | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 3a0f2c170f4c1..b3f8416ec8030 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -2015,6 +2015,8 @@ static u8 nvmet_rdma_get_mdts(const struct nvmet_ctrl *ctrl) static u16 nvmet_rdma_get_max_queue_size(const struct nvmet_ctrl *ctrl) { + if (ctrl->pi_support) + return NVME_RDMA_MAX_METADATA_QUEUE_SIZE; return NVME_RDMA_MAX_QUEUE_SIZE; } diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h index 146dd2223a5fe..d0b9941911a1c 100644 --- a/include/linux/nvme-rdma.h +++ b/include/linux/nvme-rdma.h @@ -8,7 +8,8 @@ #define NVME_RDMA_IP_PORT 4420 -#define NVME_RDMA_MAX_QUEUE_SIZE 128 +#define NVME_RDMA_MAX_QUEUE_SIZE 128 +#define NVME_RDMA_MAX_METADATA_QUEUE_SIZE 128 enum nvme_rdma_cm_fmt { NVME_RDMA_CM_FMT_1_0 = 0x0, From ad178ba9d90a58229f2d4ef57eb0d5eb37acbed9 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 23 Jan 2024 16:40:30 +0200 Subject: [PATCH 06/31] nvme-rdma: clamp queue size according to ctrl cap If a controller is configured with metadata support, clamp the maximal queue size to be 128 since there are more resources that are needed for metadata operations. Otherwise, clamp it to 256. Reviewed-by: Sagi Grimberg Reviewed-by: Israel Rukshin Reviewed-by: Christoph Hellwig Signed-off-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/host/rdma.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 20fdd40b1879f..366f0bb4ebfc1 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1006,6 +1006,7 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) { int ret; bool changed; + u16 max_queue_size; ret = nvme_rdma_configure_admin_queue(ctrl, new); if (ret) @@ -1030,11 +1031,16 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1); } - if (ctrl->ctrl.sqsize + 1 > NVME_RDMA_MAX_QUEUE_SIZE) { + if (ctrl->ctrl.max_integrity_segments) + max_queue_size = NVME_RDMA_MAX_METADATA_QUEUE_SIZE; + else + max_queue_size = NVME_RDMA_MAX_QUEUE_SIZE; + + if (ctrl->ctrl.sqsize + 1 > max_queue_size) { dev_warn(ctrl->ctrl.device, - "ctrl sqsize %u > max queue size %u, clamping down\n", - ctrl->ctrl.sqsize + 1, NVME_RDMA_MAX_QUEUE_SIZE); - ctrl->ctrl.sqsize = NVME_RDMA_MAX_QUEUE_SIZE - 1; + "ctrl sqsize %u > max queue size %u, clamping down\n", + ctrl->ctrl.sqsize + 1, max_queue_size); + ctrl->ctrl.sqsize = max_queue_size - 1; } if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) { From ca2b221d89a81849a2f4a25d024a3d527a210ab6 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 23 Jan 2024 16:40:31 +0200 Subject: [PATCH 07/31] nvmet: introduce new max queue size configuration entry Using this port configuration, one will be able to set the maximal queue size to be used for any controller that will be associated to the configured port. The default value stayed 1024 but each transport will be able to set the its own values before enabling the port. Introduce lower limit of 16 for minimal queue depth (same as we use in the host fabrics drivers). Reviewed-by: Christoph Hellwig Reviewed-by: Israel Rukshin Reviewed-by: Sagi Grimberg Reviewed-by: Guixin Liu Signed-off-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/target/configfs.c | 28 ++++++++++++++++++++++++++++ drivers/nvme/target/core.c | 17 +++++++++++++++-- drivers/nvme/target/nvmet.h | 4 +++- 3 files changed, 46 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 2482a0db25043..77a6e817b3159 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -273,6 +273,32 @@ static ssize_t nvmet_param_inline_data_size_store(struct config_item *item, CONFIGFS_ATTR(nvmet_, param_inline_data_size); +static ssize_t nvmet_param_max_queue_size_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%d\n", port->max_queue_size); +} + +static ssize_t nvmet_param_max_queue_size_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + int ret; + + if (nvmet_is_port_enabled(port, __func__)) + return -EACCES; + ret = kstrtoint(page, 0, &port->max_queue_size); + if (ret) { + pr_err("Invalid value '%s' for max_queue_size\n", page); + return -EINVAL; + } + return count; +} + +CONFIGFS_ATTR(nvmet_, param_max_queue_size); + #ifdef CONFIG_BLK_DEV_INTEGRITY static ssize_t nvmet_param_pi_enable_show(struct config_item *item, char *page) @@ -1859,6 +1885,7 @@ static struct configfs_attribute *nvmet_port_attrs[] = { &nvmet_attr_addr_trtype, &nvmet_attr_addr_tsas, &nvmet_attr_param_inline_data_size, + &nvmet_attr_param_max_queue_size, #ifdef CONFIG_BLK_DEV_INTEGRITY &nvmet_attr_param_pi_enable, #endif @@ -1917,6 +1944,7 @@ static struct config_group *nvmet_ports_make(struct config_group *group, INIT_LIST_HEAD(&port->subsystems); INIT_LIST_HEAD(&port->referrals); port->inline_data_size = -1; /* < 0 == let the transport choose */ + port->max_queue_size = -1; /* < 0 == let the transport choose */ port->disc_addr.portid = cpu_to_le16(portid); port->disc_addr.adrfam = NVMF_ADDR_FAMILY_MAX; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 5d50f731c326a..6bbe4df0166ca 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -358,6 +358,18 @@ int nvmet_enable_port(struct nvmet_port *port) if (port->inline_data_size < 0) port->inline_data_size = 0; + /* + * If the transport didn't set the max_queue_size properly, then clamp + * it to the target limits. Also set default values in case the + * transport didn't set it at all. + */ + if (port->max_queue_size < 0) + port->max_queue_size = NVMET_MAX_QUEUE_SIZE; + else + port->max_queue_size = clamp_t(int, port->max_queue_size, + NVMET_MIN_QUEUE_SIZE, + NVMET_MAX_QUEUE_SIZE); + port->enabled = true; port->tr_ops = ops; return 0; @@ -1223,9 +1235,10 @@ static void nvmet_init_cap(struct nvmet_ctrl *ctrl) ctrl->cap |= (15ULL << 24); /* maximum queue entries supported: */ if (ctrl->ops->get_max_queue_size) - ctrl->cap |= ctrl->ops->get_max_queue_size(ctrl) - 1; + ctrl->cap |= min_t(u16, ctrl->ops->get_max_queue_size(ctrl), + ctrl->port->max_queue_size) - 1; else - ctrl->cap |= NVMET_QUEUE_SIZE - 1; + ctrl->cap |= ctrl->port->max_queue_size - 1; if (nvmet_is_passthru_subsys(ctrl->subsys)) nvmet_passthrough_override_cap(ctrl); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 144aca2fa6ad6..7c6e7e65b0329 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -163,6 +163,7 @@ struct nvmet_port { void *priv; bool enabled; int inline_data_size; + int max_queue_size; const struct nvmet_fabrics_ops *tr_ops; bool pi_enable; }; @@ -543,7 +544,8 @@ void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys, void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, u8 event_info, u8 log_page); -#define NVMET_QUEUE_SIZE 1024 +#define NVMET_MIN_QUEUE_SIZE 16 +#define NVMET_MAX_QUEUE_SIZE 1024 #define NVMET_NR_QUEUES 128 #define NVMET_MAX_CMD(ctrl) (NVME_CAP_MQES(ctrl->cap) + 1) From f096ba3286f5e773c496cf81667d01f2e8a2a37b Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 23 Jan 2024 16:40:32 +0200 Subject: [PATCH 08/31] nvmet-rdma: set max_queue_size for RDMA transport A new port configuration was added to set max_queue_size. Clamp user configuration to RDMA transport limits. Increase the maximal queue size of RDMA controllers from 128 to 256 (the default size stays 128 same as before). Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Israel Rukshin Signed-off-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/target/rdma.c | 8 ++++++++ include/linux/nvme-rdma.h | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index b3f8416ec8030..f2bb9d95ecf4b 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1956,6 +1956,14 @@ static int nvmet_rdma_add_port(struct nvmet_port *nport) nport->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE; } + if (nport->max_queue_size < 0) { + nport->max_queue_size = NVME_RDMA_DEFAULT_QUEUE_SIZE; + } else if (nport->max_queue_size > NVME_RDMA_MAX_QUEUE_SIZE) { + pr_warn("max_queue_size %u is too large, reducing to %u\n", + nport->max_queue_size, NVME_RDMA_MAX_QUEUE_SIZE); + nport->max_queue_size = NVME_RDMA_MAX_QUEUE_SIZE; + } + ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr, nport->disc_addr.trsvcid, &port->addr); if (ret) { diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h index d0b9941911a1c..eb2f04d636c89 100644 --- a/include/linux/nvme-rdma.h +++ b/include/linux/nvme-rdma.h @@ -8,8 +8,9 @@ #define NVME_RDMA_IP_PORT 4420 -#define NVME_RDMA_MAX_QUEUE_SIZE 128 +#define NVME_RDMA_MAX_QUEUE_SIZE 256 #define NVME_RDMA_MAX_METADATA_QUEUE_SIZE 128 +#define NVME_RDMA_DEFAULT_QUEUE_SIZE 128 enum nvme_rdma_cm_fmt { NVME_RDMA_CM_FMT_1_0 = 0x0, From 4999568184e5d68903e6d0e49609979cd6ef01d7 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 28 Feb 2024 10:37:59 +0800 Subject: [PATCH 09/31] nvme-fabrics: check max outstanding commands Maxcmd is mandatory for fabrics, check it early to identify the root cause instead of waiting for it to propagate to "sqsize" and "allocing queue". By the way, change nvme_check_ctrl_fabric_info() to nvmf_validate_identify_ctrl(). Reviewed-by: Chaitanya Kulkarni Signed-off-by: Guixin Liu Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index eed3e22e24d91..cb13f7c79eaf9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3119,6 +3119,11 @@ static int nvme_check_ctrl_fabric_info(struct nvme_ctrl *ctrl, struct nvme_id_ct return -EINVAL; } + if (!ctrl->maxcmd) { + dev_err(ctrl->device, "Maximum outstanding commands is 0\n"); + return -EINVAL; + } + return 0; } From 152694c82950a0930533dbe972b1f4fc11b95e98 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Mar 2024 07:04:45 -0700 Subject: [PATCH 10/31] nvme: set max_hw_sectors unconditionally All transports set a max_hw_sectors value in the nvme_ctrl, so make the code using it unconditional and clean it up using a little helper. Signed-off-by: Christoph Hellwig Reviewed-by: Max Gurtovoy Reviewed-by: John Garry Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index cb13f7c79eaf9..6ae9aedf7bc27 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1944,19 +1944,19 @@ static int nvme_configure_metadata(struct nvme_ctrl *ctrl, return 0; } +static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl) +{ + return ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> SECTOR_SHIFT) + 1; +} + static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, struct request_queue *q) { bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT; - if (ctrl->max_hw_sectors) { - u32 max_segments = - (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1; - - max_segments = min_not_zero(max_segments, ctrl->max_segments); - blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); - blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); - } + blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); + blk_queue_max_segments(q, min_t(u32, USHRT_MAX, + min_not_zero(nvme_max_drv_segments(ctrl), ctrl->max_segments))); blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1); blk_queue_dma_alignment(q, 3); blk_queue_write_cache(q, vwc, vwc); From 63dfa1004322d596417f23da43cdc43cf6298c71 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Mar 2024 07:04:46 -0700 Subject: [PATCH 11/31] nvme: move NVME_QUIRK_DEALLOCATE_ZEROES out of nvme_config_discard Move the handling of the NVME_QUIRK_DEALLOCATE_ZEROES quirk out of nvme_config_discard so that it is combined with the normal write_zeroes limit handling. Signed-off-by: Christoph Hellwig Reviewed-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6ae9aedf7bc27..a6c0b2f4cf796 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1816,9 +1816,6 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk, else blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES); queue->limits.discard_granularity = queue_logical_block_size(queue); - - if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) - blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); } static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) @@ -2029,8 +2026,12 @@ static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk, set_capacity_and_notify(disk, capacity); nvme_config_discard(ctrl, disk, head); - blk_queue_max_write_zeroes_sectors(disk->queue, - ctrl->max_zeroes_sectors); + + if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) + blk_queue_max_write_zeroes_sectors(disk->queue, UINT_MAX); + else + blk_queue_max_write_zeroes_sectors(disk->queue, + ctrl->max_zeroes_sectors); } static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info) From 1b2f5d5d288080ea10b4e2ed595c0dfb11557c17 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Mar 2024 07:04:47 -0700 Subject: [PATCH 12/31] nvme: remove nvme_revalidate_zones Handle setting the zone size / chunk_sectors and max_append_sectors limits together with the other ZNS limits, and just open code the call to blk_revalidate_zones in the current place. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/nvme.h | 1 - drivers/nvme/host/zns.c | 12 ++---------- 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a6c0b2f4cf796..2817eea07e967 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2154,7 +2154,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, blk_mq_unfreeze_queue(ns->disk->queue); if (blk_queue_is_zoned(ns->queue)) { - ret = nvme_revalidate_zones(ns); + ret = blk_revalidate_disk_zones(ns->disk, NULL); if (ret && !nvme_first_scan(ns->disk)) goto out; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 4a484fc8a073c..01e8bae788658 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1036,7 +1036,6 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk) } #endif /* CONFIG_NVME_MULTIPATH */ -int nvme_revalidate_zones(struct nvme_ns *ns); int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); #ifdef CONFIG_BLK_DEV_ZONED diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 499bbb0eee8d0..852261d789136 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -7,16 +7,6 @@ #include #include "nvme.h" -int nvme_revalidate_zones(struct nvme_ns *ns) -{ - struct request_queue *q = ns->queue; - - blk_queue_chunk_sectors(q, ns->head->zsze); - blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append); - - return blk_revalidate_disk_zones(ns->disk, NULL); -} - static int nvme_set_max_append(struct nvme_ctrl *ctrl) { struct nvme_command c = { }; @@ -113,6 +103,8 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); disk_set_max_open_zones(ns->disk, le32_to_cpu(id->mor) + 1); disk_set_max_active_zones(ns->disk, le32_to_cpu(id->mar) + 1); + blk_queue_chunk_sectors(ns->queue, ns->head->zsze); + blk_queue_max_zone_append_sectors(ns->queue, ns->ctrl->max_zone_append); free_data: kfree(id); return status; From f404dd928b6667b383e684f2bd8cce507e031481 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Mar 2024 07:04:48 -0700 Subject: [PATCH 13/31] nvme: move max_integrity_segments handling out of nvme_init_integrity max_integrity_segments is just a hardware limit and doesn't need to be in nvme_init_integrity with the PI setup. Signed-off-by: Christoph Hellwig Reviewed-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2817eea07e967..c4a268d91796f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1724,8 +1724,7 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) } #ifdef CONFIG_BLK_DEV_INTEGRITY -static void nvme_init_integrity(struct gendisk *disk, - struct nvme_ns_head *head, u32 max_integrity_segments) +static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head) { struct blk_integrity integrity = { }; @@ -1773,11 +1772,9 @@ static void nvme_init_integrity(struct gendisk *disk, integrity.tuple_size = head->ms; integrity.pi_offset = head->pi_offset; blk_integrity_register(disk, &integrity); - blk_queue_max_integrity_segments(disk->queue, max_integrity_segments); } #else -static void nvme_init_integrity(struct gendisk *disk, - struct nvme_ns_head *head, u32 max_integrity_segments) +static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head) { } #endif /* CONFIG_BLK_DEV_INTEGRITY */ @@ -1954,6 +1951,7 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); blk_queue_max_segments(q, min_t(u32, USHRT_MAX, min_not_zero(nvme_max_drv_segments(ctrl), ctrl->max_segments))); + blk_queue_max_integrity_segments(q, ctrl->max_integrity_segments); blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1); blk_queue_dma_alignment(q, 3); blk_queue_write_cache(q, vwc, vwc); @@ -2017,8 +2015,7 @@ static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk, if (head->ms) { if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && (head->features & NVME_NS_METADATA_SUPPORTED)) - nvme_init_integrity(disk, head, - ctrl->max_integrity_segments); + nvme_init_integrity(disk, head); else if (!nvme_ns_has_pi(head)) capacity = 0; } From f467b48e38a60c64d73619145247c550d8edf82f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Mar 2024 07:04:49 -0700 Subject: [PATCH 14/31] nvme: cleanup the nvme_init_integrity calling conventions Handle the no metadata support case in nvme_init_integrity as well to simplify the calling convention and prepare for future changes in the area. Signed-off-by: Christoph Hellwig Reviewed-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c4a268d91796f..1ecddb6f5ad9e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1723,11 +1723,21 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) return 0; } -#ifdef CONFIG_BLK_DEV_INTEGRITY -static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head) +static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head) { struct blk_integrity integrity = { }; + if (!head->ms) + return true; + + /* + * PI can always be supported as we can ask the controller to simply + * insert/strip it, which is not possible for other kinds of metadata. + */ + if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) || + !(head->features & NVME_NS_METADATA_SUPPORTED)) + return nvme_ns_has_pi(head); + switch (head->pi_type) { case NVME_NS_DPS_PI_TYPE3: switch (head->guard_type) { @@ -1772,12 +1782,8 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head) integrity.tuple_size = head->ms; integrity.pi_offset = head->pi_offset; blk_integrity_register(disk, &integrity); + return true; } -#else -static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head) -{ -} -#endif /* CONFIG_BLK_DEV_INTEGRITY */ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk, struct nvme_ns_head *head) @@ -2012,13 +2018,8 @@ static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk, * I/O to namespaces with metadata except when the namespace supports * PI, as it can strip/insert in that case. */ - if (head->ms) { - if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && - (head->features & NVME_NS_METADATA_SUPPORTED)) - nvme_init_integrity(disk, head); - else if (!nvme_ns_has_pi(head)) - capacity = 0; - } + if (!nvme_init_integrity(disk, head)) + capacity = 0; set_capacity_and_notify(disk, capacity); From 414c62e2ce5d93bfbdc12048530075dcea02cad8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Mar 2024 07:04:50 -0700 Subject: [PATCH 15/31] nvme: move blk_integrity_unregister into nvme_init_integrity Move uneregistering the existing integrity profile into the helper dealing with all the other integrity / metadata setup. Signed-off-by: Christoph Hellwig Reviewed-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1ecddb6f5ad9e..40fab7c47eae2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1727,6 +1727,8 @@ static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head) { struct blk_integrity integrity = { }; + blk_integrity_unregister(disk); + if (!head->ms) return true; @@ -1980,8 +1982,6 @@ static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk, bs = (1 << 9); } - blk_integrity_unregister(disk); - atomic_bs = phys_bs = bs; if (id->nabo == 0) { /* From 8f03cfa117e06bd2d3ba7ed8bba70a3dda310cae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Mar 2024 07:04:51 -0700 Subject: [PATCH 16/31] nvme: don't use nvme_update_disk_info for the multipath disk Currently nvme_update_ns_info_block calls nvme_update_disk_info both for the namespace attached disk, and the multipath one (if it exists). This is very different from how other stacking drivers work, and leads to a lot of complexity. Switch to setting the disk capacity and initializing the integrity profile, and let blk_stack_limits which already is called just below deal with updating the other limits. Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 40fab7c47eae2..d356f3fa2cf8e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2159,7 +2159,8 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, if (nvme_ns_head_multipath(ns->head)) { blk_mq_freeze_queue(ns->head->disk->queue); - nvme_update_disk_info(ns->ctrl, ns->head->disk, ns->head, id); + nvme_init_integrity(ns->head->disk, ns->head); + set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk)); set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info)); nvme_mpath_revalidate_paths(ns); blk_stack_limits(&ns->head->disk->queue->limits, From a5b1cd61820e88d90454ad87154856a7a20aafbf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Mar 2024 07:04:52 -0700 Subject: [PATCH 17/31] nvme: move a few things out of nvme_update_disk_info Move setting up the integrity profile and setting the disk capacity out of nvme_update_disk_info to get nvme_update_disk_info into a shape where it just sets queue_limits eventually. Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 46 +++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d356f3fa2cf8e..63c2b581f7859 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1965,12 +1965,13 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, blk_queue_write_cache(q, vwc, vwc); } -static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk, - struct nvme_ns_head *head, struct nvme_id_ns *id) +static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id) { - sector_t capacity = nvme_lba_to_sect(head, le64_to_cpu(id->nsze)); + struct gendisk *disk = ns->disk; + struct nvme_ns_head *head = ns->head; u32 bs = 1U << head->lba_shift; u32 atomic_bs, phys_bs, io_opt = 0; + bool valid = true; /* * The block layer can't support LBA sizes larger than the page size @@ -1978,8 +1979,8 @@ static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk, * allow block I/O. */ if (head->lba_shift > PAGE_SHIFT || head->lba_shift < SECTOR_SHIFT) { - capacity = 0; bs = (1 << 9); + valid = false; } atomic_bs = phys_bs = bs; @@ -1992,7 +1993,7 @@ static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk, if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; else - atomic_bs = (1 + ctrl->subsys->awupf) * bs; + atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs; } if (id->nsfeat & NVME_NS_FEAT_IO_OPT) { @@ -2012,24 +2013,14 @@ static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk, blk_queue_io_min(disk->queue, phys_bs); blk_queue_io_opt(disk->queue, io_opt); - /* - * Register a metadata profile for PI, or the plain non-integrity NVMe - * metadata masquerading as Type 0 if supported, otherwise reject block - * I/O to namespaces with metadata except when the namespace supports - * PI, as it can strip/insert in that case. - */ - if (!nvme_init_integrity(disk, head)) - capacity = 0; - - set_capacity_and_notify(disk, capacity); - - nvme_config_discard(ctrl, disk, head); + nvme_config_discard(ns->ctrl, disk, head); - if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) + if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) blk_queue_max_write_zeroes_sectors(disk->queue, UINT_MAX); else blk_queue_max_write_zeroes_sectors(disk->queue, - ctrl->max_zeroes_sectors); + ns->ctrl->max_zeroes_sectors); + return valid; } static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info) @@ -2103,6 +2094,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, struct nvme_ns_info *info) { struct nvme_id_ns *id; + sector_t capacity; unsigned lbaf; int ret; @@ -2121,6 +2113,8 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, lbaf = nvme_lbaf_index(id->flbas); ns->head->lba_shift = id->lbaf[lbaf].ds; ns->head->nuse = le64_to_cpu(id->nuse); + capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze)); + nvme_set_queue_limits(ns->ctrl, ns->queue); ret = nvme_configure_metadata(ns->ctrl, ns->head, id); @@ -2129,7 +2123,19 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, goto out; } nvme_set_chunk_sectors(ns, id); - nvme_update_disk_info(ns->ctrl, ns->disk, ns->head, id); + if (!nvme_update_disk_info(ns, id)) + capacity = 0; + + /* + * Register a metadata profile for PI, or the plain non-integrity NVMe + * metadata masquerading as Type 0 if supported, otherwise reject block + * I/O to namespaces with metadata except when the namespace supports + * PI, as it can strip/insert in that case. + */ + if (!nvme_init_integrity(ns->disk, ns->head)) + capacity = 0; + + set_capacity_and_notify(ns->disk, capacity); if (ns->head->ids.csi == NVME_CSI_ZNS) { ret = nvme_update_zone_info(ns, lbaf); From d60c23e4552b04fda600c6a8681dfe57b3ee2bd8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Mar 2024 07:04:53 -0700 Subject: [PATCH 18/31] nvme: move setting the write cache flags out of nvme_set_queue_limits nvme_set_queue_limits is used on the admin queue and all gendisks including hidden ones that don't support block I/O. The write cache setting on the other hand only makes sense for block I/O. Move the blk_queue_write_cache call to nvme_update_ns_info_block instead. Signed-off-by: Christoph Hellwig Reviewed-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 63c2b581f7859..ce70bfb66242e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1954,15 +1954,12 @@ static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl) static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, struct request_queue *q) { - bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT; - blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); blk_queue_max_segments(q, min_t(u32, USHRT_MAX, min_not_zero(nvme_max_drv_segments(ctrl), ctrl->max_segments))); blk_queue_max_integrity_segments(q, ctrl->max_integrity_segments); blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1); blk_queue_dma_alignment(q, 3); - blk_queue_write_cache(q, vwc, vwc); } static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id) @@ -2093,6 +2090,7 @@ static int nvme_update_ns_info_generic(struct nvme_ns *ns, static int nvme_update_ns_info_block(struct nvme_ns *ns, struct nvme_ns_info *info) { + bool vwc = ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT; struct nvme_id_ns *id; sector_t capacity; unsigned lbaf; @@ -2154,6 +2152,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3))) ns->head->features |= NVME_NS_DEAC; set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info)); + blk_queue_write_cache(ns->disk->queue, vwc, vwc); set_bit(NVME_NS_READY, &ns->flags); blk_mq_unfreeze_queue(ns->disk->queue); From 46e7422cda8482aa3074c9caf4c224cf2fb74d71 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Mar 2024 07:04:54 -0700 Subject: [PATCH 19/31] nvme: move common logic into nvme_update_ns_info nvme_update_ns_info_generic and nvme_update_ns_info_block share a fair amount of logic related to not fully supported namespace formats and updating the multipath information. Move this logic into the common caller. Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 84 ++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ce70bfb66242e..f0686a872d0e3 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2070,21 +2070,8 @@ static int nvme_update_ns_info_generic(struct nvme_ns *ns, set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info)); blk_mq_unfreeze_queue(ns->disk->queue); - if (nvme_ns_head_multipath(ns->head)) { - blk_mq_freeze_queue(ns->head->disk->queue); - set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info)); - nvme_mpath_revalidate_paths(ns); - blk_stack_limits(&ns->head->disk->queue->limits, - &ns->queue->limits, 0); - ns->head->disk->flags |= GENHD_FL_HIDDEN; - blk_mq_unfreeze_queue(ns->head->disk->queue); - } - /* Hide the block-interface for these devices */ - ns->disk->flags |= GENHD_FL_HIDDEN; - set_bit(NVME_NS_READY, &ns->flags); - - return 0; + return -ENODEV; } static int nvme_update_ns_info_block(struct nvme_ns *ns, @@ -2104,7 +2091,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, /* namespace not allocated or attached */ info->is_removed = true; ret = -ENODEV; - goto error; + goto out; } blk_mq_freeze_queue(ns->disk->queue); @@ -2162,54 +2149,67 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, goto out; } - if (nvme_ns_head_multipath(ns->head)) { - blk_mq_freeze_queue(ns->head->disk->queue); - nvme_init_integrity(ns->head->disk, ns->head); - set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk)); - set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info)); - nvme_mpath_revalidate_paths(ns); - blk_stack_limits(&ns->head->disk->queue->limits, - &ns->queue->limits, 0); - disk_update_readahead(ns->head->disk); - blk_mq_unfreeze_queue(ns->head->disk->queue); - } - ret = 0; out: - /* - * If probing fails due an unsupported feature, hide the block device, - * but still allow other access. - */ - if (ret == -ENODEV) { - ns->disk->flags |= GENHD_FL_HIDDEN; - set_bit(NVME_NS_READY, &ns->flags); - ret = 0; - } - -error: kfree(id); return ret; } static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) { + bool unsupported = false; + int ret; + switch (info->ids.csi) { case NVME_CSI_ZNS: if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) { dev_info(ns->ctrl->device, "block device for nsid %u not supported without CONFIG_BLK_DEV_ZONED\n", info->nsid); - return nvme_update_ns_info_generic(ns, info); + ret = nvme_update_ns_info_generic(ns, info); + break; } - return nvme_update_ns_info_block(ns, info); + ret = nvme_update_ns_info_block(ns, info); + break; case NVME_CSI_NVM: - return nvme_update_ns_info_block(ns, info); + ret = nvme_update_ns_info_block(ns, info); + break; default: dev_info(ns->ctrl->device, "block device for nsid %u not supported (csi %u)\n", info->nsid, info->ids.csi); - return nvme_update_ns_info_generic(ns, info); + ret = nvme_update_ns_info_generic(ns, info); + break; } + + /* + * If probing fails due an unsupported feature, hide the block device, + * but still allow other access. + */ + if (ret == -ENODEV) { + ns->disk->flags |= GENHD_FL_HIDDEN; + set_bit(NVME_NS_READY, &ns->flags); + unsupported = true; + ret = 0; + } + + if (!ret && nvme_ns_head_multipath(ns->head)) { + blk_mq_freeze_queue(ns->head->disk->queue); + if (unsupported) + ns->head->disk->flags |= GENHD_FL_HIDDEN; + else + nvme_init_integrity(ns->head->disk, ns->head); + set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk)); + set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info)); + nvme_mpath_revalidate_paths(ns); + blk_stack_limits(&ns->head->disk->queue->limits, + &ns->queue->limits, 0); + + disk_update_readahead(ns->head->disk); + blk_mq_unfreeze_queue(ns->head->disk->queue); + } + + return ret; } #ifdef CONFIG_BLK_SED_OPAL From c6fce9f12764c11ab6ff34f4cb0dec06d4d87099 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Mar 2024 07:04:55 -0700 Subject: [PATCH 20/31] nvme: split out a nvme_identify_ns_nvm helper Split the logic to query the Identify Namespace Data Structure, NVM Command Set into a separate helper. Signed-off-by: Christoph Hellwig Reviewed-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f0686a872d0e3..9abcc389f0f3a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1831,12 +1831,35 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) a->csi == b->csi; } +static int nvme_identify_ns_nvm(struct nvme_ctrl *ctrl, unsigned int nsid, + struct nvme_id_ns_nvm **nvmp) +{ + struct nvme_command c = { + .identify.opcode = nvme_admin_identify, + .identify.nsid = cpu_to_le32(nsid), + .identify.cns = NVME_ID_CNS_CS_NS, + .identify.csi = NVME_CSI_NVM, + }; + struct nvme_id_ns_nvm *nvm; + int ret; + + nvm = kzalloc(sizeof(*nvm), GFP_KERNEL); + if (!nvm) + return -ENOMEM; + + ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, nvm, sizeof(*nvm)); + if (ret) + kfree(nvm); + else + *nvmp = nvm; + return ret; +} + static int nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head, struct nvme_id_ns *id) { bool first = id->dps & NVME_NS_DPS_PI_FIRST; unsigned lbaf = nvme_lbaf_index(id->flbas); - struct nvme_command c = { }; struct nvme_id_ns_nvm *nvm; int ret = 0; u32 elbaf; @@ -1849,18 +1872,9 @@ static int nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head, goto set_pi; } - nvm = kzalloc(sizeof(*nvm), GFP_KERNEL); - if (!nvm) - return -ENOMEM; - - c.identify.opcode = nvme_admin_identify; - c.identify.nsid = cpu_to_le32(head->ns_id); - c.identify.cns = NVME_ID_CNS_CS_NS; - c.identify.csi = NVME_CSI_NVM; - - ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, nvm, sizeof(*nvm)); + ret = nvme_identify_ns_nvm(ctrl, head->ns_id, &nvm); if (ret) - goto free_data; + goto set_pi; elbaf = le32_to_cpu(nvm->elbaf[lbaf]); From e5ea00a510c61e9e809a1f13286ac802bbe724a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Mar 2024 07:04:56 -0700 Subject: [PATCH 21/31] nvme: don't query identify data in configure_metadata Move reading the Identify Namespace Data Structure, NVM Command Set out of configure_metadata into the caller. This allows doing the identify call outside the frozen I/O queues, and prepares for using data from the Identify data structure for other purposes. Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 49 ++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 30 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9abcc389f0f3a..a742fa10dd309 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1855,32 +1855,26 @@ static int nvme_identify_ns_nvm(struct nvme_ctrl *ctrl, unsigned int nsid, return ret; } -static int nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head, - struct nvme_id_ns *id) +static void nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head, + struct nvme_id_ns *id, struct nvme_id_ns_nvm *nvm) { bool first = id->dps & NVME_NS_DPS_PI_FIRST; unsigned lbaf = nvme_lbaf_index(id->flbas); - struct nvme_id_ns_nvm *nvm; - int ret = 0; u32 elbaf; head->pi_size = 0; head->ms = le16_to_cpu(id->lbaf[lbaf].ms); - if (!(ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) { + if (!nvm || !(ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) { head->pi_size = sizeof(struct t10_pi_tuple); head->guard_type = NVME_NVM_NS_16B_GUARD; goto set_pi; } - ret = nvme_identify_ns_nvm(ctrl, head->ns_id, &nvm); - if (ret) - goto set_pi; - elbaf = le32_to_cpu(nvm->elbaf[lbaf]); /* no support for storage tag formats right now */ if (nvme_elbaf_sts(elbaf)) - goto free_data; + goto set_pi; head->guard_type = nvme_elbaf_guard_type(elbaf); switch (head->guard_type) { @@ -1894,8 +1888,6 @@ static int nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head, break; } -free_data: - kfree(nvm); set_pi: if (head->pi_size && head->ms >= head->pi_size) head->pi_type = id->dps & NVME_NS_DPS_PI_MASK; @@ -1906,22 +1898,17 @@ static int nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head, head->pi_offset = 0; else head->pi_offset = head->ms - head->pi_size; - - return ret; } -static int nvme_configure_metadata(struct nvme_ctrl *ctrl, - struct nvme_ns_head *head, struct nvme_id_ns *id) +static void nvme_configure_metadata(struct nvme_ctrl *ctrl, + struct nvme_ns_head *head, struct nvme_id_ns *id, + struct nvme_id_ns_nvm *nvm) { - int ret; - - ret = nvme_init_ms(ctrl, head, id); - if (ret) - return ret; + nvme_init_ms(ctrl, head, id, nvm); head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS); if (!head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) - return 0; + return; if (ctrl->ops->flags & NVME_F_FABRICS) { /* @@ -1930,7 +1917,7 @@ static int nvme_configure_metadata(struct nvme_ctrl *ctrl, * remap the separate metadata buffer from the block layer. */ if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT))) - return 0; + return; head->features |= NVME_NS_EXT_LBAS; @@ -1957,7 +1944,6 @@ static int nvme_configure_metadata(struct nvme_ctrl *ctrl, else head->features |= NVME_NS_METADATA_SUPPORTED; } - return 0; } static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl) @@ -2092,6 +2078,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, struct nvme_ns_info *info) { bool vwc = ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT; + struct nvme_id_ns_nvm *nvm = NULL; struct nvme_id_ns *id; sector_t capacity; unsigned lbaf; @@ -2108,6 +2095,12 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, goto out; } + if (ns->ctrl->ctratt & NVME_CTRL_ATTR_ELBAS) { + ret = nvme_identify_ns_nvm(ns->ctrl, info->nsid, &nvm); + if (ret < 0) + goto out; + } + blk_mq_freeze_queue(ns->disk->queue); lbaf = nvme_lbaf_index(id->flbas); ns->head->lba_shift = id->lbaf[lbaf].ds; @@ -2115,12 +2108,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze)); nvme_set_queue_limits(ns->ctrl, ns->queue); - - ret = nvme_configure_metadata(ns->ctrl, ns->head, id); - if (ret < 0) { - blk_mq_unfreeze_queue(ns->disk->queue); - goto out; - } + nvme_configure_metadata(ns->ctrl, ns->head, id, nvm); nvme_set_chunk_sectors(ns, id); if (!nvme_update_disk_info(ns, id)) capacity = 0; @@ -2165,6 +2153,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, ret = 0; out: + kfree(nvm); kfree(id); return ret; } From 27cb91a3a102073473c6aaf0f7f0ba2db27dbf76 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Mar 2024 07:04:57 -0700 Subject: [PATCH 22/31] nvme: cleanup nvme_configure_metadata Fold nvme_init_ms into nvme_configure_metadata after splitting up a little helper to deal with the extended LBA formats. Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 47 ++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 28 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a742fa10dd309..2ecdde3619701 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1855,26 +1855,14 @@ static int nvme_identify_ns_nvm(struct nvme_ctrl *ctrl, unsigned int nsid, return ret; } -static void nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head, +static void nvme_configure_pi_elbas(struct nvme_ns_head *head, struct nvme_id_ns *id, struct nvme_id_ns_nvm *nvm) { - bool first = id->dps & NVME_NS_DPS_PI_FIRST; - unsigned lbaf = nvme_lbaf_index(id->flbas); - u32 elbaf; - - head->pi_size = 0; - head->ms = le16_to_cpu(id->lbaf[lbaf].ms); - if (!nvm || !(ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) { - head->pi_size = sizeof(struct t10_pi_tuple); - head->guard_type = NVME_NVM_NS_16B_GUARD; - goto set_pi; - } - - elbaf = le32_to_cpu(nvm->elbaf[lbaf]); + u32 elbaf = le32_to_cpu(nvm->elbaf[nvme_lbaf_index(id->flbas)]); /* no support for storage tag formats right now */ if (nvme_elbaf_sts(elbaf)) - goto set_pi; + return; head->guard_type = nvme_elbaf_guard_type(elbaf); switch (head->guard_type) { @@ -1887,29 +1875,32 @@ static void nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head, default: break; } - -set_pi: - if (head->pi_size && head->ms >= head->pi_size) - head->pi_type = id->dps & NVME_NS_DPS_PI_MASK; - else - head->pi_type = 0; - - if (first) - head->pi_offset = 0; - else - head->pi_offset = head->ms - head->pi_size; } static void nvme_configure_metadata(struct nvme_ctrl *ctrl, struct nvme_ns_head *head, struct nvme_id_ns *id, struct nvme_id_ns_nvm *nvm) { - nvme_init_ms(ctrl, head, id, nvm); - head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS); + head->pi_type = 0; + head->pi_size = 0; + head->pi_offset = 0; + head->ms = le16_to_cpu(id->lbaf[nvme_lbaf_index(id->flbas)].ms); if (!head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) return; + if (nvm && (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) { + nvme_configure_pi_elbas(head, id, nvm); + } else { + head->pi_size = sizeof(struct t10_pi_tuple); + head->guard_type = NVME_NVM_NS_16B_GUARD; + } + + if (head->pi_size && head->ms >= head->pi_size) + head->pi_type = id->dps & NVME_NS_DPS_PI_MASK; + if (!(id->dps & NVME_NS_DPS_PI_FIRST)) + head->pi_offset = head->ms - head->pi_size; + if (ctrl->ops->flags & NVME_F_FABRICS) { /* * The NVMe over Fabrics specification only supports metadata as From e6c9b130d68144381c097c90c517cc25e8f8924e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Mar 2024 07:04:58 -0700 Subject: [PATCH 23/31] nvme: use the atomic queue limits update API Changes the callchains that update queue_limits to build an on-stack queue_limits and update it atomically. Note that for now only the admin queue actually passes it to the queue allocation function. Doing the same for the gendisks used for the namespaces will require a little more work. Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 133 ++++++++++++++++++++------------------- drivers/nvme/host/nvme.h | 10 +-- drivers/nvme/host/zns.c | 16 ++--- 3 files changed, 80 insertions(+), 79 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2ecdde3619701..6413ce24fb4b1 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1787,40 +1787,27 @@ static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head) return true; } -static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk, - struct nvme_ns_head *head) +static void nvme_config_discard(struct nvme_ns *ns, struct queue_limits *lim) { - struct request_queue *queue = disk->queue; - u32 max_discard_sectors; - - if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX)) { - max_discard_sectors = nvme_lba_to_sect(head, ctrl->dmrsl); - } else if (ctrl->oncs & NVME_CTRL_ONCS_DSM) { - max_discard_sectors = UINT_MAX; - } else { - blk_queue_max_discard_sectors(queue, 0); - return; - } + struct nvme_ctrl *ctrl = ns->ctrl; BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < NVME_DSM_MAX_RANGES); - /* - * If discard is already enabled, don't reset queue limits. - * - * This works around the fact that the block layer can't cope well with - * updating the hardware limits when overridden through sysfs. This is - * harmless because discard limits in NVMe are purely advisory. - */ - if (queue->limits.max_discard_sectors) - return; + if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX)) + lim->max_hw_discard_sectors = + nvme_lba_to_sect(ns->head, ctrl->dmrsl); + else if (ctrl->oncs & NVME_CTRL_ONCS_DSM) + lim->max_hw_discard_sectors = UINT_MAX; + else + lim->max_hw_discard_sectors = 0; + + lim->discard_granularity = lim->logical_block_size; - blk_queue_max_discard_sectors(queue, max_discard_sectors); if (ctrl->dmrl) - blk_queue_max_discard_segments(queue, ctrl->dmrl); + lim->max_discard_segments = ctrl->dmrl; else - blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES); - queue->limits.discard_granularity = queue_logical_block_size(queue); + lim->max_discard_segments = NVME_DSM_MAX_RANGES; } static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) @@ -1942,20 +1929,21 @@ static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl) return ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> SECTOR_SHIFT) + 1; } -static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, - struct request_queue *q) +static void nvme_set_ctrl_limits(struct nvme_ctrl *ctrl, + struct queue_limits *lim) { - blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); - blk_queue_max_segments(q, min_t(u32, USHRT_MAX, - min_not_zero(nvme_max_drv_segments(ctrl), ctrl->max_segments))); - blk_queue_max_integrity_segments(q, ctrl->max_integrity_segments); - blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1); - blk_queue_dma_alignment(q, 3); + lim->max_hw_sectors = ctrl->max_hw_sectors; + lim->max_segments = min_t(u32, USHRT_MAX, + min_not_zero(nvme_max_drv_segments(ctrl), ctrl->max_segments)); + lim->max_integrity_segments = ctrl->max_integrity_segments; + lim->virt_boundary_mask = NVME_CTRL_PAGE_SIZE - 1; + lim->max_segment_size = UINT_MAX; + lim->dma_alignment = 3; } -static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id) +static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id, + struct queue_limits *lim) { - struct gendisk *disk = ns->disk; struct nvme_ns_head *head = ns->head; u32 bs = 1U << head->lba_shift; u32 atomic_bs, phys_bs, io_opt = 0; @@ -1991,23 +1979,19 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id) io_opt = bs * (1 + le16_to_cpu(id->nows)); } - blk_queue_logical_block_size(disk->queue, bs); /* * Linux filesystems assume writing a single physical block is * an atomic operation. Hence limit the physical block size to the * value of the Atomic Write Unit Power Fail parameter. */ - blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs)); - blk_queue_io_min(disk->queue, phys_bs); - blk_queue_io_opt(disk->queue, io_opt); - - nvme_config_discard(ns->ctrl, disk, head); - + lim->logical_block_size = bs; + lim->physical_block_size = min(phys_bs, atomic_bs); + lim->io_min = phys_bs; + lim->io_opt = io_opt; if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) - blk_queue_max_write_zeroes_sectors(disk->queue, UINT_MAX); + lim->max_write_zeroes_sectors = UINT_MAX; else - blk_queue_max_write_zeroes_sectors(disk->queue, - ns->ctrl->max_zeroes_sectors); + lim->max_write_zeroes_sectors = ns->ctrl->max_zeroes_sectors; return valid; } @@ -2022,7 +2006,8 @@ static inline bool nvme_first_scan(struct gendisk *disk) return !disk_live(disk); } -static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id) +static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id, + struct queue_limits *lim) { struct nvme_ctrl *ctrl = ns->ctrl; u32 iob; @@ -2050,25 +2035,33 @@ static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id) return; } - blk_queue_chunk_sectors(ns->queue, iob); + lim->chunk_sectors = iob; } static int nvme_update_ns_info_generic(struct nvme_ns *ns, struct nvme_ns_info *info) { + struct queue_limits lim; + int ret; + blk_mq_freeze_queue(ns->disk->queue); - nvme_set_queue_limits(ns->ctrl, ns->queue); + lim = queue_limits_start_update(ns->disk->queue); + nvme_set_ctrl_limits(ns->ctrl, &lim); + ret = queue_limits_commit_update(ns->disk->queue, &lim); set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info)); blk_mq_unfreeze_queue(ns->disk->queue); /* Hide the block-interface for these devices */ - return -ENODEV; + if (!ret) + ret = -ENODEV; + return ret; } static int nvme_update_ns_info_block(struct nvme_ns *ns, struct nvme_ns_info *info) { bool vwc = ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT; + struct queue_limits lim; struct nvme_id_ns_nvm *nvm = NULL; struct nvme_id_ns *id; sector_t capacity; @@ -2098,11 +2091,26 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, ns->head->nuse = le64_to_cpu(id->nuse); capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze)); - nvme_set_queue_limits(ns->ctrl, ns->queue); + lim = queue_limits_start_update(ns->disk->queue); + nvme_set_ctrl_limits(ns->ctrl, &lim); nvme_configure_metadata(ns->ctrl, ns->head, id, nvm); - nvme_set_chunk_sectors(ns, id); - if (!nvme_update_disk_info(ns, id)) + nvme_set_chunk_sectors(ns, id, &lim); + if (!nvme_update_disk_info(ns, id, &lim)) capacity = 0; + nvme_config_discard(ns, &lim); + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + ns->head->ids.csi == NVME_CSI_ZNS) { + ret = nvme_update_zone_info(ns, lbaf, &lim); + if (ret) { + blk_mq_unfreeze_queue(ns->disk->queue); + goto out; + } + } + ret = queue_limits_commit_update(ns->disk->queue, &lim); + if (ret) { + blk_mq_unfreeze_queue(ns->disk->queue); + goto out; + } /* * Register a metadata profile for PI, or the plain non-integrity NVMe @@ -2115,14 +2123,6 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, set_capacity_and_notify(ns->disk, capacity); - if (ns->head->ids.csi == NVME_CSI_ZNS) { - ret = nvme_update_zone_info(ns, lbaf); - if (ret) { - blk_mq_unfreeze_queue(ns->disk->queue); - goto out; - } - } - /* * Only set the DEAC bit if the device guarantees that reads from * deallocated data return zeroes. While the DEAC bit does not @@ -3128,6 +3128,7 @@ static int nvme_check_ctrl_fabric_info(struct nvme_ctrl *ctrl, struct nvme_id_ct static int nvme_init_identify(struct nvme_ctrl *ctrl) { + struct queue_limits lim; struct nvme_id_ctrl *id; u32 max_hw_sectors; bool prev_apst_enabled; @@ -3194,7 +3195,12 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->max_hw_sectors = min_not_zero(ctrl->max_hw_sectors, max_hw_sectors); - nvme_set_queue_limits(ctrl, ctrl->admin_q); + lim = queue_limits_start_update(ctrl->admin_q); + nvme_set_ctrl_limits(ctrl, &lim); + ret = queue_limits_commit_update(ctrl->admin_q, &lim); + if (ret) + goto out_free; + ctrl->sgls = le32_to_cpu(id->sgls); ctrl->kas = le16_to_cpu(id->kas); ctrl->max_namespaces = le32_to_cpu(id->mnan); @@ -4357,6 +4363,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event); int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, const struct blk_mq_ops *ops, unsigned int cmd_size) { + struct queue_limits lim = {}; int ret; memset(set, 0, sizeof(*set)); @@ -4376,7 +4383,7 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, if (ret) return ret; - ctrl->admin_q = blk_mq_alloc_queue(set, NULL, NULL); + ctrl->admin_q = blk_mq_alloc_queue(set, &lim, NULL); if (IS_ERR(ctrl->admin_q)) { ret = PTR_ERR(ctrl->admin_q); goto out_free_tagset; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 01e8bae788658..27397f8404d65 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1038,8 +1038,9 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk) int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); +int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf, + struct queue_limits *lim); #ifdef CONFIG_BLK_DEV_ZONED -int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf); blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req, struct nvme_command *cmnd, enum nvme_zone_mgmt_action action); @@ -1050,13 +1051,6 @@ static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, { return BLK_STS_NOTSUPP; } - -static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) -{ - dev_warn(ns->ctrl->device, - "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n"); - return -EPROTONOSUPPORT; -} #endif static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 852261d789136..722384bcc765c 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -35,10 +35,10 @@ static int nvme_set_max_append(struct nvme_ctrl *ctrl) return 0; } -int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) +int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf, + struct queue_limits *lim) { struct nvme_effects_log *log = ns->head->effects; - struct request_queue *q = ns->queue; struct nvme_command c = { }; struct nvme_id_ns_zns *id; int status; @@ -99,12 +99,12 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) goto free_data; } - disk_set_zoned(ns->disk); - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); - disk_set_max_open_zones(ns->disk, le32_to_cpu(id->mor) + 1); - disk_set_max_active_zones(ns->disk, le32_to_cpu(id->mar) + 1); - blk_queue_chunk_sectors(ns->queue, ns->head->zsze); - blk_queue_max_zone_append_sectors(ns->queue, ns->ctrl->max_zone_append); + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ns->queue); + lim->zoned = 1; + lim->max_open_zones = le32_to_cpu(id->mor) + 1; + lim->max_active_zones = le32_to_cpu(id->mar) + 1; + lim->chunk_sectors = ns->head->zsze; + lim->max_zone_append_sectors = ns->ctrl->max_zone_append; free_data: kfree(id); return status; From c5be5df7217fec219e1be063859e5d099b6a9227 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Mar 2024 07:04:59 -0700 Subject: [PATCH 24/31] nvme-multipath: pass queue_limits to blk_alloc_disk The multipath disk starts out with the stacking default limits. The one interesting part here is that blk_set_stacking_limits sets the max_zone_append_sectorts to UINT_MAX, which fails the validation for non-zoned devices. With the old one call per limit scheme this was fine because no one verified this weird mismatch and it was fixed by blk_stack_limits a little later before I/O could be issued. Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/multipath.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index dc5d0d0a82d0e..5397fb428b242 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -516,6 +516,7 @@ static void nvme_requeue_work(struct work_struct *work) int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) { + struct queue_limits lim; bool vwc = false; mutex_init(&head->lock); @@ -532,7 +533,12 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) !nvme_is_unique_nsid(ctrl, head) || !multipath) return 0; - head->disk = blk_alloc_disk(NULL, ctrl->numa_node); + blk_set_stacking_limits(&lim); + lim.dma_alignment = 3; + if (head->ids.csi != NVME_CSI_ZNS) + lim.max_zone_append_sectors = 0; + + head->disk = blk_alloc_disk(&lim, ctrl->numa_node); if (IS_ERR(head->disk)) return PTR_ERR(head->disk); head->disk->fops = &nvme_ns_head_ops; @@ -553,11 +559,6 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues) blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue); - /* set to a default value of 512 until the disk is validated */ - blk_queue_logical_block_size(head->disk->queue, 512); - blk_set_stacking_limits(&head->disk->queue->limits); - blk_queue_dma_alignment(head->disk->queue, 3); - /* we need to propagate up the VMC settings */ if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) vwc = true; From f7e0a545f7311691fbcabbb85238c8e4dd1a7c01 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Mar 2024 07:05:00 -0700 Subject: [PATCH 25/31] nvme-multipath: use atomic queue limits API for stacking limits Switch to the queue_limits_* helpers to stack the bdev limits, which also includes updating the readahead settings. Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6413ce24fb4b1..3f985b93a19f9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2188,6 +2188,8 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) } if (!ret && nvme_ns_head_multipath(ns->head)) { + struct queue_limits lim; + blk_mq_freeze_queue(ns->head->disk->queue); if (unsupported) ns->head->disk->flags |= GENHD_FL_HIDDEN; @@ -2196,10 +2198,11 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk)); set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info)); nvme_mpath_revalidate_paths(ns); - blk_stack_limits(&ns->head->disk->queue->limits, - &ns->queue->limits, 0); - disk_update_readahead(ns->head->disk); + lim = queue_limits_start_update(ns->head->disk->queue); + queue_limits_stack_bdev(&lim, ns->disk->part0, 0, + ns->head->disk->disk_name); + ret = queue_limits_commit_update(ns->head->disk->queue, &lim); blk_mq_unfreeze_queue(ns->head->disk->queue); } From 5f5ea0e4916874098ee818f40156fc35dba2bcf9 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 21 Feb 2024 14:45:30 +0100 Subject: [PATCH 26/31] nvme-fabrics: typo in nvmf_parse_key() Of course we should use the key if there is no error ... Signed-off-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/host/fabrics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 3499acbf6a822..ab5ac219b70a8 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -637,7 +637,7 @@ static struct key *nvmf_parse_key(int key_id) } key = key_lookup(key_id); - if (!IS_ERR(key)) + if (IS_ERR(key)) pr_err("key id %08x not found\n", key_id); else pr_debug("Using key id %08x\n", key_id); From ab21f3d9098b0870a385c1cb6b0753c15aa9d429 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Tue, 5 Mar 2024 10:15:56 -0300 Subject: [PATCH 27/31] nvme: core: constify struct class usage Since commit 43a7206b0963 ("driver core: class: make class_register() take a const *"), the driver core allows for struct class to be in read-only memory, so move the structures nvme_class, nvme_subsys_class and nvme_ns_chr_class to be declared at build time placing them into read-only memory, instead of having to be dynamically allocated at boot time. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. Marliere Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 53 +++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3f985b93a19f9..c4d928585ce35 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -114,12 +114,21 @@ static DEFINE_MUTEX(nvme_subsystems_lock); static DEFINE_IDA(nvme_instance_ida); static dev_t nvme_ctrl_base_chr_devt; -static struct class *nvme_class; -static struct class *nvme_subsys_class; +static int nvme_class_uevent(const struct device *dev, struct kobj_uevent_env *env); +static const struct class nvme_class = { + .name = "nvme", + .dev_uevent = nvme_class_uevent, +}; + +static const struct class nvme_subsys_class = { + .name = "nvme-subsystem", +}; static DEFINE_IDA(nvme_ns_chr_minor_ida); static dev_t nvme_ns_chr_devt; -static struct class *nvme_ns_chr_class; +static const struct class nvme_ns_chr_class = { + .name = "nvme-generic", +}; static void nvme_put_subsystem(struct nvme_subsystem *subsys); static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, @@ -2881,7 +2890,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) subsys->awupf = le16_to_cpu(id->awupf); nvme_mpath_default_iopolicy(subsys); - subsys->dev.class = nvme_subsys_class; + subsys->dev.class = &nvme_subsys_class; subsys->dev.release = nvme_release_subsystem; subsys->dev.groups = nvme_subsys_attrs_groups; dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance); @@ -3435,7 +3444,7 @@ int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device, if (minor < 0) return minor; cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor); - cdev_device->class = nvme_ns_chr_class; + cdev_device->class = &nvme_ns_chr_class; cdev_device->release = nvme_cdev_rel; device_initialize(cdev_device); cdev_init(cdev, fops); @@ -4627,7 +4636,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, ctrl->device = &ctrl->ctrl_device; ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt), ctrl->instance); - ctrl->device->class = nvme_class; + ctrl->device->class = &nvme_class; ctrl->device->parent = ctrl->dev; if (ops->dev_attr_groups) ctrl->device->groups = ops->dev_attr_groups; @@ -4860,42 +4869,36 @@ static int __init nvme_core_init(void) if (result < 0) goto destroy_delete_wq; - nvme_class = class_create("nvme"); - if (IS_ERR(nvme_class)) { - result = PTR_ERR(nvme_class); + result = class_register(&nvme_class); + if (result) goto unregister_chrdev; - } - nvme_class->dev_uevent = nvme_class_uevent; - nvme_subsys_class = class_create("nvme-subsystem"); - if (IS_ERR(nvme_subsys_class)) { - result = PTR_ERR(nvme_subsys_class); + result = class_register(&nvme_subsys_class); + if (result) goto destroy_class; - } result = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS, "nvme-generic"); if (result < 0) goto destroy_subsys_class; - nvme_ns_chr_class = class_create("nvme-generic"); - if (IS_ERR(nvme_ns_chr_class)) { - result = PTR_ERR(nvme_ns_chr_class); + result = class_register(&nvme_ns_chr_class); + if (result) goto unregister_generic_ns; - } + result = nvme_init_auth(); if (result) goto destroy_ns_chr; return 0; destroy_ns_chr: - class_destroy(nvme_ns_chr_class); + class_unregister(&nvme_ns_chr_class); unregister_generic_ns: unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS); destroy_subsys_class: - class_destroy(nvme_subsys_class); + class_unregister(&nvme_subsys_class); destroy_class: - class_destroy(nvme_class); + class_unregister(&nvme_class); unregister_chrdev: unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS); destroy_delete_wq: @@ -4911,9 +4914,9 @@ static int __init nvme_core_init(void) static void __exit nvme_core_exit(void) { nvme_exit_auth(); - class_destroy(nvme_ns_chr_class); - class_destroy(nvme_subsys_class); - class_destroy(nvme_class); + class_unregister(&nvme_ns_chr_class); + class_unregister(&nvme_subsys_class); + class_unregister(&nvme_class); unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS); unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS); destroy_workqueue(nvme_delete_wq); From 3c2bcfd5ac41fc379d2a7082f24d7de227b3f1e1 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Tue, 5 Mar 2024 10:15:57 -0300 Subject: [PATCH 28/31] nvme: fabrics: make nvmf_class constant Since commit 43a7206b0963 ("driver core: class: make class_register() take a const *"), the driver core allows for struct class to be in read-only memory, so move the nvmf_class structure to be declared at build time placing it into read-only memory, instead of having to be dynamically allocated at boot time. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. Marliere Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/fabrics.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index ab5ac219b70a8..0141c0a6942ff 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -1318,7 +1318,10 @@ nvmf_create_ctrl(struct device *dev, const char *buf) return ERR_PTR(ret); } -static struct class *nvmf_class; +static const struct class nvmf_class = { + .name = "nvme-fabrics", +}; + static struct device *nvmf_device; static DEFINE_MUTEX(nvmf_dev_mutex); @@ -1438,15 +1441,14 @@ static int __init nvmf_init(void) if (!nvmf_default_host) return -ENOMEM; - nvmf_class = class_create("nvme-fabrics"); - if (IS_ERR(nvmf_class)) { + ret = class_register(&nvmf_class); + if (ret) { pr_err("couldn't register class nvme-fabrics\n"); - ret = PTR_ERR(nvmf_class); goto out_free_host; } nvmf_device = - device_create(nvmf_class, NULL, MKDEV(0, 0), NULL, "ctl"); + device_create(&nvmf_class, NULL, MKDEV(0, 0), NULL, "ctl"); if (IS_ERR(nvmf_device)) { pr_err("couldn't create nvme-fabrics device!\n"); ret = PTR_ERR(nvmf_device); @@ -1462,9 +1464,9 @@ static int __init nvmf_init(void) return 0; out_destroy_device: - device_destroy(nvmf_class, MKDEV(0, 0)); + device_destroy(&nvmf_class, MKDEV(0, 0)); out_destroy_class: - class_destroy(nvmf_class); + class_unregister(&nvmf_class); out_free_host: nvmf_host_put(nvmf_default_host); return ret; @@ -1473,8 +1475,8 @@ static int __init nvmf_init(void) static void __exit nvmf_exit(void) { misc_deregister(&nvmf_misc); - device_destroy(nvmf_class, MKDEV(0, 0)); - class_destroy(nvmf_class); + device_destroy(&nvmf_class, MKDEV(0, 0)); + class_unregister(&nvmf_class); nvmf_host_put(nvmf_default_host); BUILD_BUG_ON(sizeof(struct nvmf_common_command) != 64); From 800bb2b02fb81cc408adf7108d91087ad33d5aa9 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Tue, 5 Mar 2024 10:15:58 -0300 Subject: [PATCH 29/31] nvme: fcloop: make fcloop_class constant Since commit 43a7206b0963 ("driver core: class: make class_register() take a const *"), the driver core allows for struct class to be in read-only memory, so move the fcloop_class structure to be declared at build time placing it into read-only memory, instead of having to be dynamically allocated at boot time. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. Marliere Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/fcloop.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 1471af250ea62..913cd2ec7a6f6 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -1556,7 +1556,9 @@ static const struct attribute_group *fcloop_dev_attr_groups[] = { NULL, }; -static struct class *fcloop_class; +static const struct class fcloop_class = { + .name = "fcloop", +}; static struct device *fcloop_device; @@ -1564,15 +1566,14 @@ static int __init fcloop_init(void) { int ret; - fcloop_class = class_create("fcloop"); - if (IS_ERR(fcloop_class)) { + ret = class_register(&fcloop_class); + if (ret) { pr_err("couldn't register class fcloop\n"); - ret = PTR_ERR(fcloop_class); return ret; } fcloop_device = device_create_with_groups( - fcloop_class, NULL, MKDEV(0, 0), NULL, + &fcloop_class, NULL, MKDEV(0, 0), NULL, fcloop_dev_attr_groups, "ctl"); if (IS_ERR(fcloop_device)) { pr_err("couldn't create ctl device!\n"); @@ -1585,7 +1586,7 @@ static int __init fcloop_init(void) return 0; out_destroy_class: - class_destroy(fcloop_class); + class_unregister(&fcloop_class); return ret; } @@ -1643,8 +1644,8 @@ static void __exit fcloop_exit(void) put_device(fcloop_device); - device_destroy(fcloop_class, MKDEV(0, 0)); - class_destroy(fcloop_class); + device_destroy(&fcloop_class, MKDEV(0, 0)); + class_unregister(&fcloop_class); } module_init(fcloop_init); From 8d0d2447394b13fb22a069f0330f9c49b7fff9d3 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Wed, 6 Mar 2024 15:03:03 +0900 Subject: [PATCH 30/31] nvme: host: fix double-free of struct nvme_id_ns in ns_update_nuse() When nvme_identify_ns() fails, it frees the pointer to the struct nvme_id_ns before it returns. However, ns_update_nuse() calls kfree() for the pointer even when nvme_identify_ns() fails. This results in KASAN double-free, which was observed with blktests nvme/045 with proposed patches [1] on the kernel v6.8-rc7. Fix the double-free by skipping kfree() when nvme_identify_ns() fails. Link: https://lore.kernel.org/linux-block/20240304161303.19681-1-dwagner@suse.de/ [1] Fixes: a1a825ab6a60 ("nvme: add csi, ms and nuse to sysfs") Signed-off-by: Shin'ichiro Kawasaki Reviewed-by: Christoph Hellwig Reviewed-by: Daniel Wagner Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/host/sysfs.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index d099218e494a8..6c7f1d5c056fc 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -221,14 +221,11 @@ static int ns_update_nuse(struct nvme_ns *ns) ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, &id); if (ret) - goto out_free_id; + return ret; ns->head->nuse = le64_to_cpu(id->nuse); - -out_free_id: kfree(id); - - return ret; + return 0; } static ssize_t nuse_show(struct device *dev, struct device_attribute *attr, From 7e80eb792bd7377a20f204943ac31c77d859be89 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 6 Mar 2024 06:20:30 -0800 Subject: [PATCH 31/31] nvme: clear caller pointer on identify failure The memory allocated for the identification is freed on failure. Set it to NULL so the caller doesn't have a pointer to that freed address. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c4d928585ce35..2baf5786a92fe 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1403,8 +1403,10 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, sizeof(struct nvme_id_ctrl)); - if (error) + if (error) { kfree(*id); + *id = NULL; + } return error; } @@ -1533,6 +1535,7 @@ int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid, if (error) { dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error); kfree(*id); + *id = NULL; } return error; }