From 03d99e5d63dabe2c0cea0d8fe1cb89bde33f7939 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 16 Oct 2020 14:28:38 -0700 Subject: [PATCH 01/23] nvme-fcloop: add sysfs attribute to inject command drop Add a sysfs attribute to specify parameters for dropping a command. The attribute takes a string of: <opcode>:<starting instance>:<number of instances> Opcode is formatted as lower 8 bits are opcode. If a fabrics opcode, a bit above bits 7:0 will be set. Once set, each sqe is looked at. If the opcode matches, the running instance count is updated. If the instance count is in the range of where to drop (based on starting and # of times), then drop the command by not passing it to the target layer. Signed-off-by: James Smart --- drivers/nvme/target/fcloop.c | 81 +++++++++++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 3da067a8311e5..733d9363900e4 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -564,6 +564,50 @@ fcloop_call_host_done(struct nvmefc_fcp_req *fcpreq, fcloop_tfcp_req_put(tfcp_req); } +static bool drop_fabric_opcode; +#define DROP_OPCODE_MASK 0x00FF +/* fabrics opcode will have a bit set above 1st byte */ +static int drop_opcode = -1; +static int drop_instance; +static int drop_amount; +static int drop_current_cnt; + +/* + * Routine to parse io and determine if the io is to be dropped. + * Returns: + * 0 if io is not obstructed + * 1 if io was dropped + */ +static int check_for_drop(struct fcloop_fcpreq *tfcp_req) +{ + struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq; + struct nvme_fc_cmd_iu *cmdiu = fcpreq->cmdaddr; + struct nvme_command *sqe = &cmdiu->sqe; + + if (drop_opcode == -1) + return 0; + + pr_info("%s: seq opcd x%02x fctype x%02x: drop F %s op x%02x " "inst %d start %d amt %d\n", + __func__, sqe->common.opcode, sqe->fabrics.fctype, + drop_fabric_opcode ? "y" : "n", + drop_opcode, drop_current_cnt, drop_instance, drop_amount); + + if ((drop_fabric_opcode && + (sqe->common.opcode != nvme_fabrics_command || + sqe->fabrics.fctype != drop_opcode)) || + (!drop_fabric_opcode && sqe->common.opcode != drop_opcode)) + return 0; + + if (++drop_current_cnt >= drop_instance) { + if (drop_current_cnt >= drop_instance + drop_amount) + drop_opcode = -1; + return 1; + } + + return 0; +} + static void fcloop_fcp_recv_work(struct work_struct *work) { @@ -590,10 +634,14 @@ fcloop_fcp_recv_work(struct work_struct *work) if (unlikely(aborted)) ret = -ECANCELED; - else - ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport, + else { + if (likely(!check_for_drop(tfcp_req))) + ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport, &tfcp_req->tgt_fcp_req, fcpreq->cmdaddr, fcpreq->cmdlen); + else + pr_info("%s: dropped command ********\n", __func__); + } if (ret) fcloop_call_host_done(fcpreq, tfcp_req, ret); @@ -1449,6 +1497,33 @@ fcloop_delete_target_port(struct device *dev, struct device_attribute *attr, return ret ? ret : count; } +static ssize_t +fcloop_set_cmd_drop(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + int opcode, starting, amount; + + if (sscanf(buf, "%x:%d:%d", &opcode, &starting, &amount) != 3) + return -EBADRQC; + + drop_current_cnt = 0; + drop_fabric_opcode = (opcode & ~DROP_OPCODE_MASK) ? true : false; + drop_opcode = (opcode & DROP_OPCODE_MASK); + drop_instance = starting; + /* the check to drop routine uses instance + count to know when + * to end. Thus, if dropping 1 instance, count should be 0. + * so subtract 1 from the count.
+ */ + drop_amount = amount - 1; + + pr_info("%s: DROP: Starting at instance %d of%s opcode x%x drop +%d " + "instances\n", + __func__, drop_instance, drop_fabric_opcode ? " fabric" : "", + drop_opcode, drop_amount); + + return count; +} + static DEVICE_ATTR(add_local_port, 0200, NULL, fcloop_create_local_port); static DEVICE_ATTR(del_local_port, 0200, NULL, fcloop_delete_local_port); @@ -1456,6 +1531,7 @@ static DEVICE_ATTR(add_remote_port, 0200, NULL, fcloop_create_remote_port); static DEVICE_ATTR(del_remote_port, 0200, NULL, fcloop_delete_remote_port); static DEVICE_ATTR(add_target_port, 0200, NULL, fcloop_create_target_port); static DEVICE_ATTR(del_target_port, 0200, NULL, fcloop_delete_target_port); +static DEVICE_ATTR(set_cmd_drop, 0200, NULL, fcloop_set_cmd_drop); static struct attribute *fcloop_dev_attrs[] = { &dev_attr_add_local_port.attr, @@ -1464,6 +1540,7 @@ static struct attribute *fcloop_dev_attrs[] = { &dev_attr_del_remote_port.attr, &dev_attr_add_target_port.attr, &dev_attr_del_target_port.attr, + &dev_attr_set_cmd_drop.attr, NULL }; From 84115d6d80c809d65c42f9383f22c10b91a4eb1c Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 27 Oct 2020 16:15:16 +0800 Subject: [PATCH 02/23] nvme: simplify nvme_req_qid() Use the request's '->mq_hctx->queue_num' directly to simplify the nvme_req_qid() function. Signed-off-by: Baolin Wang Signed-off-by: Christoph Hellwig --- drivers/nvme/host/nvme.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index bc330bf0d3bde..87867e93c7d30 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -178,7 +178,8 @@ static inline u16 nvme_req_qid(struct request *req) { if (!req->q->queuedata) return 0; - return blk_mq_unique_tag_to_hwq(blk_mq_unique_tag(req)) + 1; + + return req->mq_hctx->queue_num + 1; } /* The below value is the specific amount of delay needed before checking From 0d2e7c840b178bf9a47bd0de89d8f9182fa71d86 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 16:33:42 -0800 Subject: [PATCH 03/23] nvme: centralize setting the timeout in nvme_alloc_request The function nvme_alloc_request() is called from different context (I/O and Admin queue) where callers do not consider the I/O timeout when called from I/O queue context. Update nvme_alloc_request() to set the default I/O and Admin timeout value based on whether the queuedata is set or not. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 11 +++++++++-- drivers/nvme/host/lightnvm.c | 3 ++- drivers/nvme/host/pci.c | 2 -- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9b01afcb7777b..97348b1ecfd63 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -533,6 +533,11 @@ struct request *nvme_alloc_request(struct request_queue *q, if (IS_ERR(req)) return req; + if (req->q->queuedata) + req->timeout = NVME_IO_TIMEOUT; + else /* no queuedata implies admin queue */ + req->timeout = ADMIN_TIMEOUT; + req->cmd_flags |= REQ_FAILFAST_DRIVER; nvme_clear_nvme_request(req); nvme_req(req)->cmd = cmd; @@ -901,7 +906,8 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, if (IS_ERR(req)) return PTR_ERR(req); - req->timeout = timeout ? 
timeout : ADMIN_TIMEOUT; + if (timeout) + req->timeout = timeout; if (buffer && bufflen) { ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL); @@ -1071,7 +1077,8 @@ static int nvme_submit_user_cmd(struct request_queue *q, if (IS_ERR(req)) return PTR_ERR(req); - req->timeout = timeout ? timeout : ADMIN_TIMEOUT; + if (timeout) + req->timeout = timeout; nvme_req(req)->flags |= NVME_REQ_USERCMD; if (ubuffer && bufflen) { diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 8e562d0f2c301..88a7c8eac4556 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -774,7 +774,8 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, goto err_cmd; } - rq->timeout = timeout ? timeout : ADMIN_TIMEOUT; + if (timeout) + rq->timeout = timeout; if (ppa_buf && ppa_len) { ppa_list = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, &ppa_dma); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 0578ff253c477..76465d3359248 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1310,7 +1310,6 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) return BLK_EH_RESET_TIMER; } - abort_req->timeout = ADMIN_TIMEOUT; abort_req->end_io_data = NULL; blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio); @@ -2223,7 +2222,6 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) if (IS_ERR(req)) return PTR_ERR(req); - req->timeout = ADMIN_TIMEOUT; req->end_io_data = nvmeq; init_completion(&nvmeq->delete_done); From dc96f93874c63e126087e1adf1973c9fecfdaa0c Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 16:33:45 -0800 Subject: [PATCH 04/23] nvme: use consistent macro name for timeout This is purely a cleanup patch: add the NVME prefix to ADMIN_TIMEOUT to make it consistent with NVME_IO_TIMEOUT.
Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 6 +++--- drivers/nvme/host/fc.c | 2 +- drivers/nvme/host/nvme.h | 2 +- drivers/nvme/host/pci.c | 4 ++-- drivers/nvme/host/rdma.c | 2 +- drivers/nvme/host/tcp.c | 2 +- drivers/nvme/target/loop.c | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 97348b1ecfd63..98bea150e5dc0 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -536,7 +536,7 @@ struct request *nvme_alloc_request(struct request_queue *q, if (req->q->queuedata) req->timeout = NVME_IO_TIMEOUT; else /* no queuedata implies admin queue */ - req->timeout = ADMIN_TIMEOUT; + req->timeout = NVME_ADMIN_TIMEOUT; req->cmd_flags |= REQ_FAILFAST_DRIVER; nvme_clear_nvme_request(req); @@ -2268,8 +2268,8 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8); cmd.common.cdw11 = cpu_to_le32(len); - return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, - ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0, false); + return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, 0, + NVME_QID_ANY, 1, 0, false); } EXPORT_SYMBOL_GPL(nvme_sec_submit); #endif /* CONFIG_BLK_SED_OPAL */ diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index f4c246462658f..38373a0e86efb 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -3479,7 +3479,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->lport->ops->fcprqst_priv_sz); ctrl->admin_tag_set.driver_data = ctrl; ctrl->admin_tag_set.nr_hw_queues = 1; - ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; + ctrl->admin_tag_set.timeout = NVME_ADMIN_TIMEOUT; ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED; ret = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 87867e93c7d30..824776a8ba13e 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -24,7 +24,7 @@ extern unsigned int nvme_io_timeout; #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) extern unsigned int admin_timeout; -#define ADMIN_TIMEOUT (admin_timeout * HZ) +#define NVME_ADMIN_TIMEOUT (admin_timeout * HZ) #define NVME_DEFAULT_KATO 5 #define NVME_KATO_GRACE 10 diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 76465d3359248..6123040ff8720 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1606,7 +1606,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) dev->admin_tagset.nr_hw_queues = 1; dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH; - dev->admin_tagset.timeout = ADMIN_TIMEOUT; + dev->admin_tagset.timeout = NVME_ADMIN_TIMEOUT; dev->admin_tagset.numa_node = dev->ctrl.numa_node; dev->admin_tagset.cmd_size = sizeof(struct nvme_iod); dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED; @@ -2237,7 +2237,7 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode) unsigned long timeout; retry: - timeout = ADMIN_TIMEOUT; + timeout = NVME_ADMIN_TIMEOUT; while (nr_queues > 0) { if (nvme_delete_queue(&dev->queues[nr_queues], opcode)) break; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 65e3d0ef36e1a..df9f6f4549f16 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -797,7 +797,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, NVME_RDMA_DATA_SGL_SIZE; set->driver_data = ctrl; set->nr_hw_queues = 1; - set->timeout = 
ADMIN_TIMEOUT; + set->timeout = NVME_ADMIN_TIMEOUT; set->flags = BLK_MQ_F_NO_SCHED; } else { set = &ctrl->tag_set; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index c0c33320fe659..1ba6599274427 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1568,7 +1568,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl, set->cmd_size = sizeof(struct nvme_tcp_request); set->driver_data = ctrl; set->nr_hw_queues = 1; - set->timeout = ADMIN_TIMEOUT; + set->timeout = NVME_ADMIN_TIMEOUT; } else { set = &ctrl->tag_set; memset(set, 0, sizeof(*set)); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index f6d81239be215..76d8c0a9a87d6 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -345,7 +345,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) NVME_INLINE_SG_CNT * sizeof(struct scatterlist); ctrl->admin_tag_set.driver_data = ctrl; ctrl->admin_tag_set.nr_hw_queues = 1; - ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; + ctrl->admin_tag_set.timeout = NVME_ADMIN_TIMEOUT; ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED; ctrl->queues[0].ctrl = ctrl; From a2f6a2b8ce43db608357a490e028166f9e4bab0d Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 16:33:43 -0800 Subject: [PATCH 05/23] nvmet: add passthru admin timeout value attr An NVMeOF controller in passthru mode is capable of handling a wide set of admin commands, including vendor-specific passthru admin commands. The vendor-specific admin commands are used to read large drive logs and can take longer than default NVMe commands, i.e. for passthru requests the timeout value may differ from the passthru controller's default timeout values (nvme-core:admin_timeout). Add a configfs attribute so that the user can set the admin timeout value. If this configfs value is not set, nvme_alloc_request() will set the ADMIN_TIMEOUT value when the request queuedata is NULL.
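A usage sketch (illustrative, not part of the patch; the configfs path assumes the standard nvmet subsystem layout, and the written value is stored verbatim into rq->timeout, which the block layer interprets in jiffies):

    # give passthru admin commands a larger timeout
    echo 600 > /sys/kernel/config/nvmet/subsystems/<subsysnqn>/passthru/admin_timeout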
Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 20 ++++++++++++++++++++ drivers/nvme/target/nvmet.h | 1 + drivers/nvme/target/passthru.c | 6 ++++++ 3 files changed, 27 insertions(+) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 37e1d7784e175..781157a654e95 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -736,9 +736,29 @@ static ssize_t nvmet_passthru_enable_store(struct config_item *item, } CONFIGFS_ATTR(nvmet_passthru_, enable); +static ssize_t nvmet_passthru_admin_timeout_show(struct config_item *item, + char *page) +{ + return sprintf(page, "%u\n", to_subsys(item->ci_parent)->admin_timeout); +} + +static ssize_t nvmet_passthru_admin_timeout_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item->ci_parent); + unsigned int timeout; + + if (kstrtouint(page, 0, &timeout)) + return -EINVAL; + subsys->admin_timeout = timeout; + return count; +} +CONFIGFS_ATTR(nvmet_passthru_, admin_timeout); + static struct configfs_attribute *nvmet_passthru_attrs[] = { &nvmet_passthru_attr_device_path, &nvmet_passthru_attr_enable, + &nvmet_passthru_attr_admin_timeout, NULL, }; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 559a15ccc322c..a0c80e5179a27 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -249,6 +249,7 @@ struct nvmet_subsys { struct nvme_ctrl *passthru_ctrl; char *passthru_ctrl_path; struct config_group passthru_group; + unsigned int admin_timeout; #endif /* CONFIG_NVME_TARGET_PASSTHRU */ }; diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 8ee94f0568983..b496682ccf859 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -227,6 +227,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) struct request_queue *q = ctrl->admin_q; struct nvme_ns *ns = NULL; struct request *rq = NULL; + unsigned int timeout = 0; u32 effects; u16 status; int ret; @@ -242,6 +243,8 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) } q = ns->queue; + } else { + timeout = req->sq->ctrl->subsys->admin_timeout; } rq = nvme_alloc_request(q, req->cmd, 0, NVME_QID_ANY); @@ -250,6 +253,9 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) goto out_put_ns; } + if (timeout) + rq->timeout = timeout; + if (req->sg_cnt) { ret = nvmet_passthru_map_sg(req, rq); if (unlikely(ret)) { From 47e9730c26a4a5d4eab2124d6bbeb94693e44b46 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 16:33:44 -0800 Subject: [PATCH 06/23] nvmet: add passthru io timeout value attr An NVMeOF controller in passthru mode is capable of handling a wide set of I/O commands, including vendor-specific passthru I/O commands. The vendor-specific I/O commands are used to read large drive logs and can take longer than default NVMe commands, i.e. for passthru requests the timeout value may differ from the passthru controller's default timeout values (nvme-core:io_timeout). Add a configfs attribute so that the user can set the I/O timeout value. If this configfs value is not set, nvme_alloc_request() will set the NVME_IO_TIMEOUT value when the request queuedata is set.
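The matching usage sketch for the I/O side, under the same assumptions as the admin_timeout example above:

    echo 600 > /sys/kernel/config/nvmet/subsystems/<subsysnqn>/passthru/io_timeout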
Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 20 ++++++++++++++++++++ drivers/nvme/target/nvmet.h | 1 + drivers/nvme/target/passthru.c | 3 ++- 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 781157a654e95..c61ffd7670626 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -755,10 +755,30 @@ static ssize_t nvmet_passthru_admin_timeout_store(struct config_item *item, } CONFIGFS_ATTR(nvmet_passthru_, admin_timeout); +static ssize_t nvmet_passthru_io_timeout_show(struct config_item *item, + char *page) +{ + return sprintf(page, "%u\n", to_subsys(item->ci_parent)->io_timeout); +} + +static ssize_t nvmet_passthru_io_timeout_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item->ci_parent); + unsigned int timeout; + + if (kstrtouint(page, 0, &timeout)) + return -EINVAL; + subsys->io_timeout = timeout; + return count; +} +CONFIGFS_ATTR(nvmet_passthru_, io_timeout); + static struct configfs_attribute *nvmet_passthru_attrs[] = { &nvmet_passthru_attr_device_path, &nvmet_passthru_attr_enable, &nvmet_passthru_attr_admin_timeout, + &nvmet_passthru_attr_io_timeout, NULL, }; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index a0c80e5179a27..2f96352736292 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -250,6 +250,7 @@ struct nvmet_subsys { char *passthru_ctrl_path; struct config_group passthru_group; unsigned int admin_timeout; + unsigned int io_timeout; #endif /* CONFIG_NVME_TARGET_PASSTHRU */ }; diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index b496682ccf859..a062398305a76 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -227,7 +227,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) struct request_queue *q = ctrl->admin_q; struct nvme_ns *ns = NULL; struct request *rq = NULL; - unsigned int timeout = 0; + unsigned int timeout; u32 effects; u16 status; int ret; @@ -243,6 +243,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) } q = ns->queue; + timeout = req->sq->ctrl->subsys->io_timeout; } else { timeout = req->sq->ctrl->subsys->admin_timeout; } From 53ffabfd4ddb3a24c5603ae82eefb5537ecb5c20 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 18:24:03 -0800 Subject: [PATCH 07/23] block: move blk_rq_bio_prep() to linux/blk-mq.h This is a preparation patch to have minimal block layer request bio append functionality in the context of the NVMeOF Passthru driver which falls in the fast path and doesn't need calls from blk_rq_append_bio(). 
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- block/blk.h | 12 ------------ include/linux/blk-mq.h | 12 ++++++++++++ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/block/blk.h b/block/blk.h index dfab98465db9a..e05507a8d1e3d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -91,18 +91,6 @@ static inline bool bvec_gap_to_prev(struct request_queue *q, return __bvec_gap_to_prev(q, bprv, offset); } -static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, - unsigned int nr_segs) -{ - rq->nr_phys_segments = nr_segs; - rq->__data_len = bio->bi_iter.bi_size; - rq->bio = rq->biotail = bio; - rq->ioprio = bio_prio(bio); - - if (bio->bi_disk) - rq->rq_disk = bio->bi_disk; -} - #ifdef CONFIG_BLK_DEV_INTEGRITY void blk_flush_integrity(void); bool __bio_integrity_endio(struct bio *); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 794b2a33a2c36..e7482e6ad3ec6 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -593,6 +593,18 @@ static inline void blk_mq_cleanup_rq(struct request *rq) rq->q->mq_ops->cleanup_rq(rq); } +static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, + unsigned int nr_segs) +{ + rq->nr_phys_segments = nr_segs; + rq->__data_len = bio->bi_iter.bi_size; + rq->bio = rq->biotail = bio; + rq->ioprio = bio_prio(bio); + + if (bio->bi_disk) + rq->rq_disk = bio->bi_disk; +} + blk_qc_t blk_mq_submit_bio(struct bio *bio); #endif From 39dfe84451b4526a8054cc5a127337bca980dfa3 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 18:24:00 -0800 Subject: [PATCH 08/23] nvme: split nvme_alloc_request() Right now nvme_alloc_request() allocates a request from the block layer based on the value of the qid. When qid is set to NVME_QID_ANY it uses blk_mq_alloc_request(), otherwise blk_mq_alloc_request_hctx(). The function nvme_alloc_request() is called from different contexts; the only place where it uses a non-NVME_QID_ANY value is for fabrics connect commands :- nvme_submit_sync_cmd() NVME_QID_ANY nvme_features() NVME_QID_ANY nvme_sec_submit() NVME_QID_ANY nvmf_reg_read32() NVME_QID_ANY nvmf_reg_read64() NVME_QID_ANY nvmf_reg_write32() NVME_QID_ANY nvmf_connect_admin_queue() NVME_QID_ANY nvme_submit_user_cmd() NVME_QID_ANY nvme_alloc_request() nvme_keep_alive() NVME_QID_ANY nvme_alloc_request() nvme_timeout() NVME_QID_ANY nvme_alloc_request() nvme_delete_queue() NVME_QID_ANY nvme_alloc_request() nvmet_passthru_execute_cmd() NVME_QID_ANY nvme_alloc_request() nvmf_connect_io_queue() QID __nvme_submit_sync_cmd() nvme_alloc_request() With passthru, nvme_alloc_request() now falls into the I/O fast path where blk_mq_alloc_request_hctx() never gets called, yet the qid check adds an additional branch in the fast path. Split nvme_alloc_request() into nvme_alloc_request() and nvme_alloc_request_qid(). Replace each call of nvme_alloc_request() with the NVME_QID_ANY param with a call to the newly added nvme_alloc_request() without a qid param. Replace the call of nvme_alloc_request() with a real QID param with a call to either nvme_alloc_request() or the newly added nvme_alloc_request_qid(), chosen in __nvme_submit_sync_cmd() based on the qid value.
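As an illustration (sketch only, mirroring the diff below), the calling convention after the split becomes:

    /* common case: any queue */
    req = nvme_alloc_request(q, cmd, flags);

    /* fabrics I/O connect: tied to a specific hw queue */
    req = nvme_alloc_request_qid(q, cmd, flags, qid);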
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 52 +++++++++++++++++++++++----------- drivers/nvme/host/lightnvm.c | 5 ++-- drivers/nvme/host/nvme.h | 2 ++ drivers/nvme/host/pci.c | 4 +-- drivers/nvme/target/passthru.c | 2 +- 5 files changed, 42 insertions(+), 23 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 98bea150e5dc0..fff90200497c8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -518,21 +518,14 @@ static inline void nvme_clear_nvme_request(struct request *req) } } -struct request *nvme_alloc_request(struct request_queue *q, - struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid) +static inline unsigned int nvme_req_op(struct nvme_command *cmd) { - unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; - struct request *req; - - if (qid == NVME_QID_ANY) { - req = blk_mq_alloc_request(q, op, flags); - } else { - req = blk_mq_alloc_request_hctx(q, op, flags, - qid ? qid - 1 : 0); - } - if (IS_ERR(req)) - return req; + return nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; +} +static inline void nvme_init_request(struct request *req, + struct nvme_command *cmd) +{ if (req->q->queuedata) req->timeout = NVME_IO_TIMEOUT; else /* no queuedata implies admin queue */ @@ -541,11 +534,33 @@ struct request *nvme_alloc_request(struct request_queue *q, req->cmd_flags |= REQ_FAILFAST_DRIVER; nvme_clear_nvme_request(req); nvme_req(req)->cmd = cmd; +} +struct request *nvme_alloc_request(struct request_queue *q, + struct nvme_command *cmd, blk_mq_req_flags_t flags) +{ + struct request *req; + + req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags); + if (!IS_ERR(req)) + nvme_init_request(req, cmd); return req; } EXPORT_SYMBOL_GPL(nvme_alloc_request); +struct request *nvme_alloc_request_qid(struct request_queue *q, + struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid) +{ + struct request *req; + + req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags, + qid ? 
qid - 1 : 0); + if (!IS_ERR(req)) + nvme_init_request(req, cmd); + return req; +} +EXPORT_SYMBOL_GPL(nvme_alloc_request_qid); + static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable) { struct nvme_command c; @@ -902,7 +917,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, struct request *req; int ret; - req = nvme_alloc_request(q, cmd, flags, qid); + if (qid == NVME_QID_ANY) + req = nvme_alloc_request(q, cmd, flags); + else + req = nvme_alloc_request_qid(q, cmd, flags, qid); if (IS_ERR(req)) return PTR_ERR(req); @@ -1073,7 +1091,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, void *meta = NULL; int ret; - req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY); + req = nvme_alloc_request(q, cmd, 0); if (IS_ERR(req)) return PTR_ERR(req); @@ -1148,8 +1166,8 @@ static int nvme_keep_alive(struct nvme_ctrl *ctrl) { struct request *rq; - rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, BLK_MQ_REQ_RESERVED, - NVME_QID_ANY); + rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, + BLK_MQ_REQ_RESERVED); if (IS_ERR(rq)) return PTR_ERR(rq); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 88a7c8eac4556..470cef3abec3d 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -653,7 +653,7 @@ static struct request *nvme_nvm_alloc_request(struct request_queue *q, nvme_nvm_rqtocmd(rqd, ns, cmd); - rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0, NVME_QID_ANY); + rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0); if (IS_ERR(rq)) return rq; @@ -767,8 +767,7 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, DECLARE_COMPLETION_ONSTACK(wait); int ret = 0; - rq = nvme_alloc_request(q, (struct nvme_command *)vcmd, 0, - NVME_QID_ANY); + rq = nvme_alloc_request(q, (struct nvme_command *)vcmd, 0); if (IS_ERR(rq)) { ret = -ENOMEM; goto err_cmd; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 824776a8ba13e..83fb30e317e07 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -611,6 +611,8 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl); #define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, + struct nvme_command *cmd, blk_mq_req_flags_t flags); +struct request *nvme_alloc_request_qid(struct request_queue *q, struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid); void nvme_cleanup_cmd(struct request *req); blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6123040ff8720..5e6365dd0c8e9 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1304,7 +1304,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) req->tag, nvmeq->qid); abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd, - BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); + BLK_MQ_REQ_NOWAIT); if (IS_ERR(abort_req)) { atomic_inc(&dev->ctrl.abort_limit); return BLK_EH_RESET_TIMER; @@ -2218,7 +2218,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) cmd.delete_queue.opcode = opcode; cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid); - req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); + req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT); if (IS_ERR(req)) return PTR_ERR(req); diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index a062398305a76..be8ae59dcb710 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -248,7 +248,7 @@ 
static void nvmet_passthru_execute_cmd(struct nvmet_req *req) timeout = req->sq->ctrl->subsys->admin_timeout; } - rq = nvme_alloc_request(q, req->cmd, 0, NVME_QID_ANY); + rq = nvme_alloc_request(q, req->cmd, 0); if (IS_ERR(rq)) { status = NVME_SC_INTERNAL; goto out_put_ns; From 06b3bec8204b4c6433ccb2f6ec60fedb77b34cb3 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 18:24:02 -0800 Subject: [PATCH 09/23] nvmet: remove op_flags for passthru commands For passthru commands setting op_flags has no meaning. Remove the code that sets the op flags in nvmet_passthru_map_sg(). Signed-off-by: Chaitanya Kulkarni Reviewed-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- drivers/nvme/target/passthru.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index be8ae59dcb710..1c84dadfb38f2 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -188,21 +188,15 @@ static void nvmet_passthru_req_done(struct request *rq, static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) { struct scatterlist *sg; - int op_flags = 0; struct bio *bio; int i, ret; if (req->sg_cnt > BIO_MAX_PAGES) return -EINVAL; - if (req->cmd->common.opcode == nvme_cmd_flush) - op_flags = REQ_FUA; - else if (nvme_is_write(req->cmd)) - op_flags = REQ_SYNC | REQ_IDLE; - bio = bio_alloc(GFP_KERNEL, req->sg_cnt); bio->bi_end_io = bio_put; - bio->bi_opf = req_op(rq) | op_flags; + bio->bi_opf = req_op(rq); for_each_sg(req->sg, sg, req->sg_cnt, i) { if (bio_add_pc_page(rq->q, bio, sg_page(sg), sg->length, From a4fe2d3afe3ce77edeadb567c0d0a8d102c6b159 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 18:24:04 -0800 Subject: [PATCH 10/23] nvmet: use blk_rq_bio_prep instead of blk_rq_append_bio The function blk_rq_append_bio() is a generic API written for all types of drivers (including those with bounce buffers) and different contexts (e.g. where the request already has a bio, i.e. rq->bio != NULL). It mainly does three things: calculate the segments, handle the bounce queue, and, if req->bio == NULL, call blk_rq_bio_prep(), otherwise handle the low-level merge case. The NVMe PCIe and fabrics transports currently do not use the queue bounce mechanism. Figuring this out for each passthru request means blk_rq_append_bio() does extra work in the fast path for each request. When I ran I/Os with different block sizes on the passthru controller I found that we can reuse req->sg_cnt instead of iterating over the bvecs to find out nr_segs in blk_rq_append_bio(). This calculation in blk_rq_append_bio() is a duplication of work given that we already have the value in req->sg_cnt. (Correct me here if I'm wrong.) With the NVMe passthru request-based driver we allocate a fresh request each time, so on every call to blk_rq_append_bio() rq->bio will be NULL, i.e. we don't really need the second condition in blk_rq_append_bio() or the resulting error condition in its caller. So for the NVMeOF passthru driver, recalculating the segments, the bounce check, and the ll_back_merge code are not needed, and we can get away with a minimal version of blk_rq_append_bio() which removes the error check in the fast path along with an extra variable in nvmet_passthru_map_sg(). This patch updates nvmet_passthru_map_sg() such that it only appends the bio to the request in the context of the NVMeOF passthru driver.
Following are perf numbers :- With current implementation (blk_rq_append_bio()) :- ---------------------------------------------------- + 5.80% 0.02% kworker/0:2-mm_ [nvmet] [k] nvmet_passthru_execute_cmd + 5.44% 0.01% kworker/0:2-mm_ [nvmet] [k] nvmet_passthru_execute_cmd + 4.88% 0.00% kworker/0:2-mm_ [nvmet] [k] nvmet_passthru_execute_cmd + 5.44% 0.01% kworker/0:2-mm_ [nvmet] [k] nvmet_passthru_execute_cmd + 4.86% 0.01% kworker/0:2-mm_ [nvmet] [k] nvmet_passthru_execute_cmd + 5.17% 0.00% kworker/0:2-eve [nvmet] [k] nvmet_passthru_execute_cmd With this patch using blk_rq_bio_prep() :- ---------------------------------------------------- + 3.14% 0.02% kworker/0:2-eve [nvmet] [k] nvmet_passthru_execute_cmd + 3.26% 0.01% kworker/0:2-eve [nvmet] [k] nvmet_passthru_execute_cmd + 5.37% 0.01% kworker/0:2-mm_ [nvmet] [k] nvmet_passthru_execute_cmd + 5.18% 0.02% kworker/0:2-eve [nvmet] [k] nvmet_passthru_execute_cmd + 4.84% 0.02% kworker/0:2-mm_ [nvmet] [k] nvmet_passthru_execute_cmd + 4.87% 0.01% kworker/0:2-mm_ [nvmet] [k] nvmet_passthru_execute_cmd Signed-off-by: Chaitanya Kulkarni Reviewed-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- drivers/nvme/target/passthru.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 1c84dadfb38f2..2b24205ee79d8 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -189,7 +189,7 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) { struct scatterlist *sg; struct bio *bio; - int i, ret; + int i; if (req->sg_cnt > BIO_MAX_PAGES) return -EINVAL; @@ -206,11 +206,7 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) } } - ret = blk_rq_append_bio(rq, &bio); - if (unlikely(ret)) { - bio_put(bio); - return ret; - } + blk_rq_bio_prep(rq, bio, req->sg_cnt); return 0; } From dab3902b19a0dd1668d0cc3e8e4b976b1ee8638c Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 18:24:05 -0800 Subject: [PATCH 11/23] nvmet: use inline bio for passthru fast path In nvmet_passthru_execute_cmd() which is a high frequency function it uses bio_alloc() which leads to memory allocation from the fs pool for each I/O. For NVMeoF nvmet_req we already have inline_bvec allocated as a part of request allocation that can be used with preallocated bio when we already know the size of request before bio allocation with bio_alloc(), which we already do. Introduce a bio member for the nvmet_req passthru anon union. In the fast path, check if we can get away with inline bvec and bio from nvmet_req with bio_init() call before actually allocating from the bio_alloc(). This will be useful to avoid any new memory allocation under high memory pressure situation and get rid of any extra work of allocation (bio_alloc()) vs initialization (bio_init()) when transfer len is < NVMET_MAX_INLINE_DATA_LEN that user can configure at compile time. 
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- drivers/nvme/target/nvmet.h | 1 + drivers/nvme/target/passthru.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 2f96352736292..e89ec280e91a1 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -332,6 +332,7 @@ struct nvmet_req { struct work_struct work; } f; struct { + struct bio inline_bio; struct request *rq; struct work_struct work; bool use_workqueue; diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 2b24205ee79d8..b9776fc8f08f4 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -194,14 +194,20 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) if (req->sg_cnt > BIO_MAX_PAGES) return -EINVAL; - bio = bio_alloc(GFP_KERNEL, req->sg_cnt); - bio->bi_end_io = bio_put; + if (req->transfer_len <= NVMET_MAX_INLINE_DATA_LEN) { + bio = &req->p.inline_bio; + bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); + } else { + bio = bio_alloc(GFP_KERNEL, min(req->sg_cnt, BIO_MAX_PAGES)); + bio->bi_end_io = bio_put; + } bio->bi_opf = req_op(rq); for_each_sg(req->sg, sg, req->sg_cnt, i) { if (bio_add_pc_page(rq->q, bio, sg_page(sg), sg->length, sg->offset) < sg->length) { - bio_put(bio); + if (bio != &req->p.inline_bio) + bio_put(bio); return -EINVAL; } } From ff4e5fbad06f762b8551da56e8fd64ad14c8aa3e Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Thu, 12 Nov 2020 09:23:01 +0100 Subject: [PATCH 12/23] nvme-pci: drop min() from nr_io_queues assignment in nvme_setup_io_queues() the number of I/O queues is set to either 1 in case of a quirky Apple device or to the min of nvme_max_io_queues() or dev->nr_allocated_queues - 1. This is unnecessarily complicated as dev->nr_allocated_queues is only assigned once and is nvme_max_io_queues() + 1. Signed-off-by: Niklas Schnelle Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 5e6365dd0c8e9..90b3384350213 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2113,8 +2113,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) nr_io_queues = 1; else - nr_io_queues = min(nvme_max_io_queues(dev), - dev->nr_allocated_queues - 1); + nr_io_queues = dev->nr_allocated_queues - 1; result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); if (result < 0) From e3aef0950a30ecbf475be52509ca178907410709 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Thu, 12 Nov 2020 09:23:02 +0100 Subject: [PATCH 13/23] nvme-pci: don't allocate unused I/O queues currently the NVME_QUIRK_SHARED_TAGS quirk for Apple devices is handled during the assignment of nr_io_queues in nvme_setup_io_queues(). This however means that for these devices nvme_max_io_queues() will actually not return the supported maximum which is confusing and unexpected and also means that in nvme_probe() we are allocating for I/O queues that will never be used. Fix this by moving the quirk handling into nvme_max_io_queues(). 
Signed-off-by: Niklas Schnelle Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 90b3384350213..2c072f33a577f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2088,6 +2088,12 @@ static void nvme_disable_io_queues(struct nvme_dev *dev) static unsigned int nvme_max_io_queues(struct nvme_dev *dev) { + /* + * If tags are shared with admin queue (Apple bug), then + * make sure we only use one IO queue. + */ + if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) + return 1; return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues; } @@ -2106,15 +2112,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) dev->nr_write_queues = write_queues; dev->nr_poll_queues = poll_queues; - /* - * If tags are shared with admin queue (Apple bug), then - * make sure we only use one IO queue. - */ - if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) - nr_io_queues = 1; - else - nr_io_queues = dev->nr_allocated_queues - 1; - + nr_io_queues = dev->nr_allocated_queues - 1; result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); if (result < 0) return result; From 6d65aeab7bf6e83e75f53cfdbdb84603e52e1182 Mon Sep 17 00:00:00 2001 From: Amit Date: Sun, 15 Nov 2020 14:19:51 +0200 Subject: [PATCH 14/23] nvmet: remove unused ctrl->cqs Remove the unused cqs array from the nvmet_ctrl struct; this will reduce the allocated memory. Signed-off-by: Amit Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 15 ++------------- drivers/nvme/target/nvmet.h | 1 - 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 957b39a82431b..8ce4d59cc9e75 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -757,8 +757,6 @@ void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, { cq->qid = qid; cq->size = size; - - ctrl->cqs[qid] = cq; } void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, @@ -1344,20 +1342,14 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, if (!ctrl->changed_ns_list) goto out_free_ctrl; - ctrl->cqs = kcalloc(subsys->max_qid + 1, - sizeof(struct nvmet_cq *), - GFP_KERNEL); - if (!ctrl->cqs) - goto out_free_changed_ns_list; - ctrl->sqs = kcalloc(subsys->max_qid + 1, sizeof(struct nvmet_sq *), GFP_KERNEL); if (!ctrl->sqs) - goto out_free_cqs; + goto out_free_changed_ns_list; if (subsys->cntlid_min > subsys->cntlid_max) - goto out_free_cqs; + goto out_free_changed_ns_list; ret = ida_simple_get(&cntlid_ida, subsys->cntlid_min, subsys->cntlid_max, @@ -1395,8 +1387,6 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, out_free_sqs: kfree(ctrl->sqs); -out_free_cqs: - kfree(ctrl->cqs); out_free_changed_ns_list: kfree(ctrl->changed_ns_list); out_free_ctrl: @@ -1426,7 +1416,6 @@ static void nvmet_ctrl_free(struct kref *ref) nvmet_async_events_free(ctrl); kfree(ctrl->sqs); - kfree(ctrl->cqs); kfree(ctrl->changed_ns_list); kfree(ctrl); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index e89ec280e91a1..592763732065b 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -164,7 +164,6 @@ static inline struct nvmet_port *ana_groups_to_port( struct nvmet_ctrl { struct nvmet_subsys *subsys; - struct nvmet_cq **cqs; struct nvmet_sq **sqs; bool cmd_seen; From 0068a7b010533872b6e71a376771dc310d90fa1c Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Wed, 25 Nov 2020
12:27:36 +0000 Subject: [PATCH 15/23] nvmet: make sure discovery change log event is protected Generation counter is protected by nvmet_config_sem. Make sure the callers that call functions that might change it, are calling it properly. Signed-off-by: Max Gurtovoy Reviewed-by: Israel Rukshin Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/discovery.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index f40c05c33c3ab..682854e0e079d 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -69,6 +69,7 @@ void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys, struct nvmet_port *port; struct nvmet_subsys_link *s; + lockdep_assert_held(&nvmet_config_sem); nvmet_genctr++; list_for_each_entry(port, nvmet_ports, global_entry) From 9f20599c4821d1f7281a3efb3ef94ff3cfdd5e10 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 26 Nov 2020 22:40:29 +0000 Subject: [PATCH 16/23] nvmet: fix a spelling mistake "incuding" -> "including" in Kconfig There is a spelling mistake in the Kconfig help text. Fix it. Signed-off-by: Colin Ian King Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 8056955e652c6..4be2ececbc45b 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -24,7 +24,7 @@ config NVME_TARGET_PASSTHRU This enables target side NVMe passthru controller support for the NVMe Over Fabrics protocol. It allows for hosts to manage and directly access an actual NVMe controller residing on the target - side, incuding executing Vendor Unique Commands. + side, including executing Vendor Unique Commands. If unsure, say N. From 8c4dfea97f15b80097b3f882ca428fb2751ec30c Mon Sep 17 00:00:00 2001 From: Victor Gladkov Date: Tue, 24 Nov 2020 18:34:59 +0000 Subject: [PATCH 17/23] nvme-fabrics: reject I/O to offline device Commands get stuck while Host NVMe-oF controller is in reconnect state. The controller enters into reconnect state when it loses connection with the target. It tries to reconnect every 10 seconds (default) until a successful reconnect or until the reconnect time-out is reached. The default reconnect time out is 10 minutes. Applications are expecting commands to complete with success or error within a certain timeout (30 seconds by default). The NVMe host is enforcing that timeout while it is connected, but during reconnect the timeout is not enforced and commands may get stuck for a long period or even forever. To fix this long delay due to the default timeout, introduce new "fast_io_fail_tmo" session parameter. The timeout is measured in seconds from the controller reconnect and any command beyond that timeout is rejected. The new parameter value may be passed during 'connect'. The default value of -1 means no timeout (similar to current behavior). 
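A usage sketch of the new option via the fabrics device (addresses and NQN are placeholders; the option string matches the opt_tokens entry added below):

    # fail pending I/O 30 seconds after the controller enters reconnect state
    echo "transport=tcp,traddr=192.168.1.10,trsvcid=4420,nqn=testnqn,fast_io_fail_tmo=30" > /dev/nvme-fabrics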
Signed-off-by: Victor Gladkov Signed-off-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Chao Leng Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 46 ++++++++++++++++++++++++++++++++++- drivers/nvme/host/fabrics.c | 25 ++++++++++++++++--- drivers/nvme/host/fabrics.h | 5 ++++ drivers/nvme/host/multipath.c | 2 ++ drivers/nvme/host/nvme.h | 3 +++ 5 files changed, 77 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index fff90200497c8..9c1645f28a7a1 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -148,6 +148,38 @@ int nvme_try_sched_reset(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_try_sched_reset); +static void nvme_failfast_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = container_of(to_delayed_work(work), + struct nvme_ctrl, failfast_work); + + if (ctrl->state != NVME_CTRL_CONNECTING) + return; + + set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags); + dev_info(ctrl->device, "failfast expired\n"); + nvme_kick_requeue_lists(ctrl); +} + +static inline void nvme_start_failfast_work(struct nvme_ctrl *ctrl) +{ + if (!ctrl->opts || ctrl->opts->fast_io_fail_tmo == -1) + return; + + schedule_delayed_work(&ctrl->failfast_work, + ctrl->opts->fast_io_fail_tmo * HZ); +} + +static inline void nvme_stop_failfast_work(struct nvme_ctrl *ctrl) +{ + if (!ctrl->opts) + return; + + cancel_delayed_work_sync(&ctrl->failfast_work); + clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags); +} + + int nvme_reset_ctrl(struct nvme_ctrl *ctrl) { if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) @@ -433,8 +465,17 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, } spin_unlock_irqrestore(&ctrl->lock, flags); - if (changed && ctrl->state == NVME_CTRL_LIVE) + if (!changed) + return false; + + if (ctrl->state == NVME_CTRL_LIVE) { + if (old_state == NVME_CTRL_CONNECTING) + nvme_stop_failfast_work(ctrl); nvme_kick_requeue_lists(ctrl); + } else if (ctrl->state == NVME_CTRL_CONNECTING && + old_state == NVME_CTRL_RESETTING) { + nvme_start_failfast_work(ctrl); + } return changed; } EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); @@ -4372,6 +4413,7 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl) { nvme_mpath_stop(ctrl); nvme_stop_keep_alive(ctrl); + nvme_stop_failfast_work(ctrl); flush_work(&ctrl->async_event_work); cancel_work_sync(&ctrl->fw_act_work); } @@ -4437,6 +4479,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, int ret; ctrl->state = NVME_CTRL_NEW; + clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags); spin_lock_init(&ctrl->lock); mutex_init(&ctrl->scan_lock); INIT_LIST_HEAD(&ctrl->namespaces); @@ -4453,6 +4496,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, init_waitqueue_head(&ctrl->state_wq); INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); + INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work); memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 8575724734e02..72ac00173500f 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -549,6 +549,7 @@ blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl, { if (ctrl->state != NVME_CTRL_DELETING_NOIO && ctrl->state != NVME_CTRL_DEAD && + !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) && !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) return BLK_STS_RESOURCE; @@ -615,6 +616,7 @@ static const 
match_table_t opt_tokens = { { NVMF_OPT_NR_WRITE_QUEUES, "nr_write_queues=%d" }, { NVMF_OPT_NR_POLL_QUEUES, "nr_poll_queues=%d" }, { NVMF_OPT_TOS, "tos=%d" }, + { NVMF_OPT_FAIL_FAST_TMO, "fast_io_fail_tmo=%d" }, { NVMF_OPT_ERR, NULL } }; @@ -634,6 +636,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY; opts->kato = NVME_DEFAULT_KATO; opts->duplicate_connect = false; + opts->fast_io_fail_tmo = NVMF_DEF_FAIL_FAST_TMO; opts->hdr_digest = false; opts->data_digest = false; opts->tos = -1; /* < 0 == use transport default */ @@ -754,6 +757,17 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, pr_warn("ctrl_loss_tmo < 0 will reconnect forever\n"); ctrl_loss_tmo = token; break; + case NVMF_OPT_FAIL_FAST_TMO: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + + if (token >= 0) + pr_warn("I/O fail on reconnect controller after %d sec\n", + token); + opts->fast_io_fail_tmo = token; + break; case NVMF_OPT_HOSTNQN: if (opts->host) { pr_err("hostnqn already user-assigned: %s\n", @@ -884,11 +898,15 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->nr_poll_queues = 0; opts->duplicate_connect = true; } - if (ctrl_loss_tmo < 0) + if (ctrl_loss_tmo < 0) { opts->max_reconnects = -1; - else + } else { opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, opts->reconnect_delay); + if (ctrl_loss_tmo < opts->fast_io_fail_tmo) + pr_warn("failfast tmo (%d) larger than controller loss tmo (%d)\n", + opts->fast_io_fail_tmo, ctrl_loss_tmo); + } if (!opts->host) { kref_get(&nvmf_default_host->ref); @@ -988,7 +1006,8 @@ EXPORT_SYMBOL_GPL(nvmf_free_options); #define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \ NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \ NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\ - NVMF_OPT_DISABLE_SQFLOW) + NVMF_OPT_DISABLE_SQFLOW |\ + NVMF_OPT_FAIL_FAST_TMO) static struct nvme_ctrl * nvmf_create_ctrl(struct device *dev, const char *buf) diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index a9c1e3b4585ec..733010d2eafd8 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -15,6 +15,8 @@ #define NVMF_DEF_RECONNECT_DELAY 10 /* default to 600 seconds of reconnect attempts before giving up */ #define NVMF_DEF_CTRL_LOSS_TMO 600 +/* default is -1: the fail fast mechanism is disabled */ +#define NVMF_DEF_FAIL_FAST_TMO -1 /* * Define a host as seen by the target. 
We allocate one at boot, but also @@ -56,6 +58,7 @@ enum { NVMF_OPT_NR_WRITE_QUEUES = 1 << 17, NVMF_OPT_NR_POLL_QUEUES = 1 << 18, NVMF_OPT_TOS = 1 << 19, + NVMF_OPT_FAIL_FAST_TMO = 1 << 20, }; /** @@ -89,6 +92,7 @@ enum { * @nr_write_queues: number of queues for write I/O * @nr_poll_queues: number of queues for polling I/O * @tos: type of service + * @fast_io_fail_tmo: Fast I/O fail timeout in seconds */ struct nvmf_ctrl_options { unsigned mask; @@ -111,6 +115,7 @@ struct nvmf_ctrl_options { unsigned int nr_write_queues; unsigned int nr_poll_queues; int tos; + int fast_io_fail_tmo; }; /* diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 74896be40c176..71696819c2280 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -279,6 +279,8 @@ static bool nvme_available_path(struct nvme_ns_head *head) struct nvme_ns *ns; list_for_each_entry_rcu(ns, &head->list, siblings) { + if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags)) + continue; switch (ns->ctrl->state) { case NVME_CTRL_LIVE: case NVME_CTRL_RESETTING: diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 83fb30e317e07..ae017f7277980 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -305,6 +305,7 @@ struct nvme_ctrl { struct work_struct scan_work; struct work_struct async_event_work; struct delayed_work ka_work; + struct delayed_work failfast_work; struct nvme_command ka_cmd; struct work_struct fw_act_work; unsigned long events; @@ -338,6 +339,8 @@ struct nvme_ctrl { u16 icdoff; u16 maxcmd; int nr_reconnects; + unsigned long flags; +#define NVME_CTRL_FAILFAST_EXPIRED 0 struct nvmf_ctrl_options *opts; struct page *discard_page; From aa9d729592316e121110daa81604f71f82663167 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Mon, 30 Nov 2020 21:47:46 +0900 Subject: [PATCH 18/23] nvme: improve an error message on Identify failure Add the namespace ID to the error message when the Identify command used to retrieve the Namespace Identification Descriptor list fails. This avoids rather useless and duplicative messages like the following: [ 1.321031] nvme nvme0: Identify Descriptors failed (16386) [ 1.321948] nvme nvme0: Identify Descriptors failed (16386) [ 1.322872] nvme nvme0: Identify Descriptors failed (16386) [ 1.323775] nvme nvme0: Identify Descriptors failed (16386) [ 1.324687] nvme nvme0: Identify Descriptors failed (16386) ... Also, print the nvme status code in hexadecimal rather than decimal format for better readability. Signed-off-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9c1645f28a7a1..73c6684aaee94 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1368,7 +1368,8 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, NVME_IDENTIFY_DATA_SIZE); if (status) { dev_warn(ctrl->device, - "Identify Descriptors failed (%d)\n", status); + "Identify Descriptors failed (nsid=%u, status=0x%x)\n", + nsid, status); goto free_data; } From f781f3dd6a165d860c29eeb092af8584284e50f3 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Mon, 30 Nov 2020 21:47:47 +0900 Subject: [PATCH 19/23] nvme: print a warning for when listing active namespaces fails During the scan_work, an Identify command is issued to figure out which namespaces are active. If this command fails, the nvme driver falls back to scanning namespaces sequentially.
In this situation, we don't see any warnings and can't even easily tell whether the list-ns command failed. Print a warning when the Identify command execution fails: [ 1.108399] nvme nvme0: Identify NS List failed (status=0x400b) [ 1.109583] nvme0n1: detected capacity change from 0 to 1048576 [ 1.112186] nvme nvme0: Identify Descriptors failed (nsid=2, status=0x4002) [ 1.113929] nvme nvme0: Identify Descriptors failed (nsid=3, status=0x4002) [ 1.116537] nvme nvme0: Identify Descriptors failed (nsid=4, status=0x4002) ... Signed-off-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 73c6684aaee94..279f4aea861bc 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4110,8 +4110,11 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl) ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list, NVME_IDENTIFY_DATA_SIZE); - if (ret) + if (ret) { + dev_warn(ctrl->device, + "Identify NS List failed (status=0x%x)\n", ret); goto free; + } for (i = 0; i < nr_entries; i++) { u32 nsid = le32_to_cpu(ns_list[i]); From e1aaf5cacba9d994d825a87a33bdd33343477f16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Tue, 1 Dec 2020 13:56:07 +0100 Subject: [PATCH 20/23] nvme: remove unnecessary return values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clean up unnecessary ret values that are not checked or used in nvme_alloc_ns(). Signed-off-by: Javier González Reviewed-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 279f4aea861bc..6d0c902034afb 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3872,7 +3872,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, struct gendisk *disk; struct nvme_id_ns *id; char disk_name[DISK_NAME_LEN]; - int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT, ret; + int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT; if (nvme_identify_ns(ctrl, nsid, ids, &id)) return; @@ -3896,8 +3896,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, ns->ctrl = ctrl; kref_init(&ns->kref); - ret = nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED); - if (ret) + if (nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED)) goto out_free_queue; nvme_set_disk_name(disk_name, ns, ctrl, &flags); @@ -3916,8 +3915,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, goto out_put_disk; if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { - ret = nvme_nvm_register(ns, disk_name, node); - if (ret) { + if (nvme_nvm_register(ns, disk_name, node)) { dev_warn(ctrl->device, "LightNVM init failure\n"); goto out_put_disk; } From f68abd9cc00cce58c5dbe5953ac190d25f1e4f8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Tue, 1 Dec 2020 13:56:08 +0100 Subject: [PATCH 21/23] nvme: rename controller base dev_t char device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename the controller base dev_t char device in preparation for adding a namespace char device.
Signed-off-by: Javier González Reviewed-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6d0c902034afb..5aaaae7884e52 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -85,7 +85,7 @@ static LIST_HEAD(nvme_subsystems); static DEFINE_MUTEX(nvme_subsystems_lock); static DEFINE_IDA(nvme_instance_ida); -static dev_t nvme_chr_devt; +static dev_t nvme_ctrl_base_chr_devt; static struct class *nvme_class; static struct class *nvme_subsys_class; @@ -4517,7 +4517,8 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, device_initialize(&ctrl->ctrl_device); ctrl->device = &ctrl->ctrl_device; - ctrl->device->devt = MKDEV(MAJOR(nvme_chr_devt), ctrl->instance); + ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt), + ctrl->instance); ctrl->device->class = nvme_class; ctrl->device->parent = ctrl->dev; ctrl->device->groups = nvme_dev_attr_groups; @@ -4726,7 +4727,8 @@ static int __init nvme_core_init(void) if (!nvme_delete_wq) goto destroy_reset_wq; - result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme"); + result = alloc_chrdev_region(&nvme_ctrl_base_chr_devt, 0, + NVME_MINORS, "nvme"); if (result < 0) goto destroy_delete_wq; @@ -4747,7 +4749,7 @@ static int __init nvme_core_init(void) destroy_class: class_destroy(nvme_class); unregister_chrdev: - unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); + unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS); destroy_delete_wq: destroy_workqueue(nvme_delete_wq); destroy_reset_wq: @@ -4762,7 +4764,7 @@ static void __exit nvme_core_exit(void) { class_destroy(nvme_subsys_class); class_destroy(nvme_class); - unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); + unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS); destroy_workqueue(nvme_delete_wq); destroy_workqueue(nvme_reset_wq); destroy_workqueue(nvme_wq); From ba4fb3205680ade6c29c80102e86b88641709561 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Tue, 1 Dec 2020 13:56:09 +0100 Subject: [PATCH 22/23] nvme: rename bdev operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename block device operations in preparation for adding char device file operations.
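With the rename, the block-side ops table reads unambiguously next to a future char-device table. A sketch of the intended shape (the nvme_ns_chr_fops side is hypothetical here; it is not introduced by this patch, only prepared for):

	/* Block device path, as renamed below. */
	static const struct block_device_operations nvme_bdev_ops = {
		.owner	= THIS_MODULE,
		.ioctl	= nvme_ioctl,
	};

	/* Hypothetical follow-up: per-namespace char device entry points. */
	static const struct file_operations nvme_ns_chr_fops = {
		.owner	= THIS_MODULE,
	};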
Signed-off-by: Javier González Reviewed-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5aaaae7884e52..1520773d545fd 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2334,7 +2334,7 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, EXPORT_SYMBOL_GPL(nvme_sec_submit); #endif /* CONFIG_BLK_SED_OPAL */ -static const struct block_device_operations nvme_fops = { +static const struct block_device_operations nvme_bdev_ops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, .compat_ioctl = nvme_compat_ioctl, @@ -3342,7 +3342,7 @@ static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); - if (disk->fops == &nvme_fops) + if (disk->fops == &nvme_bdev_ops) return nvme_get_ns_from_dev(dev)->head; else return disk->private_data; @@ -3451,7 +3451,7 @@ static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj, } #ifdef CONFIG_NVME_MULTIPATH if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) { - if (dev_to_disk(dev)->fops != &nvme_fops) /* per-path attr */ + if (dev_to_disk(dev)->fops != &nvme_bdev_ops) /* per-path attr */ return 0; if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl)) return 0; @@ -3904,7 +3904,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, if (!disk) goto out_unlink_ns; - disk->fops = &nvme_fops; + disk->fops = &nvme_bdev_ops; disk->private_data = ns; disk->queue = ns->queue; disk->flags = flags; From 2f4c9ba23b887e7a69a474e9d53f38b5833a2119 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Tue, 1 Dec 2020 13:02:21 +0100 Subject: [PATCH 23/23] nvme: export zoned namespaces without Zone Append support read-only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow ZNS NVMe SSDs to present a read-only namespace when append is not supported, instead of rejecting the namespace directly. This allows (i) the namespace to be used in read-only mode, which is not a problem as the append command only affects the write path, and (ii) to use standard management tools such as nvme-cli to choose a different format or firmware slot that is compatible with the Linux zoned block device. 
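Condensing the hunks below into the net behavior (a paraphrase, not the literal driver code):

	/* zns.c: no Zone Append support -> keep the namespace, mark it. */
	if (!(le32_to_cpu(log->iocs[nvme_cmd_zone_append]) &
			NVME_CMD_EFFECTS_CSUPP))
		set_bit(NVME_NS_FORCE_RO, &ns->flags);

	/* core.c: the flag (or the RO attribute) forces the disk read-only. */
	if ((id->nsattr & NVME_NS_ATTR_RO) ||
	    test_bit(NVME_NS_FORCE_RO, &ns->flags))
		set_disk_ro(disk, true);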
Signed-off-by: Javier González Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 3 ++- drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/zns.c | 13 +++++++++---- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1520773d545fd..99f91efe38240 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2125,7 +2125,8 @@ static void nvme_update_disk_info(struct gendisk *disk, nvme_config_discard(disk, ns); nvme_config_write_zeroes(disk, ns); - if (id->nsattr & NVME_NS_ATTR_RO) + if ((id->nsattr & NVME_NS_ATTR_RO) || + test_bit(NVME_NS_FORCE_RO, &ns->flags)) set_disk_ro(disk, true); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ae017f7277980..bfcedfa4b0570 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -452,6 +452,7 @@ struct nvme_ns { #define NVME_NS_REMOVING 0 #define NVME_NS_DEAD 1 #define NVME_NS_ANA_PENDING 2 +#define NVME_NS_FORCE_RO 3 struct nvme_fault_inject fault_inject; diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 67e87e9f306f1..1dfe9a3500e3a 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -55,12 +55,17 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) int status; /* Driver requires zone append support */ - if (!(le32_to_cpu(log->iocs[nvme_cmd_zone_append]) & + if ((le32_to_cpu(log->iocs[nvme_cmd_zone_append]) & NVME_CMD_EFFECTS_CSUPP)) { + if (test_and_clear_bit(NVME_NS_FORCE_RO, &ns->flags)) + dev_warn(ns->ctrl->device, + "Zone Append supported for zoned namespace:%d. Remove read-only mode\n", + ns->head->ns_id); + } else { + set_bit(NVME_NS_FORCE_RO, &ns->flags); dev_warn(ns->ctrl->device, - "append not supported for zoned namespace:%d\n", - ns->head->ns_id); - return -EINVAL; + "Zone Append not supported for zoned namespace:%d. Forcing to read-only mode\n", + ns->head->ns_id); } /* Lazily query controller append limit for the first zoned namespace */