From 2a14c9ae15a38148484a128b84bff7e9ffd90d68 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 16 Jun 2021 14:19:33 -0700 Subject: [PATCH 01/21] params: lift param_set_uint_minmax to common code It is a useful helper hence move it to common code so others can enjoy it. Suggested-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- include/linux/moduleparam.h | 2 ++ kernel/params.c | 18 ++++++++++++++++++ net/sunrpc/xprtsock.c | 18 ------------------ 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index eed280fae4330..962cd41a2cb5a 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -431,6 +431,8 @@ extern int param_get_int(char *buffer, const struct kernel_param *kp); extern const struct kernel_param_ops param_ops_uint; extern int param_set_uint(const char *val, const struct kernel_param *kp); extern int param_get_uint(char *buffer, const struct kernel_param *kp); +int param_set_uint_minmax(const char *val, const struct kernel_param *kp, + unsigned int min, unsigned int max); #define param_check_uint(name, p) __param_check(name, p, unsigned int) extern const struct kernel_param_ops param_ops_long; diff --git a/kernel/params.c b/kernel/params.c index 2daa2780a92cf..8299bd764e42e 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -243,6 +243,24 @@ STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); STANDARD_PARAM_DEF(hexint, unsigned int, "%#08x", kstrtouint); +int param_set_uint_minmax(const char *val, const struct kernel_param *kp, + unsigned int min, unsigned int max) +{ + unsigned int num; + int ret; + + if (!val) + return -EINVAL; + ret = kstrtouint(val, 0, &num); + if (ret) + return ret; + if (num < min || num > max) + return -EINVAL; + *((unsigned int *)kp->arg) = num; + return 0; +} +EXPORT_SYMBOL_GPL(param_set_uint_minmax); + int param_set_charp(const char *val, const struct kernel_param *kp) { if (strlen(val) > 1024) { diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index e573dcecdd66f..b7dbdcbdeb6c7 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -3149,24 +3149,6 @@ void cleanup_socket_xprt(void) xprt_unregister_transport(&xs_bc_tcp_transport); } -static int param_set_uint_minmax(const char *val, - const struct kernel_param *kp, - unsigned int min, unsigned int max) -{ - unsigned int num; - int ret; - - if (!val) - return -EINVAL; - ret = kstrtouint(val, 0, &num); - if (ret) - return ret; - if (num < min || num > max) - return -EINVAL; - *((unsigned int *)kp->arg) = num; - return 0; -} - static int param_set_portnr(const char *val, const struct kernel_param *kp) { return param_set_uint_minmax(val, kp, From 27453b45e62da8656739f7e1365ea9318e7b040e Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 16 Jun 2021 14:19:34 -0700 Subject: [PATCH 02/21] nvme-pci: limit maximum queue depth to 4095 We are going to use the upper 4-bits of the command_id for a generation counter, so enforce the new queue depth upper limit. As we enforce both min and max queue depth, use param_set_uint_minmax istead of open coding it. Reviewed-by: Chaitanya Kulkarni Signed-off-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Daniel Wagner Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index db7a9bee20142..4880badb26d4a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -60,6 +60,8 @@ MODULE_PARM_DESC(sgl_threshold, "Use SGLs when average request segment size is larger or equal to " "this size. Use 0 to disable SGLs."); +#define NVME_PCI_MIN_QUEUE_SIZE 2 +#define NVME_PCI_MAX_QUEUE_SIZE 4095 static int io_queue_depth_set(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops io_queue_depth_ops = { .set = io_queue_depth_set, @@ -68,7 +70,7 @@ static const struct kernel_param_ops io_queue_depth_ops = { static unsigned int io_queue_depth = 1024; module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644); -MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); +MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2 and < 4096"); static int io_queue_count_set(const char *val, const struct kernel_param *kp) { @@ -157,14 +159,8 @@ struct nvme_dev { static int io_queue_depth_set(const char *val, const struct kernel_param *kp) { - int ret; - u32 n; - - ret = kstrtou32(val, 10, &n); - if (ret != 0 || n < 2) - return -EINVAL; - - return param_set_uint(val, kp); + return param_set_uint_minmax(val, kp, NVME_PCI_MIN_QUEUE_SIZE, + NVME_PCI_MAX_QUEUE_SIZE); } static inline unsigned int sq_idx(unsigned int qid, u32 stride) From 3b01a9d0caa8276d9ce314e09610f7fb70f49a00 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 16 Jun 2021 14:19:35 -0700 Subject: [PATCH 03/21] nvme-tcp: don't check blk_mq_tag_to_rq when receiving pdu data We already validate it when receiving the c2hdata pdu header and this is not changing so this is a redundant check. Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Daniel Wagner Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 8cb15ee5b249e..d649b446da66c 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -702,17 +702,9 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, unsigned int *offset, size_t *len) { struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu; - struct nvme_tcp_request *req; - struct request *rq; - - rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); - if (!rq) { - dev_err(queue->ctrl->ctrl.device, - "queue %d tag %#x not found\n", - nvme_tcp_queue_id(queue), pdu->command_id); - return -ENOENT; - } - req = blk_mq_rq_to_pdu(rq); + struct request *rq = + blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); while (true) { int recv_len, ret; From e7006de6c23803799be000a5dcce4d916a36541a Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 16 Jun 2021 14:19:36 -0700 Subject: [PATCH 04/21] nvme: code command_id with a genctr for use-after-free validation We cannot detect a (perhaps buggy) controller that is sending us a completion for a request that was already completed (for example sending a completion twice), this phenomenon was seen in the wild a few times. So to protect against this, we use the upper 4 msbits of the nvme sqe command_id to use as a 4-bit generation counter and verify it matches the existing request generation that is incrementing on every execution. The 16-bit command_id structure now is constructed by: | xxxx | xxxxxxxxxxxx | gen request tag This means that we are giving up some possible queue depth as 12 bits allow for a maximum queue depth of 4095 instead of 65536, however we never create such long queues anyways so no real harm done. Suggested-by: Keith Busch Signed-off-by: Sagi Grimberg Acked-by: Keith Busch Reviewed-by: Hannes Reinecke Reviewed-by: Daniel Wagner Tested-by: Daniel Wagner Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 3 ++- drivers/nvme/host/nvme.h | 47 +++++++++++++++++++++++++++++++++++++- drivers/nvme/host/pci.c | 2 +- drivers/nvme/host/rdma.c | 4 ++-- drivers/nvme/host/tcp.c | 26 ++++++++++----------- drivers/nvme/target/loop.c | 4 ++-- 6 files changed, 66 insertions(+), 20 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ce33014e3eb08..b9a46c54f714e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1026,7 +1026,8 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req) return BLK_STS_IOERR; } - cmd->common.command_id = req->tag; + nvme_req(req)->genctr++; + cmd->common.command_id = nvme_cid(req); trace_nvme_setup_cmd(req, cmd); return ret; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ab803f91ace17..57d2ac00a6bd6 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -152,6 +152,7 @@ enum nvme_quirks { struct nvme_request { struct nvme_command *cmd; union nvme_result result; + u8 genctr; u8 retries; u8 flags; u16 status; @@ -491,6 +492,49 @@ struct nvme_ctrl_ops { int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); }; +/* + * nvme command_id is constructed as such: + * | xxxx | xxxxxxxxxxxx | + * gen request tag + */ +#define nvme_genctr_mask(gen) (gen & 0xf) +#define nvme_cid_install_genctr(gen) (nvme_genctr_mask(gen) << 12) +#define nvme_genctr_from_cid(cid) ((cid & 0xf000) >> 12) +#define nvme_tag_from_cid(cid) (cid & 0xfff) + +static inline u16 nvme_cid(struct request *rq) +{ + return nvme_cid_install_genctr(nvme_req(rq)->genctr) | rq->tag; +} + +static inline struct request *nvme_find_rq(struct blk_mq_tags *tags, + u16 command_id) +{ + u8 genctr = nvme_genctr_from_cid(command_id); + u16 tag = nvme_tag_from_cid(command_id); + struct request *rq; + + rq = blk_mq_tag_to_rq(tags, tag); + if (unlikely(!rq)) { + pr_err("could not locate request for tag %#x\n", + tag); + return NULL; + } + if (unlikely(nvme_genctr_mask(nvme_req(rq)->genctr) != genctr)) { + dev_err(nvme_req(rq)->ctrl->device, + "request %#x genctr mismatch (got %#x expected %#x)\n", + tag, genctr, nvme_genctr_mask(nvme_req(rq)->genctr)); + return NULL; + } + return rq; +} + +static inline struct request *nvme_cid_to_rq(struct blk_mq_tags *tags, + u16 command_id) +{ + return blk_mq_tag_to_rq(tags, nvme_tag_from_cid(command_id)); +} + #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS void nvme_fault_inject_init(struct nvme_fault_inject *fault_inj, const char *dev_name); @@ -588,7 +632,8 @@ static inline void nvme_put_ctrl(struct nvme_ctrl *ctrl) static inline bool nvme_is_aen_req(u16 qid, __u16 command_id) { - return !qid && command_id >= NVME_AQ_BLK_MQ_DEPTH; + return !qid && + nvme_tag_from_cid(command_id) >= NVME_AQ_BLK_MQ_DEPTH; } void nvme_complete_rq(struct request *req); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4880badb26d4a..0471c2c7d64b1 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1010,7 +1010,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) return; } - req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), command_id); + req = nvme_find_rq(nvme_queue_tagset(nvmeq), command_id); if (unlikely(!req)) { dev_warn(nvmeq->dev->ctrl.device, "invalid id %d completed on queue %d\n", diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 7f6b3a9915014..69ae67652f38c 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1730,10 +1730,10 @@ static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, struct request *rq; struct nvme_rdma_request *req; - rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id); + rq = nvme_find_rq(nvme_rdma_tagset(queue), cqe->command_id); if (!rq) { dev_err(queue->ctrl->ctrl.device, - "tag 0x%x on QP %#x not found\n", + "got bad command_id %#x on QP %#x\n", cqe->command_id, queue->qp->qp_num); nvme_rdma_error_recovery(queue->ctrl); return; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index d649b446da66c..0a97ba02f61e9 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -487,11 +487,11 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue, { struct request *rq; - rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id); + rq = nvme_find_rq(nvme_tcp_tagset(queue), cqe->command_id); if (!rq) { dev_err(queue->ctrl->ctrl.device, - "queue %d tag 0x%x not found\n", - nvme_tcp_queue_id(queue), cqe->command_id); + "got bad cqe.command_id %#x on queue %d\n", + cqe->command_id, nvme_tcp_queue_id(queue)); nvme_tcp_error_recovery(&queue->ctrl->ctrl); return -EINVAL; } @@ -508,11 +508,11 @@ static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue, { struct request *rq; - rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id); if (!rq) { dev_err(queue->ctrl->ctrl.device, - "queue %d tag %#x not found\n", - nvme_tcp_queue_id(queue), pdu->command_id); + "got bad c2hdata.command_id %#x on queue %d\n", + pdu->command_id, nvme_tcp_queue_id(queue)); return -ENOENT; } @@ -606,7 +606,7 @@ static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req, data->hdr.plen = cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst); data->ttag = pdu->ttag; - data->command_id = rq->tag; + data->command_id = nvme_cid(rq); data->data_offset = cpu_to_le32(req->data_sent); data->data_length = cpu_to_le32(req->pdu_len); return 0; @@ -619,11 +619,11 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, struct request *rq; int ret; - rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id); if (!rq) { dev_err(queue->ctrl->ctrl.device, - "queue %d tag %#x not found\n", - nvme_tcp_queue_id(queue), pdu->command_id); + "got bad r2t.command_id %#x on queue %d\n", + pdu->command_id, nvme_tcp_queue_id(queue)); return -ENOENT; } req = blk_mq_rq_to_pdu(rq); @@ -703,7 +703,7 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, { struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu; struct request *rq = - blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id); struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); while (true) { @@ -796,8 +796,8 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue, } if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { - struct request *rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), - pdu->command_id); + struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue), + pdu->command_id); nvme_tcp_end_request(rq, NVME_SC_SUCCESS); queue->nr_cqe++; diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 3a17a7e26bbfc..0285ccc7541f6 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -107,10 +107,10 @@ static void nvme_loop_queue_response(struct nvmet_req *req) } else { struct request *rq; - rq = blk_mq_tag_to_rq(nvme_loop_tagset(queue), cqe->command_id); + rq = nvme_find_rq(nvme_loop_tagset(queue), cqe->command_id); if (!rq) { dev_err(queue->ctrl->ctrl.device, - "tag 0x%x on queue %d not found\n", + "got bad command_id %#x on queue %d\n", cqe->command_id, nvme_loop_queue_idx(queue)); return; } From 0521905e859fd1a07949cb18efb20cdd4aab3b20 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 14 Jul 2021 14:02:37 -0700 Subject: [PATCH 05/21] nvme-pci: use attribute group for cmb sysfs Appending sysfs files to the controller kobject is a bit clunky and becomes a maintenance problem as more attributes are added. The attribute group infrastructure handles this better, so use that. Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 72 ++++++++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 0471c2c7d64b1..6658f58ef8243 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -155,6 +155,8 @@ struct nvme_dev { unsigned int nr_allocated_queues; unsigned int nr_write_queues; unsigned int nr_poll_queues; + + bool attrs_added; }; static int io_queue_depth_set(const char *val, const struct kernel_param *kp) @@ -1804,17 +1806,6 @@ static int nvme_create_io_queues(struct nvme_dev *dev) return ret >= 0 ? 0 : ret; } -static ssize_t nvme_cmb_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); - - return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz : x%08x\n", - ndev->cmbloc, ndev->cmbsz); -} -static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL); - static u64 nvme_cmb_size_unit(struct nvme_dev *dev) { u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK; @@ -1883,20 +1874,6 @@ static void nvme_map_cmb(struct nvme_dev *dev) if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) == (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) pci_p2pmem_publish(pdev, true); - - if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, - &dev_attr_cmb.attr, NULL)) - dev_warn(dev->ctrl.device, - "failed to add sysfs attribute for CMB\n"); -} - -static inline void nvme_release_cmb(struct nvme_dev *dev) -{ - if (dev->cmb_size) { - sysfs_remove_file_from_group(&dev->ctrl.device->kobj, - &dev_attr_cmb.attr, NULL); - dev->cmb_size = 0; - } } static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) @@ -2076,6 +2053,38 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) return ret; } +static ssize_t cmb_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "cmbloc : x%08x\ncmbsz : x%08x\n", + ndev->cmbloc, ndev->cmbsz); +} +static DEVICE_ATTR_RO(cmb); + +static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct nvme_ctrl *ctrl = + dev_get_drvdata(container_of(kobj, struct device, kobj)); + struct nvme_dev *dev = to_nvme_dev(ctrl); + + if (a == &dev_attr_cmb.attr && !dev->cmbsz) + return 0; + return a->mode; +} + +static struct attribute *nvme_pci_attrs[] = { + &dev_attr_cmb.attr, + NULL, +}; + +static const struct attribute_group nvme_pci_attr_group = { + .attrs = nvme_pci_attrs, + .is_visible = nvme_pci_attrs_are_visible, +}; + /* * nirqs is the number of interrupts available for write and read * queues. The core already reserved an interrupt for the admin queue. @@ -2747,6 +2756,10 @@ static void nvme_reset_work(struct work_struct *work) goto out; } + if (!dev->attrs_added && !sysfs_create_group(&dev->ctrl.device->kobj, + &nvme_pci_attr_group)) + dev->attrs_added = true; + nvme_start_ctrl(&dev->ctrl); return; @@ -2995,6 +3008,13 @@ static void nvme_shutdown(struct pci_dev *pdev) nvme_disable_prepare_reset(dev, true); } +static void nvme_remove_attrs(struct nvme_dev *dev) +{ + if (dev->attrs_added) + sysfs_remove_group(&dev->ctrl.device->kobj, + &nvme_pci_attr_group); +} + /* * The driver's remove may be called on a device in a partially initialized * state. This function must not have any dependencies on the device state in @@ -3016,7 +3036,7 @@ static void nvme_remove(struct pci_dev *pdev) nvme_stop_ctrl(&dev->ctrl); nvme_remove_namespaces(&dev->ctrl); nvme_dev_disable(dev, true); - nvme_release_cmb(dev); + nvme_remove_attrs(dev); nvme_free_host_mem(dev); nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); From 1751e97aa940656b5de0e620f02cf193a275e014 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 16 Jul 2021 09:22:49 +0200 Subject: [PATCH 06/21] nvme-pci: cmb sysfs: one file, one value An attribute should only be exporting one value as recommended in Documentation/filesystems/sysfs.rst. Implement CMB attributes this way. The old attribute will remain for backward compatibility. Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6658f58ef8243..909dadcdab09d 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2063,6 +2063,24 @@ static ssize_t cmb_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RO(cmb); +static ssize_t cmbloc_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "%u\n", ndev->cmbloc); +} +static DEVICE_ATTR_RO(cmbloc); + +static ssize_t cmbsz_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "%u\n", ndev->cmbsz); +} +static DEVICE_ATTR_RO(cmbsz); + static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n) { @@ -2070,13 +2088,19 @@ static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj, dev_get_drvdata(container_of(kobj, struct device, kobj)); struct nvme_dev *dev = to_nvme_dev(ctrl); - if (a == &dev_attr_cmb.attr && !dev->cmbsz) - return 0; + if (a == &dev_attr_cmb.attr || + a == &dev_attr_cmbloc.attr || + a == &dev_attr_cmbsz.attr) { + if (!dev->cmbsz) + return 0; + } return a->mode; } static struct attribute *nvme_pci_attrs[] = { &dev_attr_cmb.attr, + &dev_attr_cmbloc.attr, + &dev_attr_cmbsz.attr, NULL, }; From e23439e977ed2b247912c2b5c6945ef1bc380100 Mon Sep 17 00:00:00 2001 From: Hou Pu Date: Fri, 9 Jul 2021 10:32:47 +0800 Subject: [PATCH 07/21] nvme-fabrics: remove superfluous nvmf_host_put in nvmf_parse_options Opts->host is NULL there. It is checked just before. So remove nvmf_host_put. It is introduced by commit 59a2f3f00fd7 ("nvme: fix potential memory leak in option parsing"). Signed-off-by: Hou Pu Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index a5469fd9d4c31..668c6bb7a567f 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -719,7 +719,6 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, ret = -EINVAL; goto out; } - nvmf_host_put(opts->host); opts->host = nvmf_host_add(p); kfree(p); if (!opts->host) { From a7b5e8d864b356fdacfea08d9042261c37bc918e Mon Sep 17 00:00:00 2001 From: Hou Pu Date: Mon, 5 Jul 2021 11:15:28 +0800 Subject: [PATCH 08/21] nvme: add set feature tracing support A nvme connect command produces following trace. Before: /sys/kernel/debug/tracing# cat trace | grep feature kworker/5:1H-98 [005] .... 3221.294844: nvme_setup_cmd: nvme0: qid=0, cmdid=25, nsid=0, flags=0x0, meta=0x0, cmd=(nvme_admin_set_features cdw10=07 00 00 00 07 00 07 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00) kworker/4:1H-124 [004] .... 3222.009186: nvme_setup_cmd: nvme0: qid=0, cmdid=17, nsid=0, flags=0x0, meta=0x0, cmd=(nvme_admin_set_features cdw10=0b 00 00 00 00 09 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00) After: /sys/kernel/debug/tracing# cat trace | grep feature kworker/0:1H-253 [000] .... 196.060509: nvme_setup_cmd: nvme0: qid=0, cmdid=29, nsid=0, flags=0x0, meta=0x0, cmd=(nvme_admin_set_features fid=0x7, sv=0x0, cdw11=0x70007) kworker/0:1H-253 [000] .... 196.763947: nvme_setup_cmd: nvme0: qid=0, cmdid=29, nsid=0, flags=0x0, meta=0x0, cmd=(nvme_admin_set_features fid=0xb, sv=0x0, cdw11=0x900) Using ',' to separate different field like others in nvmet_trace_admin_get_features. Signed-off-by: Hou Pu Signed-off-by: Christoph Hellwig --- drivers/nvme/host/trace.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 6543015b61219..2a89c5aa0790b 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -72,6 +72,20 @@ static const char *nvme_trace_admin_identify(struct trace_seq *p, u8 *cdw10) return ret; } +static const char *nvme_trace_admin_set_features(struct trace_seq *p, + u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 fid = cdw10[0]; + u8 sv = cdw10[3] & 0x8; + u32 cdw11 = get_unaligned_le32(cdw10 + 4); + + trace_seq_printf(p, "fid=0x%x, sv=0x%x, cdw11=0x%x", fid, sv, cdw11); + trace_seq_putc(p, 0); + + return ret; +} + static const char *nvme_trace_admin_get_features(struct trace_seq *p, u8 *cdw10) { @@ -80,7 +94,7 @@ static const char *nvme_trace_admin_get_features(struct trace_seq *p, u8 sel = cdw10[1] & 0x7; u32 cdw11 = get_unaligned_le32(cdw10 + 4); - trace_seq_printf(p, "fid=0x%x sel=0x%x cdw11=0x%x", fid, sel, cdw11); + trace_seq_printf(p, "fid=0x%x, sel=0x%x, cdw11=0x%x", fid, sel, cdw11); trace_seq_putc(p, 0); return ret; @@ -201,6 +215,8 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, return nvme_trace_create_cq(p, cdw10); case nvme_admin_identify: return nvme_trace_admin_identify(p, cdw10); + case nvme_admin_set_features: + return nvme_trace_admin_set_features(p, cdw10); case nvme_admin_get_features: return nvme_trace_admin_get_features(p, cdw10); case nvme_admin_get_lba_status: From 8d84f9de69ca23f2637dc19d96f39228c8426e97 Mon Sep 17 00:00:00 2001 From: Hou Pu Date: Mon, 5 Jul 2021 11:15:29 +0800 Subject: [PATCH 09/21] nvmet: add set feature tracing support A nvme connect command produces following trace from the target side. Before: kworker/0:1H-56 [000] .... 9012.155139: nvmet_req_init: nvmet1: qid=0, cmdid=16, nsid=0, flags=0x40, meta=0x0, cmd=(nvme_admin_set_features, cdw10=07 00 00 00 07 00 07 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00) kworker/0:1H-56 [000] .... 9012.872272: nvmet_req_init: nvmet1: qid=0, cmdid=13, nsid=0, flags=0x40, meta=0x0, cmd=(nvme_admin_set_features, cdw10=0b 00 00 00 00 09 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00) cmdline:/sys/kernel/debug/tracing# cat trace | grep feature kworker/0:1H-56 [000] .... 203.493914: nvmet_req_init: nvmet1: qid=0, cmdid=29, nsid=0, flags=0x40, meta=0x0, cmd=(nvme_admin_set_features, fid=0x7, sv=0x0, cdw11=0x70007) kworker/0:1H-56 [000] .... 204.197079: nvmet_req_init: nvmet1: qid=0, cmdid=29, nsid=0, flags=0x40, meta=0x0, cmd=(nvme_admin_set_features, fid=0xb, sv=0x0, cdw11=0x900) Using ',' to separate different field like others in nvmet_trace_admin_get_features. Signed-off-by: Hou Pu Signed-off-by: Christoph Hellwig --- drivers/nvme/target/trace.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/trace.c b/drivers/nvme/target/trace.c index 1373a3c67962a..bff454d46255b 100644 --- a/drivers/nvme/target/trace.c +++ b/drivers/nvme/target/trace.c @@ -27,7 +27,7 @@ static const char *nvmet_trace_admin_get_features(struct trace_seq *p, u8 sel = cdw10[1] & 0x7; u32 cdw11 = get_unaligned_le32(cdw10 + 4); - trace_seq_printf(p, "fid=0x%x sel=0x%x cdw11=0x%x", fid, sel, cdw11); + trace_seq_printf(p, "fid=0x%x, sel=0x%x, cdw11=0x%x", fid, sel, cdw11); trace_seq_putc(p, 0); return ret; @@ -49,6 +49,20 @@ static const char *nvmet_trace_get_lba_status(struct trace_seq *p, return ret; } +static const char *nvmet_trace_admin_set_features(struct trace_seq *p, + u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 fid = cdw10[0]; + u8 sv = cdw10[3] & 0x8; + u32 cdw11 = get_unaligned_le32(cdw10 + 4); + + trace_seq_printf(p, "fid=0x%x, sv=0x%x, cdw11=0x%x", fid, sv, cdw11); + trace_seq_putc(p, 0); + + return ret; +} + static const char *nvmet_trace_read_write(struct trace_seq *p, u8 *cdw10) { const char *ret = trace_seq_buffer_ptr(p); @@ -94,6 +108,8 @@ const char *nvmet_trace_parse_admin_cmd(struct trace_seq *p, switch (opcode) { case nvme_admin_identify: return nvmet_trace_admin_identify(p, cdw10); + case nvme_admin_set_features: + return nvmet_trace_admin_set_features(p, cdw10); case nvme_admin_get_features: return nvmet_trace_admin_get_features(p, cdw10); case nvme_admin_get_lba_status: From ad0e9a80ba0f20db0f86e23d1ad2979513a9a8ee Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 6 Jul 2021 15:56:50 +0100 Subject: [PATCH 10/21] nvmet: remove redundant assignments of variable status There are two occurrances where variable status is being assigned a value that is never read and it is being re-assigned a new value almost immediately afterwards on an error exit path. The assignments are redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/zns.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c index 17f8b7a45f218..46bc30fe85d2b 100644 --- a/drivers/nvme/target/zns.c +++ b/drivers/nvme/target/zns.c @@ -115,14 +115,11 @@ void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req) } status = nvmet_req_find_ns(req); - if (status) { - status = NVME_SC_INTERNAL; + if (status) goto done; - } if (!bdev_is_zoned(req->ns->bdev)) { req->error_loc = offsetof(struct nvme_identify, nsid); - status = NVME_SC_INVALID_NS | NVME_SC_DNR; goto done; } From e5ad96f388b765fe6b52f64f37e910c0ba4f3de7 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 27 Jul 2021 09:40:44 -0700 Subject: [PATCH 11/21] nvme-pci: disable hmb on idle suspend An idle suspend may or may not disable host memory access from devices placed in low power mode. Either way, it should always be safe to disable the host memory buffer prior to entering the low power mode, and this should also always be faster than a full device shutdown. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 909dadcdab09d..5b23d5818f752 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3087,8 +3087,13 @@ static int nvme_resume(struct device *dev) if (ndev->last_ps == U32_MAX || nvme_set_power_state(ctrl, ndev->last_ps) != 0) - return nvme_try_sched_reset(&ndev->ctrl); + goto reset; + if (ctrl->hmpre && nvme_setup_host_mem(ndev)) + goto reset; + return 0; +reset: + return nvme_try_sched_reset(ctrl); } static int nvme_suspend(struct device *dev) @@ -3112,15 +3117,9 @@ static int nvme_suspend(struct device *dev) * the PCI bus layer to put it into D3 in order to take the PCIe link * down, so as to allow the platform to achieve its minimum low-power * state (which may not be possible if the link is up). - * - * If a host memory buffer is enabled, shut down the device as the NVMe - * specification allows the device to access the host memory buffer in - * host DRAM from all power states, but hosts will fail access to DRAM - * during S3. */ if (pm_suspend_via_firmware() || !ctrl->npss || !pcie_aspm_enabled(pdev) || - ndev->nr_host_mem_descs || (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND)) return nvme_disable_prepare_reset(ndev, true); @@ -3131,6 +3130,17 @@ static int nvme_suspend(struct device *dev) if (ctrl->state != NVME_CTRL_LIVE) goto unfreeze; + /* + * Host memory access may not be successful in a system suspend state, + * but the specification allows the controller to access memory in a + * non-operational power state. + */ + if (ndev->hmb) { + ret = nvme_set_host_mem(ndev, 0); + if (ret < 0) + goto unfreeze; + } + ret = nvme_get_power_state(ctrl, &ndev->last_ps); if (ret < 0) goto unfreeze; From a5df5e79c43c84d9fb88f56b707c5ff52b27ccca Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 27 Jul 2021 09:40:43 -0700 Subject: [PATCH 12/21] nvme: allow user toggling hmb usage The NVMe host memory buffer may consume a non-negligable amount of memory. Controllers are required to function without the host memory buffer enabled, but with possibly degraded performance. Export a sysfs property to toggle this feature on a per-device granularity so users may choose to reclaim memory at the expense of storage performance. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 45 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 5b23d5818f752..b82492cd75033 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -137,6 +137,7 @@ struct nvme_dev { u32 cmbloc; struct nvme_ctrl ctrl; u32 last_ps; + bool hmb; mempool_t *iod_mempool; @@ -1896,7 +1897,9 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) dev_warn(dev->ctrl.device, "failed to set host mem (err %d, flags %#x).\n", ret, bits); - } + } else + dev->hmb = bits & NVME_HOST_MEM_ENABLE; + return ret; } @@ -2081,6 +2084,42 @@ static ssize_t cmbsz_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RO(cmbsz); +static ssize_t hmb_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "%d\n", ndev->hmb); +} + +static ssize_t hmb_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + bool new; + int ret; + + if (strtobool(buf, &new) < 0) + return -EINVAL; + + if (new == ndev->hmb) + return count; + + if (new) { + ret = nvme_setup_host_mem(ndev); + } else { + ret = nvme_set_host_mem(ndev, 0); + if (!ret) + nvme_free_host_mem(ndev); + } + + if (ret < 0) + return ret; + + return count; +} +static DEVICE_ATTR_RW(hmb); + static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n) { @@ -2094,6 +2133,9 @@ static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj, if (!dev->cmbsz) return 0; } + if (a == &dev_attr_hmb.attr && !ctrl->hmpre) + return 0; + return a->mode; } @@ -2101,6 +2143,7 @@ static struct attribute *nvme_pci_attrs[] = { &dev_attr_cmb.attr, &dev_attr_cmbloc.attr, &dev_attr_cmbsz.attr, + &dev_attr_hmb.attr, NULL, }; From d48f92cd2739258a1292be56bbeadb5b6a57ea09 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 6 Aug 2021 08:41:43 -0700 Subject: [PATCH 13/21] nvme-tcp: pair send_mutex init with destroy Each mutex_init() should have a corresponding mutex_destroy(). Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 0a97ba02f61e9..95d4cf777d24d 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1220,6 +1220,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) sock_release(queue->sock); kfree(queue->pdu); + mutex_destroy(&queue->send_mutex); mutex_destroy(&queue->queue_lock); } @@ -1525,6 +1526,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, sock_release(queue->sock); queue->sock = NULL; err_destroy_mutex: + mutex_destroy(&queue->send_mutex); mutex_destroy(&queue->queue_lock); return ret; } From 664227fde63844d69e9ec9e90a8a7801e6ff072d Mon Sep 17 00:00:00 2001 From: Ruozhu Li Date: Sat, 7 Aug 2021 11:50:23 +0800 Subject: [PATCH 14/21] nvme-tcp: don't update queue count when failing to set io queues We update ctrl->queue_count and schedule another reconnect when io queue count is zero.But we will never try to create any io queue in next reco- nnection, because ctrl->queue_count already set to zero.We will end up having an admin-only session in Live state, which is exactly what we try to avoid in the original patch. Update ctrl->queue_count after queue_count zero checking to fix it. Signed-off-by: Ruozhu Li Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 95d4cf777d24d..6450256201542 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1763,13 +1763,13 @@ static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) if (ret) return ret; - ctrl->queue_count = nr_io_queues + 1; - if (ctrl->queue_count < 2) { + if (nr_io_queues == 0) { dev_err(ctrl->device, "unable to set any I/O queues\n"); return -ENOMEM; } + ctrl->queue_count = nr_io_queues + 1; dev_info(ctrl->device, "creating %d I/O queues.\n", nr_io_queues); From 85032874f80ba17bf187de1d14d9603bf3f582b8 Mon Sep 17 00:00:00 2001 From: Ruozhu Li Date: Wed, 28 Jul 2021 17:41:20 +0800 Subject: [PATCH 15/21] nvme-rdma: don't update queue count when failing to set io queues We update ctrl->queue_count and schedule another reconnect when io queue count is zero.But we will never try to create any io queue in next reco- nnection, because ctrl->queue_count already set to zero.We will end up having an admin-only session in Live state, which is exactly what we try to avoid in the original patch. Update ctrl->queue_count after queue_count zero checking to fix it. Signed-off-by: Ruozhu Li Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 69ae67652f38c..a68704e39084e 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -735,13 +735,13 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl) if (ret) return ret; - ctrl->ctrl.queue_count = nr_io_queues + 1; - if (ctrl->ctrl.queue_count < 2) { + if (nr_io_queues == 0) { dev_err(ctrl->ctrl.device, "unable to set any I/O queues\n"); return -ENOMEM; } + ctrl->ctrl.queue_count = nr_io_queues + 1; dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n", nr_io_queues); From e804d5abe2d74cfe23f5f83be580d1cdc9307111 Mon Sep 17 00:00:00 2001 From: Amit Engel Date: Sun, 8 Aug 2021 09:20:14 +0300 Subject: [PATCH 16/21] nvmet: pass back cntlid on successful completion According to the NVMe specification, the response dword 0 value of the Connect command is based on status code: return cntlid for successful compeltion return IPO and IATTR for connect invalid parameters. Fix a missing error information for a zero sized queue, and return the cntlid also for I/O queue Connect commands. Signed-off-by: Amit Engel Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fabrics-cmd.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 7d0f3523fdab2..8ef564c3b32c8 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -120,6 +120,7 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) if (!sqsize) { pr_warn("queue size zero!\n"); req->error_loc = offsetof(struct nvmf_connect_command, sqsize); + req->cqe->result.u32 = IPO_IATTR_CONNECT_SQE(sqsize); ret = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; goto err; } @@ -260,11 +261,11 @@ static void nvmet_execute_io_connect(struct nvmet_req *req) } status = nvmet_install_queue(ctrl, req); - if (status) { - /* pass back cntlid that had the issue of installing queue */ - req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); + if (status) goto out_ctrl_put; - } + + /* pass back cntlid for successful completion */ + req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); From b71df12605cabab47d58bd926badaf4130280e4d Mon Sep 17 00:00:00 2001 From: Amit Engel Date: Sun, 8 Aug 2021 18:06:15 +0300 Subject: [PATCH 17/21] nvmet: avoid duplicate qid in connect cmd According to the NVMe specification, if the host sends a Connect command specifying a queue id which has already been created, a status value of NVME_SC_CMD_SEQ_ERROR is returned. Signed-off-by: Amit Engel Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 1 + drivers/nvme/target/fabrics-cmd.c | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index ac7210a3ea1c5..66d05eecc2a9c 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -802,6 +802,7 @@ void nvmet_sq_destroy(struct nvmet_sq *sq) * controller teardown as a result of a keep-alive expiration. */ ctrl->reset_tbkas = true; + sq->ctrl->sqs[sq->qid] = NULL; nvmet_ctrl_put(ctrl); sq->ctrl = NULL; /* allows reusing the queue later */ } diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 8ef564c3b32c8..6f14da5cd08f1 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -111,12 +111,6 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) struct nvmet_ctrl *old; u16 ret; - old = cmpxchg(&req->sq->ctrl, NULL, ctrl); - if (old) { - pr_warn("queue already connected!\n"); - req->error_loc = offsetof(struct nvmf_connect_command, opcode); - return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; - } if (!sqsize) { pr_warn("queue size zero!\n"); req->error_loc = offsetof(struct nvmf_connect_command, sqsize); @@ -125,6 +119,19 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) goto err; } + if (ctrl->sqs[qid] != NULL) { + pr_warn("qid %u has already been created\n", qid); + req->error_loc = offsetof(struct nvmf_connect_command, qid); + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + + old = cmpxchg(&req->sq->ctrl, NULL, ctrl); + if (old) { + pr_warn("queue already connected!\n"); + req->error_loc = offsetof(struct nvmf_connect_command, opcode); + return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; + } + /* note: convert queue size from 0's-based value to 1's-based value */ nvmet_cq_setup(ctrl, req->cq, qid, sqsize + 1); nvmet_sq_setup(ctrl, req->sq, qid, sqsize + 1); @@ -139,6 +146,7 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) if (ret) { pr_err("failed to install queue %d cntlid %d ret %x\n", qid, ctrl->cntlid, ret); + ctrl->sqs[qid] = NULL; goto err; } } From e19e9f47f341cafcaf41253723f083223a4652a5 Mon Sep 17 00:00:00 2001 From: Amit Engel Date: Thu, 5 Aug 2021 18:02:51 +0300 Subject: [PATCH 18/21] nvmet: check that host sqsize does not exceed ctrl MQES Check that host sqsize is not greater-than Maximum Queue Entries Supported (MQES) value supported by the controller. Signed-off-by: Amit Engel Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fabrics-cmd.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 6f14da5cd08f1..7d0454cee9205 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -109,6 +109,7 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) u16 qid = le16_to_cpu(c->qid); u16 sqsize = le16_to_cpu(c->sqsize); struct nvmet_ctrl *old; + u16 mqes = NVME_CAP_MQES(ctrl->cap); u16 ret; if (!sqsize) { @@ -125,6 +126,14 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; } + if (sqsize > mqes) { + pr_warn("sqsize %u is larger than MQES supported %u cntlid %d\n", + sqsize, mqes, ctrl->cntlid); + req->error_loc = offsetof(struct nvmf_connect_command, sqsize); + req->cqe->result.u32 = IPO_IATTR_CONNECT_SQE(sqsize); + return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + } + old = cmpxchg(&req->sq->ctrl, NULL, ctrl); if (old) { pr_warn("queue already connected!\n"); From 0866200ed7fdfbfba0c033aad63ff407e5368570 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 25 May 2021 08:59:46 -0700 Subject: [PATCH 19/21] nvme: Have NVME_FABRICS select NVME_CORE instead of transport drivers Transport drivers need both core and fabrics modules, instead of selecting both, have the selection transitive such that NVME_FABRICS selects NVME_CORE and transport drivers select NVME_FABRICS. Suggested-by: Keith Busch Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/Kconfig | 4 +--- drivers/nvme/target/Kconfig | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index c3f3d77f1aacc..dc0450ca23a30 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -33,12 +33,12 @@ config NVME_HWMON in the system. config NVME_FABRICS + select NVME_CORE tristate config NVME_RDMA tristate "NVM Express over Fabrics RDMA host driver" depends on INFINIBAND && INFINIBAND_ADDR_TRANS && BLOCK - select NVME_CORE select NVME_FABRICS select SG_POOL help @@ -55,7 +55,6 @@ config NVME_FC tristate "NVM Express over Fabrics FC host driver" depends on BLOCK depends on HAS_DMA - select NVME_CORE select NVME_FABRICS select SG_POOL help @@ -72,7 +71,6 @@ config NVME_TCP tristate "NVM Express over Fabrics TCP host driver" depends on INET depends on BLOCK - select NVME_CORE select NVME_FABRICS select CRYPTO select CRYPTO_CRC32C diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 4be2ececbc45b..973561c93888f 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -31,7 +31,6 @@ config NVME_TARGET_PASSTHRU config NVME_TARGET_LOOP tristate "NVMe loopback device support" depends on NVME_TARGET - select NVME_CORE select NVME_FABRICS select SG_POOL help @@ -65,7 +64,6 @@ config NVME_TARGET_FC config NVME_TARGET_FCLOOP tristate "NVMe over Fabrics FC Transport Loopback Test driver" depends on NVME_TARGET - select NVME_CORE select NVME_FABRICS select SG_POOL depends on NVME_FC From 77979058dfcf4818abf7dd84423a7d66dafd8487 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 16 Aug 2021 09:24:52 -0700 Subject: [PATCH 20/21] nvme: remove nvm_ndev from ns Now that the lightnvm driver is removed, we don't need a pointer to it's now non-existent struct. Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/nvme.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 57d2ac00a6bd6..37c5ef5a33314 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -444,7 +444,6 @@ struct nvme_ns { u32 ana_grpid; #endif struct list_head siblings; - struct nvm_dev *ndev; struct kref kref; struct nvme_ns_head *head; From 9891668e43c8e9f2d0d50088b151edefc2e560e5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 14:45:29 +0200 Subject: [PATCH 21/21] nvme: remove the unused NVME_NS_* enum These values are unused now that the lightnvm support is gone. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch --- drivers/nvme/host/nvme.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 37c5ef5a33314..a2e1f298b217c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -47,11 +47,6 @@ extern struct workqueue_struct *nvme_wq; extern struct workqueue_struct *nvme_reset_wq; extern struct workqueue_struct *nvme_delete_wq; -enum { - NVME_NS_LBA = 0, - NVME_NS_LIGHTNVM = 1, -}; - /* * List of workarounds for devices that required behavior not specified in * the standard.