From 8f864c595bed20ef85fef3e7314212b73800d51d Mon Sep 17 00:00:00 2001 From: Hou Pu Date: Fri, 16 Apr 2021 10:45:21 +0800 Subject: [PATCH 1/6] nvmet: avoid queuing keep-alive timer if it is disabled Issue following command: nvme set-feature -f 0xf -v 0 /dev/nvme1n1 # disable keep-alive timer nvme admin-passthru -o 0x18 /dev/nvme1n1 # send keep-alive command will make keep-alive timer fired and thus delete the controller like below: [247459.907635] nvmet: ctrl 1 keep-alive timer (0 seconds) expired! [247459.930294] nvmet: ctrl 1 fatal error occurred! Avoid this by not queuing delayed keep-alive if it is disabled when keep-alive command is received from the admin queue. Signed-off-by: Hou Pu Tested-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index f4cc32674edd0..d2a26ff3f7b31 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -919,15 +919,21 @@ void nvmet_execute_async_event(struct nvmet_req *req) void nvmet_execute_keep_alive(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; + u16 status = 0; if (!nvmet_check_transfer_len(req, 0)) return; + if (!ctrl->kato) { + status = NVME_SC_KA_TIMEOUT_INVALID; + goto out; + } + pr_debug("ctrl %d update keep-alive timer for %d secs\n", ctrl->cntlid, ctrl->kato); - mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ); - nvmet_req_complete(req, 0); +out: + nvmet_req_complete(req, status); } u16 nvmet_parse_admin_cmd(struct nvmet_req *req) From a70b81bd4d9d2d6c05cfe6ef2a10bccc2e04357a Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 16 Apr 2021 13:46:20 +0200 Subject: [PATCH 2/6] nvme: sanitize KATO setting According to the NVMe base spec the KATO commands should be sent at half of the KATO interval, to properly account for round-trip times. As we now will only ever send one KATO command per connection we can easily use the recommended values. This also fixes a potential issue where the request timeout for the KATO command does not match the value in the connect command, which might be causing spurious connection drops from the target. Signed-off-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 17 ++++++++++++++--- drivers/nvme/host/fabrics.c | 4 +--- drivers/nvme/host/nvme.h | 1 - 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 40f08e6325ef0..0cb097cd6a8e6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1109,6 +1109,17 @@ void nvme_execute_passthru_rq(struct request *rq) } EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); +/* + * Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1: + * + * The host should send Keep Alive commands at half of the Keep Alive Timeout + * accounting for transport roundtrip times [..]. + */ +static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl) +{ + queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ / 2); +} + static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) { struct nvme_ctrl *ctrl = rq->end_io_data; @@ -1131,7 +1142,7 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) startka = true; spin_unlock_irqrestore(&ctrl->lock, flags); if (startka) - queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); + nvme_queue_keep_alive_work(ctrl); } static int nvme_keep_alive(struct nvme_ctrl *ctrl) @@ -1161,7 +1172,7 @@ static void nvme_keep_alive_work(struct work_struct *work) dev_dbg(ctrl->device, "reschedule traffic based keep-alive timer\n"); ctrl->comp_seen = false; - queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); + nvme_queue_keep_alive_work(ctrl); return; } @@ -1178,7 +1189,7 @@ static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) if (unlikely(ctrl->kato == 0)) return; - queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); + nvme_queue_keep_alive_work(ctrl); } void nvme_stop_keep_alive(struct nvme_ctrl *ctrl) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 604ab0e5a2adb..13c2747e3d00d 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -379,10 +379,8 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) /* * Set keep-alive timeout in seconds granularity (ms * 1000) - * and add a grace period for controller kato enforcement */ - cmd.connect.kato = ctrl->kato ? - cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000) : 0; + cmd.connect.kato = cpu_to_le32(ctrl->kato * 1000); if (ctrl->opts->disable_sqflow) cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index c6102ce83bb40..49276186d5bd6 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -27,7 +27,6 @@ extern unsigned int admin_timeout; #define NVME_ADMIN_TIMEOUT (admin_timeout * HZ) #define NVME_DEFAULT_KATO 5 -#define NVME_KATO_GRACE 10 #ifdef CONFIG_ARCH_NO_SG_CHAIN #define NVME_INLINE_SG_CNT 0 From 74c22990f08c9f922f775939a4ebc814ca2c49eb Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 16 Apr 2021 13:46:21 +0200 Subject: [PATCH 3/6] nvme: add 'kato' sysfs attribute Add a 'kato' controller sysfs attribute to display the current keep-alive timeout value (if any). This allows userspace to identify persistent discovery controllers, as these will have a non-zero KATO value. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0cb097cd6a8e6..d6fd44774e9f6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3172,6 +3172,7 @@ nvme_show_int_function(cntlid); nvme_show_int_function(numa_node); nvme_show_int_function(queue_count); nvme_show_int_function(sqsize); +nvme_show_int_function(kato); static ssize_t nvme_sysfs_delete(struct device *dev, struct device_attribute *attr, const char *buf, @@ -3369,6 +3370,7 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_ctrl_loss_tmo.attr, &dev_attr_reconnect_delay.attr, &dev_attr_fast_io_fail_tmo.attr, + &dev_attr_kato.attr, NULL }; From 53fe2a30bc168db9700e00206d991ff934973cf1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Apr 2021 11:46:12 +0200 Subject: [PATCH 4/6] nvme: do not try to reconfigure APST when the controller is not live Do not call nvme_configure_apst when the controller is not live, given that nvme_configure_apst will fail due the lack of an admin queue when the controller is being torn down and nvme_set_latency_tolerance is called from dev_pm_qos_hide_latency_tolerance. Fixes: 510a405d945b("nvme: fix memory leak for power latency tolerance") Reported-by: Peng Liu Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch --- drivers/nvme/host/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d6fd44774e9f6..11d343c420b69 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2321,7 +2321,8 @@ static void nvme_set_latency_tolerance(struct device *dev, s32 val) if (ctrl->ps_max_latency_us != latency) { ctrl->ps_max_latency_us = latency; - nvme_configure_apst(ctrl); + if (ctrl->state == NVME_CTRL_LIVE) + nvme_configure_apst(ctrl); } } From 60df5de9b0532aff59a00475b57c265b4a3620e1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Apr 2021 08:47:44 +0200 Subject: [PATCH 5/6] nvme: cleanup nvme_configure_apst Remove a level of indentation from the main code implementating the table search by using a goto for the APST not supported case. Also move the main comment above the function. Signed-off-by: Christoph Hellwig Reviewed-by: Niklas Cassel --- drivers/nvme/host/core.c | 149 ++++++++++++++++++--------------------- 1 file changed, 69 insertions(+), 80 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 11d343c420b69..b905f91f14eba 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2181,28 +2181,28 @@ static int nvme_configure_acre(struct nvme_ctrl *ctrl) return ret; } +/* + * APST (Autonomous Power State Transition) lets us program a table of power + * state transitions that the controller will perform automatically. + * We configure it with a simple heuristic: we are willing to spend at most 2% + * of the time transitioning between power states. Therefore, when running in + * any given state, we will enter the next lower-power non-operational state + * after waiting 50 * (enlat + exlat) microseconds, as long as that state's exit + * latency is under the requested maximum latency. + * + * We will not autonomously enter any non-operational state for which the total + * latency exceeds ps_max_latency_us. + * + * Users can set ps_max_latency_us to zero to turn off APST. + */ static int nvme_configure_apst(struct nvme_ctrl *ctrl) { - /* - * APST (Autonomous Power State Transition) lets us program a - * table of power state transitions that the controller will - * perform automatically. We configure it with a simple - * heuristic: we are willing to spend at most 2% of the time - * transitioning between power states. Therefore, when running - * in any given state, we will enter the next lower-power - * non-operational state after waiting 50 * (enlat + exlat) - * microseconds, as long as that state's exit latency is under - * the requested maximum latency. - * - * We will not autonomously enter any non-operational state for - * which the total latency exceeds ps_max_latency_us. Users - * can set ps_max_latency_us to zero to turn off APST. - */ - - unsigned apste; struct nvme_feat_auto_pst *table; + unsigned apste = 0; u64 max_lat_us = 0; + __le64 target = 0; int max_ps = -1; + int state; int ret; /* @@ -2223,83 +2223,72 @@ static int nvme_configure_apst(struct nvme_ctrl *ctrl) if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) { /* Turn off APST. */ - apste = 0; dev_dbg(ctrl->device, "APST disabled\n"); - } else { - __le64 target = cpu_to_le64(0); - int state; - - /* - * Walk through all states from lowest- to highest-power. - * According to the spec, lower-numbered states use more - * power. NPSS, despite the name, is the index of the - * lowest-power state, not the number of states. - */ - for (state = (int)ctrl->npss; state >= 0; state--) { - u64 total_latency_us, exit_latency_us, transition_ms; - - if (target) - table->entries[state] = target; - - /* - * Don't allow transitions to the deepest state - * if it's quirked off. - */ - if (state == ctrl->npss && - (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) - continue; - - /* - * Is this state a useful non-operational state for - * higher-power states to autonomously transition to? - */ - if (!(ctrl->psd[state].flags & - NVME_PS_FLAGS_NON_OP_STATE)) - continue; - - exit_latency_us = - (u64)le32_to_cpu(ctrl->psd[state].exit_lat); - if (exit_latency_us > ctrl->ps_max_latency_us) - continue; + goto done; + } - total_latency_us = - exit_latency_us + - le32_to_cpu(ctrl->psd[state].entry_lat); + /* + * Walk through all states from lowest- to highest-power. + * According to the spec, lower-numbered states use more power. NPSS, + * despite the name, is the index of the lowest-power state, not the + * number of states. + */ + for (state = (int)ctrl->npss; state >= 0; state--) { + u64 total_latency_us, exit_latency_us, transition_ms; - /* - * This state is good. Use it as the APST idle - * target for higher power states. - */ - transition_ms = total_latency_us + 19; - do_div(transition_ms, 20); - if (transition_ms > (1 << 24) - 1) - transition_ms = (1 << 24) - 1; + if (target) + table->entries[state] = target; - target = cpu_to_le64((state << 3) | - (transition_ms << 8)); + /* + * Don't allow transitions to the deepest state if it's quirked + * off. + */ + if (state == ctrl->npss && + (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) + continue; - if (max_ps == -1) - max_ps = state; + /* + * Is this state a useful non-operational state for higher-power + * states to autonomously transition to? + */ + if (!(ctrl->psd[state].flags & NVME_PS_FLAGS_NON_OP_STATE)) + continue; - if (total_latency_us > max_lat_us) - max_lat_us = total_latency_us; - } + exit_latency_us = (u64)le32_to_cpu(ctrl->psd[state].exit_lat); + if (exit_latency_us > ctrl->ps_max_latency_us) + continue; - apste = 1; + total_latency_us = exit_latency_us + + le32_to_cpu(ctrl->psd[state].entry_lat); - if (max_ps == -1) { - dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n"); - } else { - dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n", - max_ps, max_lat_us, (int)sizeof(*table), table); - } + /* + * This state is good. Use it as the APST idle target for + * higher power states. + */ + transition_ms = total_latency_us + 19; + do_div(transition_ms, 20); + if (transition_ms > (1 << 24) - 1) + transition_ms = (1 << 24) - 1; + + target = cpu_to_le64((state << 3) | (transition_ms << 8)); + if (max_ps == -1) + max_ps = state; + if (total_latency_us > max_lat_us) + max_lat_us = total_latency_us; } + if (max_ps == -1) + dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n"); + else + dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n", + max_ps, max_lat_us, (int)sizeof(*table), table); + apste = 1; + +done: ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste, table, sizeof(*table), NULL); if (ret) dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret); - kfree(table); return ret; } From 2637baed78010eeaae274feb5b99ce90933fadfb Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Wed, 21 Apr 2021 16:45:04 +0900 Subject: [PATCH 6/6] nvme: introduce generic per-namespace chardev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Userspace has not been allowed to I/O to device that's failed to be initialized. This patch introduces generic per-namespace character device to allow userspace to I/O regardless the block device is there or not. The chardev naming convention will similar to the existing blkdev naming, using a ng prefix instead of nvme, i.e. - /dev/ngXnY It also supports multipath which means it will not expose chardev for the hidden namespace blkdevs (e.g., nvmeXcYnZ). If /dev/ngXnY is created for a ns_head, then I/O request will be routed to a specific controller selected by the iopolicy of the subsystem. Signed-off-by: Minwoo Im Signed-off-by: Javier González Reviewed-by: Keith Busch Tested-by: Kanchan Joshi Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 87 +++++++++++++++++++++++++++++++++++ drivers/nvme/host/ioctl.c | 38 ++++++++++++--- drivers/nvme/host/multipath.c | 51 ++++++++++++++++++-- drivers/nvme/host/nvme.h | 13 ++++++ 4 files changed, 180 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b905f91f14eba..2f45e8fcdd7cb 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -89,6 +89,10 @@ static dev_t nvme_ctrl_base_chr_devt; static struct class *nvme_class; static struct class *nvme_subsys_class; +static DEFINE_IDA(nvme_ns_chr_minor_ida); +static dev_t nvme_ns_chr_devt; +static struct class *nvme_ns_chr_class; + static void nvme_put_subsystem(struct nvme_subsystem *subsys); static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, unsigned nsid); @@ -3429,6 +3433,66 @@ static int __nvme_check_ids(struct nvme_subsystem *subsys, return 0; } +void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device) +{ + cdev_device_del(cdev, cdev_device); + ida_simple_remove(&nvme_ns_chr_minor_ida, MINOR(cdev_device->devt)); +} + +int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device, + const struct file_operations *fops, struct module *owner) +{ + int minor, ret; + + minor = ida_simple_get(&nvme_ns_chr_minor_ida, 0, 0, GFP_KERNEL); + if (minor < 0) + return minor; + cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor); + cdev_device->class = nvme_ns_chr_class; + device_initialize(cdev_device); + cdev_init(cdev, fops); + cdev->owner = owner; + ret = cdev_device_add(cdev, cdev_device); + if (ret) + ida_simple_remove(&nvme_ns_chr_minor_ida, minor); + return ret; +} + +static int nvme_ns_chr_open(struct inode *inode, struct file *file) +{ + return nvme_ns_open(container_of(inode->i_cdev, struct nvme_ns, cdev)); +} + +static int nvme_ns_chr_release(struct inode *inode, struct file *file) +{ + nvme_ns_release(container_of(inode->i_cdev, struct nvme_ns, cdev)); + return 0; +} + +static const struct file_operations nvme_ns_chr_fops = { + .owner = THIS_MODULE, + .open = nvme_ns_chr_open, + .release = nvme_ns_chr_release, + .unlocked_ioctl = nvme_ns_chr_ioctl, + .compat_ioctl = compat_ptr_ioctl, +}; + +static int nvme_add_ns_cdev(struct nvme_ns *ns) +{ + int ret; + + ns->cdev_device.parent = ns->ctrl->device; + ret = dev_set_name(&ns->cdev_device, "ng%dn%d", + ns->ctrl->instance, ns->head->instance); + if (ret) + return ret; + ret = nvme_cdev_add(&ns->cdev, &ns->cdev_device, &nvme_ns_chr_fops, + ns->ctrl->ops->module); + if (ret) + kfree_const(ns->cdev_device.kobj.name); + return ret; +} + static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, unsigned nsid, struct nvme_ns_ids *ids) { @@ -3630,6 +3694,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, nvme_get_ctrl(ctrl); device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups); + if (!nvme_ns_head_multipath(ns->head)) + nvme_add_ns_cdev(ns); nvme_mpath_add_disk(ns, id); nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name); @@ -3674,6 +3740,8 @@ static void nvme_ns_remove(struct nvme_ns *ns) synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */ if (ns->disk->flags & GENHD_FL_UP) { + if (!nvme_ns_head_multipath(ns->head)) + nvme_cdev_del(&ns->cdev, &ns->cdev_device); del_gendisk(ns->disk); blk_cleanup_queue(ns->queue); if (blk_get_integrity(ns->disk)) @@ -4464,8 +4532,24 @@ static int __init nvme_core_init(void) result = PTR_ERR(nvme_subsys_class); goto destroy_class; } + + result = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS, + "nvme-generic"); + if (result < 0) + goto destroy_subsys_class; + + nvme_ns_chr_class = class_create(THIS_MODULE, "nvme-generic"); + if (IS_ERR(nvme_ns_chr_class)) { + result = PTR_ERR(nvme_ns_chr_class); + goto unregister_generic_ns; + } + return 0; +unregister_generic_ns: + unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS); +destroy_subsys_class: + class_destroy(nvme_subsys_class); destroy_class: class_destroy(nvme_class); unregister_chrdev: @@ -4482,12 +4566,15 @@ static int __init nvme_core_init(void) static void __exit nvme_core_exit(void) { + class_destroy(nvme_ns_chr_class); class_destroy(nvme_subsys_class); class_destroy(nvme_class); + unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS); unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS); destroy_workqueue(nvme_delete_wq); destroy_workqueue(nvme_reset_wq); destroy_workqueue(nvme_wq); + ida_destroy(&nvme_ns_chr_minor_ida); ida_destroy(&nvme_instance_ida); } diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 8e05d65c9e934..502f8e4a2a1f0 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -346,15 +346,27 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, } } +static int __nvme_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *arg) +{ + if (is_ctrl_ioctl(cmd)) + return nvme_ctrl_ioctl(ns->ctrl, cmd, arg); + return nvme_ns_ioctl(ns, cmd, arg); +} + int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct nvme_ns *ns = bdev->bd_disk->private_data; - void __user *argp = (void __user *)arg; - if (is_ctrl_ioctl(cmd)) - return nvme_ctrl_ioctl(ns->ctrl, cmd, argp); - return nvme_ns_ioctl(ns, cmd, argp); + return __nvme_ioctl(ns, cmd, (void __user *)arg); +} + +long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct nvme_ns *ns = + container_of(file_inode(file)->i_cdev, struct nvme_ns, cdev); + + return __nvme_ioctl(ns, cmd, (void __user *)arg); } #ifdef CONFIG_NVME_MULTIPATH @@ -388,10 +400,24 @@ int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct nvme_ns_head *head = bdev->bd_disk->private_data; + void __user *argp = (void __user *)arg; + + if (is_ctrl_ioctl(cmd)) + return nvme_ns_head_ctrl_ioctl(head, cmd, argp); + return nvme_ns_head_ns_ioctl(head, cmd, argp); +} + +long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct cdev *cdev = file_inode(file)->i_cdev; + struct nvme_ns_head *head = + container_of(cdev, struct nvme_ns_head, cdev); + void __user *argp = (void __user *)arg; if (is_ctrl_ioctl(cmd)) - return nvme_ns_head_ctrl_ioctl(head, cmd, (void __user *)arg); - return nvme_ns_head_ns_ioctl(head, cmd, (void __user *)arg); + return nvme_ns_head_ctrl_ioctl(head, cmd, argp); + return nvme_ns_head_ns_ioctl(head, cmd, argp); } #endif /* CONFIG_NVME_MULTIPATH */ diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 68918ea1d3d09..0d0de3433f377 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -357,6 +357,48 @@ const struct block_device_operations nvme_ns_head_ops = { .pr_ops = &nvme_pr_ops, }; +static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev) +{ + return container_of(cdev, struct nvme_ns_head, cdev); +} + +static int nvme_ns_head_chr_open(struct inode *inode, struct file *file) +{ + if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev))) + return -ENXIO; + return 0; +} + +static int nvme_ns_head_chr_release(struct inode *inode, struct file *file) +{ + nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev)); + return 0; +} + +static const struct file_operations nvme_ns_head_chr_fops = { + .owner = THIS_MODULE, + .open = nvme_ns_head_chr_open, + .release = nvme_ns_head_chr_release, + .unlocked_ioctl = nvme_ns_head_chr_ioctl, + .compat_ioctl = compat_ptr_ioctl, +}; + +static int nvme_add_ns_head_cdev(struct nvme_ns_head *head) +{ + int ret; + + head->cdev_device.parent = &head->subsys->dev; + ret = dev_set_name(&head->cdev_device, "ng%dn%d", + head->subsys->instance, head->instance); + if (ret) + return ret; + ret = nvme_cdev_add(&head->cdev, &head->cdev_device, + &nvme_ns_head_chr_fops, THIS_MODULE); + if (ret) + kfree_const(head->cdev_device.kobj.name); + return ret; +} + static void nvme_requeue_work(struct work_struct *work) { struct nvme_ns_head *head = @@ -435,9 +477,11 @@ static void nvme_mpath_set_live(struct nvme_ns *ns) if (!head->disk) return; - if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) + if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { device_add_disk(&head->subsys->dev, head->disk, nvme_ns_id_attr_groups); + nvme_add_ns_head_cdev(head); + } mutex_lock(&head->lock); if (nvme_path_is_optimized(ns)) { @@ -714,8 +758,10 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) { if (!head->disk) return; - if (head->disk->flags & GENHD_FL_UP) + if (head->disk->flags & GENHD_FL_UP) { + nvme_cdev_del(&head->cdev, &head->cdev_device); del_gendisk(head->disk); + } blk_set_queue_dying(head->disk->queue); /* make sure all pending bios are cleaned up */ kblockd_schedule_work(&head->requeue_work); @@ -785,4 +831,3 @@ void nvme_mpath_uninit(struct nvme_ctrl *ctrl) kfree(ctrl->ana_log_buf); ctrl->ana_log_buf = NULL; } - diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 49276186d5bd6..773dde5b231da 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -412,6 +412,10 @@ struct nvme_ns_head { bool shared; int instance; struct nvme_effects_log *effects; + + struct cdev cdev; + struct device cdev_device; + struct gendisk *disk; #ifdef CONFIG_NVME_MULTIPATH struct bio_list requeue_list; @@ -464,6 +468,9 @@ struct nvme_ns { #define NVME_NS_ANA_PENDING 2 #define NVME_NS_FORCE_RO 3 + struct cdev cdev; + struct device cdev_device; + struct nvme_fault_inject fault_inject; }; @@ -658,10 +665,16 @@ void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx); bool nvme_tryget_ns_head(struct nvme_ns_head *head); void nvme_put_ns_head(struct nvme_ns_head *head); struct nvme_ctrl *nvme_find_get_live_ctrl(struct nvme_subsystem *subsys); +int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device, + const struct file_operations *fops, struct module *owner); +void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device); int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); +long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); +long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, + unsigned long arg); long nvme_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);