From 98e5440271172ca5c654784a31e46453464a3ca9 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 30 Sep 2019 16:00:40 -0700 Subject: [PATCH 01/32] block: Fix three kernel-doc warnings Fix the following kernel-doc warnings: block/t10-pi.c:242: warning: Function parameter or member 'rq' not described in 't10_pi_type3_prepare' block/t10-pi.c:249: warning: Function parameter or member 'rq' not described in 't10_pi_type3_complete' block/t10-pi.c:249: warning: Function parameter or member 'nr_bytes' not described in 't10_pi_type3_complete' Cc: Max Gurtovoy Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Johannes Thumshirn Fixes: 54d4e6ab91eb ("block: centralize PI remapping logic to the block layer") Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/t10-pi.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/block/t10-pi.c b/block/t10-pi.c index 9803c7e0376ec..f4907d941f034 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -235,16 +235,12 @@ static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); } -/** - * Type 3 does not have a reference tag so no remapping is required. - */ +/* Type 3 does not have a reference tag so no remapping is required. */ static void t10_pi_type3_prepare(struct request *rq) { } -/** - * Type 3 does not have a reference tag so no remapping is required. - */ +/* Type 3 does not have a reference tag so no remapping is required. */ static void t10_pi_type3_complete(struct request *rq, unsigned int nr_bytes) { } From 1d200e9d6f635ae894993a7d0f1b9e0b6e522e3b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 30 Sep 2019 16:00:41 -0700 Subject: [PATCH 02/32] block: Fix writeback throttling W=1 compiler warnings Fix the following compiler warnings: In file included from ./include/linux/bitmap.h:9, from ./include/linux/cpumask.h:12, from ./arch/x86/include/asm/cpumask.h:5, from ./arch/x86/include/asm/msr.h:11, from ./arch/x86/include/asm/processor.h:21, from ./arch/x86/include/asm/cpufeature.h:5, from ./arch/x86/include/asm/thread_info.h:53, from ./include/linux/thread_info.h:38, from ./arch/x86/include/asm/preempt.h:7, from ./include/linux/preempt.h:78, from ./include/linux/spinlock.h:51, from ./include/linux/mmzone.h:8, from ./include/linux/gfp.h:6, from ./include/linux/mm.h:10, from ./include/linux/bvec.h:13, from ./include/linux/blk_types.h:10, from block/blk-wbt.c:23: In function 'strncpy', inlined from 'perf_trace_wbt_stat' at ./include/trace/events/wbt.h:15:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function 'strncpy', inlined from 'perf_trace_wbt_lat' at ./include/trace/events/wbt.h:58:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function 'strncpy', inlined from 'perf_trace_wbt_step' at ./include/trace/events/wbt.h:87:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function 'strncpy', inlined from 'perf_trace_wbt_timer' at ./include/trace/events/wbt.h:126:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination 
size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function 'strncpy', inlined from 'trace_event_raw_event_wbt_stat' at ./include/trace/events/wbt.h:15:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function 'strncpy', inlined from 'trace_event_raw_event_wbt_lat' at ./include/trace/events/wbt.h:58:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function 'strncpy', inlined from 'trace_event_raw_event_wbt_timer' at ./include/trace/events/wbt.h:126:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function 'strncpy', inlined from 'trace_event_raw_event_wbt_step' at ./include/trace/events/wbt.h:87:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Johannes Thumshirn Fixes: e34cbd307477 ("blk-wbt: add general throttling mechanism") # v4.10 Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/trace/events/wbt.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/include/trace/events/wbt.h b/include/trace/events/wbt.h index b048694070e2c..37342a13c9cb9 100644 --- a/include/trace/events/wbt.h +++ b/include/trace/events/wbt.h @@ -33,7 +33,8 @@ TRACE_EVENT(wbt_stat, ), TP_fast_assign( - strncpy(__entry->name, dev_name(bdi->dev), 32); + strlcpy(__entry->name, dev_name(bdi->dev), + ARRAY_SIZE(__entry->name)); __entry->rmean = stat[0].mean; __entry->rmin = stat[0].min; __entry->rmax = stat[0].max; @@ -67,7 +68,8 @@ TRACE_EVENT(wbt_lat, ), TP_fast_assign( - strncpy(__entry->name, dev_name(bdi->dev), 32); + strlcpy(__entry->name, dev_name(bdi->dev), + ARRAY_SIZE(__entry->name)); __entry->lat = div_u64(lat, 1000); ), @@ -103,7 +105,8 @@ TRACE_EVENT(wbt_step, ), TP_fast_assign( - strncpy(__entry->name, dev_name(bdi->dev), 32); + strlcpy(__entry->name, dev_name(bdi->dev), + ARRAY_SIZE(__entry->name)); __entry->msg = msg; __entry->step = step; __entry->window = div_u64(window, 1000); @@ -138,7 +141,8 @@ TRACE_EVENT(wbt_timer, ), TP_fast_assign( - strncpy(__entry->name, dev_name(bdi->dev), 32); + strlcpy(__entry->name, dev_name(bdi->dev), + ARRAY_SIZE(__entry->name)); __entry->status = status; __entry->step = step; __entry->inflight = inflight; From 9566256518de0520c964bdf23140eac324b136af Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 30 Sep 2019 16:00:42 -0700 Subject: [PATCH 03/32] block: Remove request_queue.nr_queues Commit 897bb0c7f1ea ("blk-mq: Use proper cpumask iterator"; v4.6) removed the last use of request_queue.nr_queues from outside blk_mq_init_allocated_queue(). Remove this member variable to make struct request_queue smaller. This patch does not change any functionality. 
Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++--- include/linux/blkdev.h | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index ec791156e9ccd..9d932939b576a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2876,9 +2876,9 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, /* init q->mq_kobj and sw queues' kobjects */ blk_mq_sysfs_init(q); - q->nr_queues = nr_hw_queues(set); - q->queue_hw_ctx = kcalloc_node(q->nr_queues, sizeof(*(q->queue_hw_ctx)), - GFP_KERNEL, set->numa_node); + q->queue_hw_ctx = kcalloc_node(nr_hw_queues(set), + sizeof(*(q->queue_hw_ctx)), GFP_KERNEL, + set->numa_node); if (!q->queue_hw_ctx) goto err_sys_init; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f3ea78b0c91cf..d4051acb92a17 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -411,7 +411,6 @@ struct request_queue { /* sw queues */ struct blk_mq_ctx __percpu *queue_ctx; - unsigned int nr_queues; unsigned int queue_depth; From bae85c156f619939ef6261f1bd4fabbe24361b50 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 30 Sep 2019 16:00:43 -0700 Subject: [PATCH 04/32] block: Remove "dying" checks from sysfs callbacks Block drivers must call del_gendisk() before blk_cleanup_queue(). del_gendisk() calls kobject_del() and kobject_del() waits until any ongoing sysfs callback functions have finished. In other words, the sysfs callback functions won't be called for a queue in the dying state. Hence remove the "dying" checks from the sysfs callback functions. Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 2 ++ block/blk-mq-sysfs.c | 16 ++++------------ block/blk-sysfs.c | 8 -------- 3 files changed, 6 insertions(+), 20 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index d5e668ec751b5..8b51d9ec8ae3e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -336,6 +336,8 @@ EXPORT_SYMBOL_GPL(blk_set_queue_dying); */ void blk_cleanup_queue(struct request_queue *q) { + WARN_ON_ONCE(blk_queue_registered(q)); + /* mark @q DYING, no new request or merges will be allowed afterwards */ mutex_lock(&q->sysfs_lock); blk_set_queue_dying(q); diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index a0d3ce30fa089..81a273b433298 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -74,10 +74,8 @@ static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr, if (!entry->show) return -EIO; - res = -ENOENT; mutex_lock(&q->sysfs_lock); - if (!blk_queue_dying(q)) - res = entry->show(ctx, page); + res = entry->show(ctx, page); mutex_unlock(&q->sysfs_lock); return res; } @@ -97,10 +95,8 @@ static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr, if (!entry->store) return -EIO; - res = -ENOENT; mutex_lock(&q->sysfs_lock); - if (!blk_queue_dying(q)) - res = entry->store(ctx, page, length); + res = entry->store(ctx, page, length); mutex_unlock(&q->sysfs_lock); return res; } @@ -120,10 +116,8 @@ static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, if (!entry->show) return -EIO; - res = -ENOENT; mutex_lock(&q->sysfs_lock); - if (!blk_queue_dying(q)) - res = entry->show(hctx, page); + res = entry->show(hctx, page); mutex_unlock(&q->sysfs_lock); return res; } @@ -144,10 +138,8 @@ static ssize_t blk_mq_hw_sysfs_store(struct kobject 
*kobj, if (!entry->store) return -EIO; - res = -ENOENT; mutex_lock(&q->sysfs_lock); - if (!blk_queue_dying(q)) - res = entry->store(hctx, page, length); + res = entry->store(hctx, page, length); mutex_unlock(&q->sysfs_lock); return res; } diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 46f5198be0173..fca9b158f4a09 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -801,10 +801,6 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) if (!entry->show) return -EIO; mutex_lock(&q->sysfs_lock); - if (blk_queue_dying(q)) { - mutex_unlock(&q->sysfs_lock); - return -ENOENT; - } res = entry->show(q, page); mutex_unlock(&q->sysfs_lock); return res; @@ -823,10 +819,6 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, q = container_of(kobj, struct request_queue, kobj); mutex_lock(&q->sysfs_lock); - if (blk_queue_dying(q)) { - mutex_unlock(&q->sysfs_lock); - return -ENOENT; - } res = entry->store(q, page, length); mutex_unlock(&q->sysfs_lock); return res; From 73f1c77e65117e8f44074402e7cf5f4934505bfb Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 30 Sep 2019 16:00:44 -0700 Subject: [PATCH 05/32] block: Reduce sysfs_lock locking inside blk_cleanup_queue() Since blk_cleanup_queue() is called after blk_unregister_queue() and since that last function removes all sysfs attributes, serializing any code in blk_cleanup_queue() against sysfs callback methods or against I/O scheduler changes is not necessary. Hence remove the sysfs_lock locking calls from the start of blk_cleanup_queue(). Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 8b51d9ec8ae3e..ae506ac2dd489 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -339,13 +339,11 @@ void blk_cleanup_queue(struct request_queue *q) WARN_ON_ONCE(blk_queue_registered(q)); /* mark @q DYING, no new request or merges will be allowed afterwards */ - mutex_lock(&q->sysfs_lock); blk_set_queue_dying(q); blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); blk_queue_flag_set(QUEUE_FLAG_DYING, q); - mutex_unlock(&q->sysfs_lock); /* * Drain all requests queued before DYING marking. Set DEAD flag to From 7a18312c739aeace7c8ea448d39a0313d5ad5d5d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 30 Sep 2019 16:00:45 -0700 Subject: [PATCH 06/32] block: Document all members of blk_mq_tag_set and blk_mq_queue_map The meaning of several member variables of these two data structures is nontrivial. Hence document all member variables using the kernel-doc syntax. Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 54 +++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 11 deletions(-) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 0bf056de5cc3f..a96b5cc957aba 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -76,6 +76,16 @@ struct blk_mq_hw_ctx { struct srcu_struct srcu[0]; }; +/** + * struct blk_mq_queue_map - ctx -> hctx mapping + * @mq_map: CPU ID to hardware queue index map. This is an array + * with nr_cpu_ids elements. Each element has a value in the range + * [@queue_offset, @queue_offset + @nr_queues). + * @nr_queues: Number of hardware queues to map CPU IDs onto. 
+ * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe + * driver to map each hardware queue type (enum hctx_type) onto a distinct + * set of hardware queues. + */ struct blk_mq_queue_map { unsigned int *mq_map; unsigned int nr_queues; @@ -90,23 +100,45 @@ enum hctx_type { HCTX_MAX_TYPES, }; +/** + * struct blk_mq_tag_set - tag set that can be shared between request queues + * @map: One or more ctx -> hctx mappings. One map exists for each + * hardware queue type (enum hctx_type) that the driver wishes + * to support. There are no restrictions on maps being of the + * same size, and it's perfectly legal to share maps between + * types. + * @nr_maps: Number of elements in the @map array. A number in the range + * [1, HCTX_MAX_TYPES]. + * @ops: Pointers to functions that implement block driver behavior. + * @nr_hw_queues: Number of hardware queues supported by the block driver that + * owns this data structure. + * @queue_depth: Number of tags per hardware queue, reserved tags included. + * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag + * allocations. + * @cmd_size: Number of additional bytes to allocate per request. The block + * driver owns these additional bytes. + * @numa_node: NUMA node the storage adapter has been connected to. + * @timeout: Request processing timeout in jiffies. + * @flags: Zero or more BLK_MQ_F_* flags. + * @driver_data: Pointer to data owned by the block driver that created this + * tag set. + * @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues + * elements. + * @tag_list_lock: Serializes tag_list accesses. + * @tag_list: List of the request queues that use this tag set. See also + * request_queue.tag_set_list. + */ struct blk_mq_tag_set { - /* - * map[] holds ctx -> hctx mappings, one map exists for each type - * that the driver wishes to support. There are no restrictions - * on maps being of the same size, and it's perfectly legal to - * share maps between types. - */ struct blk_mq_queue_map map[HCTX_MAX_TYPES]; - unsigned int nr_maps; /* nr entries in map[] */ + unsigned int nr_maps; const struct blk_mq_ops *ops; - unsigned int nr_hw_queues; /* nr hw queues across maps */ - unsigned int queue_depth; /* max hw supported */ + unsigned int nr_hw_queues; + unsigned int queue_depth; unsigned int reserved_tags; - unsigned int cmd_size; /* per-request extra data */ + unsigned int cmd_size; int numa_node; unsigned int timeout; - unsigned int flags; /* BLK_MQ_F_* */ + unsigned int flags; void *driver_data; struct blk_mq_tags **tags; From 27a46989a82c71028f2ba15a3f2c8f30451fda33 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Sep 2019 11:25:49 +0300 Subject: [PATCH 07/32] blk-mq: Inline status checkers blk_mq_request_completed() and blk_mq_request_started() are short, inline it. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 ------------ block/blk-mq.h | 9 --------- include/linux/blk-mq.h | 20 ++++++++++++++++++-- 3 files changed, 18 insertions(+), 23 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 9d932939b576a..268a95e899b7f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -663,18 +663,6 @@ bool blk_mq_complete_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_complete_request); -int blk_mq_request_started(struct request *rq) -{ - return blk_mq_rq_state(rq) != MQ_RQ_IDLE; -} -EXPORT_SYMBOL_GPL(blk_mq_request_started); - -int blk_mq_request_completed(struct request *rq) -{ - return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE; -} -EXPORT_SYMBOL_GPL(blk_mq_request_completed); - void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; diff --git a/block/blk-mq.h b/block/blk-mq.h index 32c62c64e6c2b..eaaca8fc1c287 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -128,15 +128,6 @@ extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); void blk_mq_release(struct request_queue *q); -/** - * blk_mq_rq_state() - read the current MQ_RQ_* state of a request - * @rq: target request. - */ -static inline enum mq_rq_state blk_mq_rq_state(struct request *rq) -{ - return READ_ONCE(rq->state); -} - static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, unsigned int cpu) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index a96b5cc957aba..e0fce93ac127d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -333,9 +333,25 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag) return unique_tag & BLK_MQ_UNIQUE_TAG_MASK; } +/** + * blk_mq_rq_state() - read the current MQ_RQ_* state of a request + * @rq: target request. + */ +static inline enum mq_rq_state blk_mq_rq_state(struct request *rq) +{ + return READ_ONCE(rq->state); +} + +static inline int blk_mq_request_started(struct request *rq) +{ + return blk_mq_rq_state(rq) != MQ_RQ_IDLE; +} + +static inline int blk_mq_request_completed(struct request *rq) +{ + return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE; +} -int blk_mq_request_started(struct request *rq); -int blk_mq_request_completed(struct request *rq); void blk_mq_start_request(struct request *rq); void blk_mq_end_request(struct request *rq, blk_status_t error); void __blk_mq_end_request(struct request *rq, blk_status_t error); From bb4e6b149103c285aeeba43a8141ea3b7009c0fa Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Sep 2019 21:55:33 +0300 Subject: [PATCH 08/32] blk-mq: Reuse callback in blk_mq_in_flight*() Reuse a more generic callback in both blk_mq_in_flight() and blk_mq_in_flight_rw(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- block/blk-mq.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 268a95e899b7f..19560a16a7744 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -102,11 +102,8 @@ static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, { struct mq_inflight *mi = priv; - /* - * index[0] counts the specific partition that was asked for. 
- */ if (rq->part == mi->part) - mi->inflight[0]++; + mi->inflight[rq_data_dir(rq)]++; return true; } @@ -119,19 +116,7 @@ unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part) inflight[0] = inflight[1] = 0; blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); - return inflight[0]; -} - -static bool blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx, - struct request *rq, void *priv, - bool reserved) -{ - struct mq_inflight *mi = priv; - - if (rq->part == mi->part) - mi->inflight[rq_data_dir(rq)]++; - - return true; + return inflight[0] + inflight[1]; } void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, @@ -140,7 +125,7 @@ void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, struct mq_inflight mi = { .part = part, .inflight = inflight, }; inflight[0] = inflight[1] = 0; - blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi); + blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); } void blk_freeze_queue_start(struct request_queue *q) From a2e80f6f044526e00a788aa8e1ed72b1911f4534 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Sep 2019 21:55:34 +0300 Subject: [PATCH 09/32] blk-mq: Embed counters into struct mq_inflight Store inflight counters immediately in struct mq_inflight. That's type-safer and removes extra indirection. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- block/blk-mq.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 19560a16a7744..8538dc415499e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -93,7 +93,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, struct mq_inflight { struct hd_struct *part; - unsigned int *inflight; + unsigned int inflight[2]; }; static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, @@ -110,22 +110,21 @@ static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part) { - unsigned inflight[2]; - struct mq_inflight mi = { .part = part, .inflight = inflight, }; + struct mq_inflight mi = { .part = part }; - inflight[0] = inflight[1] = 0; blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); - return inflight[0] + inflight[1]; + return mi.inflight[0] + mi.inflight[1]; } void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]) { - struct mq_inflight mi = { .part = part, .inflight = inflight, }; + struct mq_inflight mi = { .part = part }; - inflight[0] = inflight[1] = 0; blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); + inflight[0] = mi.inflight[0]; + inflight[1] = mi.inflight[1]; } void blk_freeze_queue_start(struct request_queue *q) From 8148f0b5647a831c5d94b59da240c8e76dbacae9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 8 Oct 2019 00:16:51 +0300 Subject: [PATCH 10/32] blk-stat: Optimise blk_stat_add() blk_stat_add() calls {get,put}_cpu_ptr() in a loop, which entails overhead of disabling/enabling preemption. The loop is under RCU (i.e.short) anyway, so do get_cpu() in advance. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- block/blk-stat.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/block/blk-stat.c b/block/blk-stat.c index 940f15d600f8a..7da302ff88d0d 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -53,7 +53,7 @@ void blk_stat_add(struct request *rq, u64 now) struct request_queue *q = rq->q; struct blk_stat_callback *cb; struct blk_rq_stat *stat; - int bucket; + int bucket, cpu; u64 value; value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0; @@ -61,6 +61,7 @@ void blk_stat_add(struct request *rq, u64 now) blk_throtl_stat_add(rq, value); rcu_read_lock(); + cpu = get_cpu(); list_for_each_entry_rcu(cb, &q->stats->callbacks, list) { if (!blk_stat_is_active(cb)) continue; @@ -69,10 +70,10 @@ void blk_stat_add(struct request *rq, u64 now) if (bucket < 0) continue; - stat = &get_cpu_ptr(cb->cpu_stat)[bucket]; + stat = &per_cpu_ptr(cb->cpu_stat, cpu)[bucket]; blk_rq_stat_add(stat, value); - put_cpu_ptr(cb->cpu_stat); } + put_cpu(); rcu_read_unlock(); } From 48d9b0d43105e0da2b7c135eedd24e51234fb5e4 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 10 Oct 2019 17:36:26 -0600 Subject: [PATCH 11/32] block: account statistics for passthrough requests Presently, passthrough requests are not accounted for because blk_do_io_stat() expressly rejects them. Based on some digging in the history, this doesn't seem like a concious decision but one that evolved from the change from blk_fs_request() to blk_rq_is_passthrough(). To support this, call blk_account_io_start() in blk_execute_rq_nowait() and remove the passthrough check in blk_do_io_stat(). Link: https://lore.kernel.org/linux-block/20191010100526.GA27209@lst.de/ Signed-off-by: Logan Gunthorpe Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-exec.c | 2 ++ block/blk.h | 7 ++----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/block/blk-exec.c b/block/blk-exec.c index 1db44ca0f4a6d..e20a852ae432d 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -55,6 +55,8 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, rq->rq_disk = bd_disk; rq->end_io = done; + blk_account_io_start(rq, true); + /* * don't check dying flag for MQ because the request won't * be reused after dying flag is set diff --git a/block/blk.h b/block/blk.h index 47fba9362e60d..2bea40180b6fb 100644 --- a/block/blk.h +++ b/block/blk.h @@ -242,14 +242,11 @@ int blk_dev_init(void); * Contribute to IO statistics IFF: * * a) it's attached to a gendisk, and - * b) the queue had IO stats enabled when this request was started, and - * c) it's a file system request + * b) the queue had IO stats enabled when this request was started */ static inline bool blk_do_io_stat(struct request *rq) { - return rq->rq_disk && - (rq->rq_flags & RQF_IO_STAT) && - !blk_rq_is_passthrough(rq); + return rq->rq_disk && (rq->rq_flags & RQF_IO_STAT); } static inline void req_set_nomerge(struct request_queue *q, struct request *req) From a9a808084d6ac015ac68b4223ac5f6ef31f8076e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 25 Oct 2019 09:50:08 -0700 Subject: [PATCH 12/32] block: Remove the synchronize_rcu() call from __blk_mq_update_nr_hw_queues() Since the blk_mq_{,un}freeze_queue() calls in __blk_mq_update_nr_hw_queues() already serialize __blk_mq_update_nr_hw_queues() against blk_mq_queue_tag_busy_iter(), the synchronize_rcu() call in __blk_mq_update_nr_hw_queues() is not necessary. Hence remove it. 
Note: the synchronize_rcu() call in __blk_mq_update_nr_hw_queues() was introduced by commit f5bbbbe4d635 ("blk-mq: sync the update nr_hw_queues with blk_mq_queue_tag_busy_iter"). Commit 530ca2c9bd69 ("blk-mq: Allow blocking queue tag iter callbacks") removed the rcu_read_{,un}lock() calls that correspond to the synchronize_rcu() call in __blk_mq_update_nr_hw_queues(). Reviewed-by: Ming Lei Cc: Jianchao Wang Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 8538dc415499e..7528678ef41fc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3242,10 +3242,6 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_freeze_queue(q); - /* - * Sync with blk_mq_queue_tag_busy_iter. - */ - synchronize_rcu(); /* * Switch IO scheduler to 'none', cleaning up the data associated * with the previous scheduler. We will switch back once we are done From ac0d6b926e741f328b23c8af0134312af7c032d9 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 25 Oct 2019 09:50:09 -0700 Subject: [PATCH 13/32] block: Reduce the amount of memory required per request queue Instead of always allocating at least nr_cpu_ids hardware queues per request queue, reallocate q->queue_hw_ctx if it has to grow. This patch improves behavior that was introduced by commit 868f2f0b7206 ("blk-mq: dynamic h/w context count"). Cc: Keith Busch Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 7528678ef41fc..ba09cda49953f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2761,6 +2761,23 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, int i, j, end; struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; + if (q->nr_hw_queues < set->nr_hw_queues) { + struct blk_mq_hw_ctx **new_hctxs; + + new_hctxs = kcalloc_node(set->nr_hw_queues, + sizeof(*new_hctxs), GFP_KERNEL, + set->numa_node); + if (!new_hctxs) + return; + if (hctxs) + memcpy(new_hctxs, hctxs, q->nr_hw_queues * + sizeof(*hctxs)); + q->queue_hw_ctx = new_hctxs; + q->nr_hw_queues = set->nr_hw_queues; + kfree(hctxs); + hctxs = new_hctxs; + } + /* protect against switching io scheduler */ mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { @@ -2848,12 +2865,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, /* init q->mq_kobj and sw queues' kobjects */ blk_mq_sysfs_init(q); - q->queue_hw_ctx = kcalloc_node(nr_hw_queues(set), - sizeof(*(q->queue_hw_ctx)), GFP_KERNEL, - set->numa_node); - if (!q->queue_hw_ctx) - goto err_sys_init; - INIT_LIST_HEAD(&q->unused_hctx_list); spin_lock_init(&q->unused_hctx_lock); @@ -2901,7 +2912,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, err_hctxs: kfree(q->queue_hw_ctx); q->nr_hw_queues = 0; -err_sys_init: blk_mq_sysfs_deinit(q); err_poll: blk_stat_free_callback(q->poll_cb); From f7e76dbc24df695f1b8e88ed3201be22215ec969 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 25 Oct 2019 09:50:10 -0700 Subject: [PATCH 14/32] block: Reduce the amount of memory used for tag sets Instead of allocating an array of size nr_cpu_ids for set->tags, allocate an array of size 
set->nr_hw_queues. This patch improves behavior that was introduced by commit 868f2f0b7206 ("blk-mq: dynamic h/w context count"). Reallocating tag sets from inside __blk_mq_update_nr_hw_queues() is safe because: - All request queues that share the tag sets are frozen before the tag sets are reallocated. - blk_mq_queue_tag_busy_iter() holds q->q_usage_counter while active and hence is serialized against __blk_mq_update_nr_hw_queues(). Cc: Keith Busch Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq.c | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index ba09cda49953f..df41b2d162615 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2833,19 +2833,6 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, mutex_unlock(&q->sysfs_lock); } -/* - * Maximum number of hardware queues we support. For single sets, we'll never - * have more than the CPUs (software queues). For multiple sets, the tag_set - * user may have set ->nr_hw_queues larger. - */ -static unsigned int nr_hw_queues(struct blk_mq_tag_set *set) -{ - if (set->nr_maps == 1) - return nr_cpu_ids; - - return max(set->nr_hw_queues, nr_cpu_ids); -} - struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q, bool elevator_init) @@ -3012,6 +2999,29 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) } } +static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, + int cur_nr_hw_queues, int new_nr_hw_queues) +{ + struct blk_mq_tags **new_tags; + + if (cur_nr_hw_queues >= new_nr_hw_queues) + return 0; + + new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *), + GFP_KERNEL, set->numa_node); + if (!new_tags) + return -ENOMEM; + + if (set->tags) + memcpy(new_tags, set->tags, cur_nr_hw_queues * + sizeof(*set->tags)); + kfree(set->tags); + set->tags = new_tags; + set->nr_hw_queues = new_nr_hw_queues; + + return 0; +} + /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. 
May adjust the @@ -3065,9 +3075,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) set->nr_hw_queues = nr_cpu_ids; - set->tags = kcalloc_node(nr_hw_queues(set), sizeof(struct blk_mq_tags *), - GFP_KERNEL, set->numa_node); - if (!set->tags) + if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0) return -ENOMEM; ret = -ENOMEM; @@ -3108,7 +3116,7 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i, j; - for (i = 0; i < nr_hw_queues(set); i++) + for (i = 0; i < set->nr_hw_queues; i++) blk_mq_free_map_and_requests(set, i); for (j = 0; j < set->nr_maps; j++) { @@ -3266,6 +3274,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, blk_mq_sysfs_unregister(q); } + if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) < + 0) + goto reregister; + prev_nr_hw_queues = set->nr_hw_queues; set->nr_hw_queues = nr_hw_queues; blk_mq_update_queue_map(set); @@ -3282,6 +3294,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, blk_mq_map_swqueue(q); } +reregister: list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_sysfs_register(q); blk_mq_debugfs_register_hctxs(q); From 993e4cdebb5a53bc87f21cdd34d1dc42225de43d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 24 Oct 2019 19:31:10 +0200 Subject: [PATCH 15/32] block: reorder bio::__bi_remaining for better packing Simple reordering of __bi_remaining can reduce bio size by 8 bytes that are now wasted on padding (measured on x86_64): struct bio { struct bio * bi_next; /* 0 8 */ struct gendisk * bi_disk; /* 8 8 */ unsigned int bi_opf; /* 16 4 */ short unsigned int bi_flags; /* 20 2 */ short unsigned int bi_ioprio; /* 22 2 */ short unsigned int bi_write_hint; /* 24 2 */ blk_status_t bi_status; /* 26 1 */ u8 bi_partno; /* 27 1 */ /* XXX 4 bytes hole, try to pack */ struct bvec_iter bi_iter; /* 32 24 */ /* XXX last struct has 4 bytes of padding */ atomic_t __bi_remaining; /* 56 4 */ /* XXX 4 bytes hole, try to pack */ [...] /* size: 104, cachelines: 2, members: 19 */ /* sum members: 96, holes: 2, sum holes: 8 */ /* paddings: 1, sum paddings: 4 */ /* last cacheline: 40 bytes */ }; Now becomes: struct bio { struct bio * bi_next; /* 0 8 */ struct gendisk * bi_disk; /* 8 8 */ unsigned int bi_opf; /* 16 4 */ short unsigned int bi_flags; /* 20 2 */ short unsigned int bi_ioprio; /* 22 2 */ short unsigned int bi_write_hint; /* 24 2 */ blk_status_t bi_status; /* 26 1 */ u8 bi_partno; /* 27 1 */ atomic_t __bi_remaining; /* 28 4 */ struct bvec_iter bi_iter; /* 32 24 */ /* XXX last struct has 4 bytes of padding */ [...] 
/* size: 96, cachelines: 2, members: 19 */ /* paddings: 1, sum paddings: 4 */ /* last cacheline: 32 bytes */ }; Signed-off-by: David Sterba Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d688b96d1d633..1e7eeec16458a 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -153,10 +153,10 @@ struct bio { unsigned short bi_write_hint; blk_status_t bi_status; u8 bi_partno; + atomic_t __bi_remaining; struct bvec_iter bi_iter; - atomic_t __bi_remaining; bio_end_io_t *bi_end_io; void *bi_private; From 1fead7182f381ab0ebab5eaf1a060a15550da994 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Fri, 25 Oct 2019 14:16:51 -0600 Subject: [PATCH 16/32] blk-mq: remove needless goto from blk_mq_get_driver_tag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The only usage of the label "done" is when (rq->tag != -1) at the beginning of the function. Rather than jumping to the label, we can just remove this label and execute the code at the "if". Besides that, the code that would be executed after the label "done" is the return of the logical expression (rq->tag != -1) but since we are already inside the if, we know that this is true. Remove the label and replace the goto with the proper result of the label. Signed-off-by: André Almeida Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index df41b2d162615..c0f3357a20506 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1036,7 +1036,7 @@ bool blk_mq_get_driver_tag(struct request *rq) bool shared; if (rq->tag != -1) - goto done; + return true; if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) data.flags |= BLK_MQ_REQ_RESERVED; @@ -1051,7 +1051,6 @@ bool blk_mq_get_driver_tag(struct request *rq) data.hctx->tags->rqs[rq->tag] = rq; } -done: return rq->tag != -1; } From d386732bc142c63b9f676fed098bc06f91ee964a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Mon, 21 Oct 2019 21:07:24 -0300 Subject: [PATCH 17/32] blk-mq: fill header with kernel-doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Insert documentation for structs, enums and functions in the header file. Format existing and new comments at struct blk_mq_ops as kernel-doc comments. Reviewed-by: Bart Van Assche Signed-off-by: André Almeida Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 225 +++++++++++++++++++++++++++++++++-------- 1 file changed, 185 insertions(+), 40 deletions(-) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index e0fce93ac127d..4919d22e1aff2 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -10,74 +10,171 @@ struct blk_mq_tags; struct blk_flush_queue; /** - * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware block device + * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware + * block device */ struct blk_mq_hw_ctx { struct { + /** @lock: Protects the dispatch list. */ spinlock_t lock; + /** + * @dispatch: Used for requests that are ready to be + * dispatched to the hardware but for some reason (e.g. lack of + * resources) could not be sent to the hardware. As soon as the + * driver can send new requests, requests at this list will + * be sent first for a fairer dispatch. 
+ */ struct list_head dispatch; - unsigned long state; /* BLK_MQ_S_* flags */ + /** + * @state: BLK_MQ_S_* flags. Defines the state of the hw + * queue (active, scheduled to restart, stopped). + */ + unsigned long state; } ____cacheline_aligned_in_smp; + /** + * @run_work: Used for scheduling a hardware queue run at a later time. + */ struct delayed_work run_work; + /** @cpumask: Map of available CPUs where this hctx can run. */ cpumask_var_t cpumask; + /** + * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU + * selection from @cpumask. + */ int next_cpu; + /** + * @next_cpu_batch: Counter of how many works left in the batch before + * changing to the next CPU. + */ int next_cpu_batch; - unsigned long flags; /* BLK_MQ_F_* flags */ + /** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */ + unsigned long flags; + /** + * @sched_data: Pointer owned by the IO scheduler attached to a request + * queue. It's up to the IO scheduler how to use this pointer. + */ void *sched_data; + /** + * @queue: Pointer to the request queue that owns this hardware context. + */ struct request_queue *queue; + /** @fq: Queue of requests that need to perform a flush operation. */ struct blk_flush_queue *fq; + /** + * @driver_data: Pointer to data owned by the block driver that created + * this hctx + */ void *driver_data; + /** + * @ctx_map: Bitmap for each software queue. If bit is on, there is a + * pending request in that software queue. + */ struct sbitmap ctx_map; + /** + * @dispatch_from: Software queue to be used when no scheduler was + * selected. + */ struct blk_mq_ctx *dispatch_from; + /** + * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to + * decide if the hw_queue is busy using Exponential Weighted Moving + * Average algorithm. + */ unsigned int dispatch_busy; + /** @type: HCTX_TYPE_* flags. Type of hardware queue. */ unsigned short type; + /** @nr_ctx: Number of software queues. */ unsigned short nr_ctx; + /** @ctxs: Array of software queues. */ struct blk_mq_ctx **ctxs; + /** @dispatch_wait_lock: Lock for dispatch_wait queue. */ spinlock_t dispatch_wait_lock; + /** + * @dispatch_wait: Waitqueue to put requests when there is no tag + * available at the moment, to wait for another try in the future. + */ wait_queue_entry_t dispatch_wait; + + /** + * @wait_index: Index of next available dispatch_wait queue to insert + * requests. + */ atomic_t wait_index; + /** + * @tags: Tags owned by the block driver. A tag at this set is only + * assigned when a request is dispatched from a hardware queue. + */ struct blk_mq_tags *tags; + /** + * @sched_tags: Tags owned by I/O scheduler. If there is an I/O + * scheduler associated with a request queue, a tag is assigned when + * that request is allocated. Else, this member is not used. + */ struct blk_mq_tags *sched_tags; + /** @queued: Number of queued requests. */ unsigned long queued; + /** @run: Number of dispatched requests. */ unsigned long run; #define BLK_MQ_MAX_DISPATCH_ORDER 7 + /** @dispatched: Number of dispatch requests by queue. */ unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; + /** @numa_node: NUMA node the storage adapter has been connected to. */ unsigned int numa_node; + /** @queue_num: Index of this hardware queue. */ unsigned int queue_num; + /** + * @nr_active: Number of active requests. Only used when a tag set is + * shared across request queues. + */ atomic_t nr_active; + /** @cpuhp_dead: List to store request if some CPU die. 
*/ struct hlist_node cpuhp_dead; + /** @kobj: Kernel object for sysfs. */ struct kobject kobj; + /** @poll_considered: Count times blk_poll() was called. */ unsigned long poll_considered; + /** @poll_invoked: Count how many requests blk_poll() polled. */ unsigned long poll_invoked; + /** @poll_success: Count how many polled requests were completed. */ unsigned long poll_success; #ifdef CONFIG_BLK_DEBUG_FS + /** + * @debugfs_dir: debugfs directory for this hardware queue. Named + * as cpu. + */ struct dentry *debugfs_dir; + /** @sched_debugfs_dir: debugfs directory for the scheduler. */ struct dentry *sched_debugfs_dir; #endif + /** @hctx_list: List of all hardware queues. */ struct list_head hctx_list; - /* Must be the last member - see also blk_mq_hw_ctx_size(). */ + /** + * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is + * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also + * blk_mq_hw_ctx_size(). + */ struct srcu_struct srcu[0]; }; /** - * struct blk_mq_queue_map - ctx -> hctx mapping + * struct blk_mq_queue_map - Map software queues to hardware queues * @mq_map: CPU ID to hardware queue index map. This is an array * with nr_cpu_ids elements. Each element has a value in the range * [@queue_offset, @queue_offset + @nr_queues). @@ -92,10 +189,17 @@ struct blk_mq_queue_map { unsigned int queue_offset; }; +/** + * enum hctx_type - Type of hardware queue + * @HCTX_TYPE_DEFAULT: All I/O not otherwise accounted for. + * @HCTX_TYPE_READ: Just for READ I/O. + * @HCTX_TYPE_POLL: Polled I/O of any kind. + * @HCTX_MAX_TYPES: Number of types of hctx. + */ enum hctx_type { - HCTX_TYPE_DEFAULT, /* all I/O not otherwise accounted for */ - HCTX_TYPE_READ, /* just for READ I/O */ - HCTX_TYPE_POLL, /* polled I/O of any kind */ + HCTX_TYPE_DEFAULT, + HCTX_TYPE_READ, + HCTX_TYPE_POLL, HCTX_MAX_TYPES, }; @@ -147,6 +251,12 @@ struct blk_mq_tag_set { struct list_head tag_list; }; +/** + * struct blk_mq_queue_data - Data about a request inserted in a queue + * + * @rq: Request pointer. + * @last: If it is the last request in the queue. + */ struct blk_mq_queue_data { struct request *rq; bool last; @@ -174,81 +284,101 @@ typedef bool (busy_fn)(struct request_queue *); typedef void (complete_fn)(struct request *); typedef void (cleanup_rq_fn)(struct request *); - +/** + * struct blk_mq_ops - Callback functions that implements block driver + * behaviour. + */ struct blk_mq_ops { - /* - * Queue request + /** + * @queue_rq: Queue a new request from block IO. */ queue_rq_fn *queue_rq; - /* - * If a driver uses bd->last to judge when to submit requests to - * hardware, it must define this function. In case of errors that - * make us stop issuing further requests, this hook serves the + /** + * @commit_rqs: If a driver uses bd->last to judge when to submit + * requests to hardware, it must define this function. In case of errors + * that make us stop issuing further requests, this hook serves the * purpose of kicking the hardware (which the last request otherwise * would have done). */ commit_rqs_fn *commit_rqs; - /* - * Reserve budget before queue request, once .queue_rq is + /** + * @get_budget: Reserve budget before queue request, once .queue_rq is * run, it is driver's responsibility to release the * reserved budget. Also we have to handle failure case * of .get_budget for avoiding I/O deadlock. */ get_budget_fn *get_budget; + /** + * @put_budget: Release the reserved budget. 
+ */ put_budget_fn *put_budget; - /* - * Called on request timeout + /** + * @timeout: Called on request timeout. */ timeout_fn *timeout; - /* - * Called to poll for completion of a specific tag. + /** + * @poll: Called to poll for completion of a specific tag. */ poll_fn *poll; + /** + * @complete: Mark the request as complete. + */ complete_fn *complete; - /* - * Called when the block layer side of a hardware queue has been - * set up, allowing the driver to allocate/init matching structures. - * Ditto for exit/teardown. + /** + * @init_hctx: Called when the block layer side of a hardware queue has + * been set up, allowing the driver to allocate/init matching + * structures. */ init_hctx_fn *init_hctx; + /** + * @exit_hctx: Ditto for exit/teardown. + */ exit_hctx_fn *exit_hctx; - /* - * Called for every command allocated by the block layer to allow - * the driver to set up driver specific data. + /** + * @init_request: Called for every command allocated by the block layer + * to allow the driver to set up driver specific data. * * Tag greater than or equal to queue_depth is for setting up * flush request. - * - * Ditto for exit/teardown. */ init_request_fn *init_request; + /** + * @exit_request: Ditto for exit/teardown. + */ exit_request_fn *exit_request; - /* Called from inside blk_get_request() */ + + /** + * @initialize_rq_fn: Called from inside blk_get_request(). + */ void (*initialize_rq_fn)(struct request *rq); - /* - * Called before freeing one request which isn't completed yet, - * and usually for freeing the driver private data + /** + * @cleanup_rq: Called before freeing one request which isn't completed + * yet, and usually for freeing the driver private data. */ cleanup_rq_fn *cleanup_rq; - /* - * If set, returns whether or not this queue currently is busy + /** + * @busy: If set, returns whether or not this queue currently is busy. */ busy_fn *busy; + /** + * @map_queues: This allows drivers specify their own queue mapping by + * overriding the setup-time function that builds the mq_map. + */ map_queues_fn *map_queues; #ifdef CONFIG_BLK_DEBUG_FS - /* - * Used by the debugfs implementation to show driver-specific + /** + * @show_rq: Used by the debugfs implementation to show driver-specific * information about a request. */ void (*show_rq)(struct seq_file *m, struct request *rq); @@ -391,14 +521,29 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q); unsigned int blk_mq_rq_cpu(struct request *rq); -/* +/** + * blk_mq_rq_from_pdu - cast a PDU to a request + * @pdu: the PDU (Protocol Data Unit) to be casted + * + * Return: request + * * Driver command data is immediately after the request. So subtract request - * size to get back to the original request, add request size to get the PDU. + * size to get back to the original request. */ static inline struct request *blk_mq_rq_from_pdu(void *pdu) { return pdu - sizeof(struct request); } + +/** + * blk_mq_rq_to_pdu - cast a request to a PDU + * @rq: the request to be casted + * + * Return: pointer to the PDU + * + * Driver command data is immediately after the request. So add request to get + * the PDU. 
+ */ static inline void *blk_mq_rq_to_pdu(struct request *rq) { return rq + 1; From 9a7f12edf8848a9b355a86fc0183d7be932ef11e Mon Sep 17 00:00:00 2001 From: Eugene Syromiatnikov Date: Fri, 20 Sep 2019 17:58:21 +0200 Subject: [PATCH 18/32] fcntl: fix typo in RWH_WRITE_LIFE_NOT_SET r/w hint name According to commit message in the original commit c75b1d9421f8 ("fs: add fcntl() interface for setting/getting write life time hints"), as well as userspace library[1] and man page update[2], R/W hint constants are intended to have RWH_* prefix. However, RWF_WRITE_LIFE_NOT_SET retained "RWF_*" prefix used in the early versions of the proposed patch set[3]. Rename it and provide the old name as a synonym for the new one for backward compatibility. [1] https://github.com/axboe/fio/commit/bd553af6c849 [2] https://github.com/mkerrisk/man-pages/commit/580082a186fd [3] https://www.mail-archive.com/linux-block@vger.kernel.org/msg09638.html Fixes: c75b1d9421f8 ("fs: add fcntl() interface for setting/getting write life time hints") Acked-by: Song Liu Signed-off-by: Eugene Syromiatnikov Signed-off-by: Jens Axboe --- fs/fcntl.c | 2 +- include/uapi/linux/fcntl.h | 9 ++++++++- tools/include/uapi/linux/fcntl.h | 9 ++++++++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/fs/fcntl.c b/fs/fcntl.c index 3d40771e8e7cf..41b6438bd2d9a 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -261,7 +261,7 @@ static int f_getowner_uids(struct file *filp, unsigned long arg) static bool rw_hint_valid(enum rw_hint hint) { switch (hint) { - case RWF_WRITE_LIFE_NOT_SET: + case RWH_WRITE_LIFE_NOT_SET: case RWH_WRITE_LIFE_NONE: case RWH_WRITE_LIFE_SHORT: case RWH_WRITE_LIFE_MEDIUM: diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 1d338357df8ab..1f97b33c840e0 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -58,13 +58,20 @@ * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be * used to clear any hints previously set. */ -#define RWF_WRITE_LIFE_NOT_SET 0 +#define RWH_WRITE_LIFE_NOT_SET 0 #define RWH_WRITE_LIFE_NONE 1 #define RWH_WRITE_LIFE_SHORT 2 #define RWH_WRITE_LIFE_MEDIUM 3 #define RWH_WRITE_LIFE_LONG 4 #define RWH_WRITE_LIFE_EXTREME 5 +/* + * The originally introduced spelling is remained from the first + * versions of the patch set that introduced the feature, see commit + * v4.13-rc1~212^2~51. + */ +#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET + /* * Types of directory notifications that may be requested. */ diff --git a/tools/include/uapi/linux/fcntl.h b/tools/include/uapi/linux/fcntl.h index 1d338357df8ab..1f97b33c840e0 100644 --- a/tools/include/uapi/linux/fcntl.h +++ b/tools/include/uapi/linux/fcntl.h @@ -58,13 +58,20 @@ * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be * used to clear any hints previously set. */ -#define RWF_WRITE_LIFE_NOT_SET 0 +#define RWH_WRITE_LIFE_NOT_SET 0 #define RWH_WRITE_LIFE_NONE 1 #define RWH_WRITE_LIFE_SHORT 2 #define RWH_WRITE_LIFE_MEDIUM 3 #define RWH_WRITE_LIFE_LONG 4 #define RWH_WRITE_LIFE_EXTREME 5 +/* + * The originally introduced spelling is remained from the first + * versions of the patch set that introduced the feature, see commit + * v4.13-rc1~212^2~51. + */ +#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET + /* * Types of directory notifications that may be requested. 
*/ From 626fb735a43ddcb7b2c58c27cb03b098acc03339 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 30 Oct 2019 00:59:30 +0800 Subject: [PATCH 19/32] blk-mq: Make blk_mq_run_hw_queue() return void Since commit 97889f9ac24f ("blk-mq: remove synchronize_rcu() from blk_mq_del_queue_tag_set()"), the return value of blk_mq_run_hw_queue() is never checked, so make it return void, which very marginally simplifies the code. Reviewed-by: Bob Liu Signed-off-by: John Garry Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 ++------ include/linux/blk-mq.h | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index c0f3357a20506..5c9adcaa27ac4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1457,7 +1457,7 @@ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) } EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); -bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { int srcu_idx; bool need_run; @@ -1475,12 +1475,8 @@ bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) blk_mq_hctx_has_pending(hctx); hctx_unlock(hctx, srcu_idx); - if (need_run) { + if (need_run) __blk_mq_delay_run_hw_queue(hctx, async, 0); - return true; - } - - return false; } EXPORT_SYMBOL(blk_mq_run_hw_queue); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 4919d22e1aff2..dc03e059fdff0 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -502,7 +502,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); -bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); +void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv); From 8962842ca5abdcf98e22ab3b2b45a103f0408b95 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 2 Nov 2019 16:02:15 +0800 Subject: [PATCH 20/32] blk-mq: avoid sysfs buffer overflow with too many CPU cores It is reported that sysfs buffer overflow can be triggered if the system has too many CPU cores(>841 on 4K PAGE_SIZE) when showing CPUs of hctx via /sys/block/$DEV/mq/$N/cpu_list. Use snprintf to avoid the potential buffer overflow. This version doesn't change the attribute format, and simply stops showing CPU numbers if the buffer is going to overflow. 
Cc: stable@vger.kernel.org Fixes: 676141e48af7 ("blk-mq: don't dump CPU -> hw queue map on driver load") Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 81a273b433298..4caa56d1bc857 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -158,20 +158,25 @@ static ssize_t blk_mq_hw_sysfs_nr_reserved_tags_show(struct blk_mq_hw_ctx *hctx, static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) { + const size_t size = PAGE_SIZE - 1; unsigned int i, first = 1; - ssize_t ret = 0; + int ret = 0, pos = 0; for_each_cpu(i, hctx->cpumask) { if (first) - ret += sprintf(ret + page, "%u", i); + ret = snprintf(pos + page, size - pos, "%u", i); else - ret += sprintf(ret + page, ", %u", i); + ret = snprintf(pos + page, size - pos, ", %u", i); + + if (ret >= size - pos) + break; first = 0; + pos += ret; } - ret += sprintf(ret + page, "\n"); - return ret; + ret = snprintf(pos + page, size - pos, "\n"); + return pos + ret; } static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = { From 731dc4868311ee097757b8746eaa1b4f8b2b4f1c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 21 Oct 2019 10:37:59 +0200 Subject: [PATCH 21/32] bdev: Factor out bdev revalidation into a common helper Factor out code handling revalidation of bdev on disk change into a common helper. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- fs/block_dev.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 9c073dbdc1b04..88c6d35ec71d7 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1512,6 +1512,14 @@ EXPORT_SYMBOL(bd_set_size); static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); +static void bdev_disk_changed(struct block_device *bdev, bool invalidate) +{ + if (invalidate) + invalidate_partitions(bdev->bd_disk, bdev); + else + rescan_partitions(bdev->bd_disk, bdev); +} + /* * bd_mutex locking: * @@ -1594,12 +1602,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) * The latter is necessary to prevent ghost * partitions on a removed medium. */ - if (bdev->bd_invalidated) { - if (!ret) - rescan_partitions(disk, bdev); - else if (ret == -ENOMEDIUM) - invalidate_partitions(disk, bdev); - } + if (bdev->bd_invalidated && + (!ret || ret == -ENOMEDIUM)) + bdev_disk_changed(bdev, ret == -ENOMEDIUM); if (ret) goto out_clear; @@ -1632,12 +1637,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (bdev->bd_disk->fops->open) ret = bdev->bd_disk->fops->open(bdev, mode); /* the same as first opener case, read comment there */ - if (bdev->bd_invalidated) { - if (!ret) - rescan_partitions(bdev->bd_disk, bdev); - else if (ret == -ENOMEDIUM) - invalidate_partitions(bdev->bd_disk, bdev); - } + if (bdev->bd_invalidated && + (!ret || ret == -ENOMEDIUM)) + bdev_disk_changed(bdev, ret == -ENOMEDIUM); if (ret) goto out_unlock_bdev; } From cba22d86e0a10b7070d2e6a7379dbea51aa0883c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 21 Oct 2019 10:38:00 +0200 Subject: [PATCH 22/32] bdev: Refresh bdev size for disks without partitioning Currently, the block device size is not updated on second and subsequent opens for block devices where partition scan is disabled. 
This is particularly annoying for example for DVD drives as that means block device size does not get updated once the media is inserted into a drive if the device is already open when inserting the media. This is actually always the case for example when pktcdvd is in use. Fix the problem by revalidating block device size on every open even for devices with partition scan disabled. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- fs/block_dev.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 88c6d35ec71d7..d612468ee66bf 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1403,11 +1403,7 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty) "resized disk %s\n", bdev->bd_disk ? bdev->bd_disk->disk_name : ""); } - - if (!bdev->bd_disk) - return; - if (disk_part_scan_enabled(bdev->bd_disk)) - bdev->bd_invalidated = 1; + bdev->bd_invalidated = 1; } /** @@ -1514,10 +1510,15 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); static void bdev_disk_changed(struct block_device *bdev, bool invalidate) { - if (invalidate) - invalidate_partitions(bdev->bd_disk, bdev); - else - rescan_partitions(bdev->bd_disk, bdev); + if (disk_part_scan_enabled(bdev->bd_disk)) { + if (invalidate) + invalidate_partitions(bdev->bd_disk, bdev); + else + rescan_partitions(bdev->bd_disk, bdev); + } else { + check_disk_size_change(bdev->bd_disk, bdev, !invalidate); + bdev->bd_invalidated = 0; + } } /* From 3495ea1b5f6083b03af062095eecb37283a2cc8f Mon Sep 17 00:00:00 2001 From: Revanth Rajashekar Date: Thu, 31 Oct 2019 10:13:20 -0600 Subject: [PATCH 23/32] block: sed-opal: Generalizing write data to any opal table This patch refactors the existing "write_shadowmbr" func and creates a new generalized function "generic_table_write_data", to write data to any opal table. Also, a few cleanups are included in this patch. Reviewed-by: Scott Bauer Reviewed-by: Jon Derrick Signed-off-by: Revanth Rajashekar Signed-off-by: Jens Axboe --- block/sed-opal.c | 138 +++++++++++++++++++++++++---------------------- 1 file changed, 74 insertions(+), 64 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index b4c761973ac10..d6e2ec0d8a3a6 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1139,11 +1139,11 @@ static int generic_get_column(struct opal_dev *dev, const u8 *table, * * the result is provided in dev->resp->tok[4] */ -static int generic_get_table_info(struct opal_dev *dev, enum opal_uid table, +static int generic_get_table_info(struct opal_dev *dev, const u8 *table_uid, u64 column) { u8 uid[OPAL_UID_LENGTH]; - const unsigned int half = OPAL_UID_LENGTH/2; + const unsigned int half = OPAL_UID_LENGTH_HALF; /* sed-opal UIDs can be split in two halves: * first: actual table index @@ -1152,7 +1152,7 @@ static int generic_get_table_info(struct opal_dev *dev, enum opal_uid table, * first part of the target table as relative index into that table */ memcpy(uid, opaluid[OPAL_TABLE_TABLE], half); - memcpy(uid+half, opaluid[table], half); + memcpy(uid + half, table_uid, half); return generic_get_column(dev, uid, column); } @@ -1221,6 +1221,75 @@ static int get_active_key(struct opal_dev *dev, void *data) return get_active_key_cont(dev); } +static int generic_table_write_data(struct opal_dev *dev, const u64 data, + u64 offset, u64 size, const u8 *uid) +{ + const u8 __user *src = (u8 __user *)(uintptr_t)data; + u8 *dst; + u64 len; + size_t off = 0; + int err; + + /* do we fit in the available space? 
*/ + err = generic_get_table_info(dev, uid, OPAL_TABLE_ROWS); + if (err) { + pr_debug("Couldn't get the table size\n"); + return err; + } + + len = response_get_u64(&dev->parsed, 4); + if (size > len || offset > len - size) { + pr_debug("Does not fit in the table (%llu vs. %llu)\n", + offset + size, len); + return -ENOSPC; + } + + /* do the actual transmission(s) */ + while (off < size) { + err = cmd_start(dev, uid, opalmethod[OPAL_SET]); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_WHERE); + add_token_u64(&err, dev, offset + off); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + + /* + * The bytestring header is either 1 or 2 bytes, so assume 2. + * There also needs to be enough space to accommodate the + * trailing OPAL_ENDNAME (1 byte) and tokens added by + * cmd_finalize. + */ + len = min(remaining_size(dev) - (2+1+CMD_FINALIZE_BYTES_NEEDED), + (size_t)(size - off)); + pr_debug("Write bytes %zu+%llu/%llu\n", off, len, size); + + dst = add_bytestring_header(&err, dev, len); + if (!dst) + break; + + if (copy_from_user(dst, src + off, len)) { + err = -EFAULT; + break; + } + + dev->pos += len; + + add_token_u8(&err, dev, OPAL_ENDNAME); + if (err) + break; + + err = finalize_and_send(dev, parse_and_check_status); + if (err) + break; + + off += len; + } + + return err; +} + static int generic_lr_enable_disable(struct opal_dev *dev, u8 *uid, bool rle, bool wle, bool rl, bool wl) @@ -1583,68 +1652,9 @@ static int set_mbr_enable_disable(struct opal_dev *dev, void *data) static int write_shadow_mbr(struct opal_dev *dev, void *data) { struct opal_shadow_mbr *shadow = data; - const u8 __user *src; - u8 *dst; - size_t off = 0; - u64 len; - int err = 0; - - /* do we fit in the available shadow mbr space? */ - err = generic_get_table_info(dev, OPAL_MBR, OPAL_TABLE_ROWS); - if (err) { - pr_debug("MBR: could not get shadow size\n"); - return err; - } - - len = response_get_u64(&dev->parsed, 4); - if (shadow->size > len || shadow->offset > len - shadow->size) { - pr_debug("MBR: does not fit in shadow (%llu vs. %llu)\n", - shadow->offset + shadow->size, len); - return -ENOSPC; - } - - /* do the actual transmission(s) */ - src = (u8 __user *)(uintptr_t)shadow->data; - while (off < shadow->size) { - err = cmd_start(dev, opaluid[OPAL_MBR], opalmethod[OPAL_SET]); - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_u8(&err, dev, OPAL_WHERE); - add_token_u64(&err, dev, shadow->offset + off); - add_token_u8(&err, dev, OPAL_ENDNAME); - - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_u8(&err, dev, OPAL_VALUES); - - /* - * The bytestring header is either 1 or 2 bytes, so assume 2. - * There also needs to be enough space to accommodate the - * trailing OPAL_ENDNAME (1 byte) and tokens added by - * cmd_finalize. 
- */ - len = min(remaining_size(dev) - (2+1+CMD_FINALIZE_BYTES_NEEDED), - (size_t)(shadow->size - off)); - pr_debug("MBR: write bytes %zu+%llu/%llu\n", - off, len, shadow->size); - - dst = add_bytestring_header(&err, dev, len); - if (!dst) - break; - if (copy_from_user(dst, src + off, len)) - err = -EFAULT; - dev->pos += len; - add_token_u8(&err, dev, OPAL_ENDNAME); - if (err) - break; - - err = finalize_and_send(dev, parse_and_check_status); - if (err) - break; - - off += len; - } - - return err; + return generic_table_write_data(dev, shadow->data, shadow->offset, + shadow->size, opaluid[OPAL_MBR]); } static int generic_pw_cmd(u8 *key, size_t key_len, u8 *cpin_uid, From 51f421c85c880dcb37df11e672b384eaa4444328 Mon Sep 17 00:00:00 2001 From: Revanth Rajashekar Date: Thu, 31 Oct 2019 10:13:21 -0600 Subject: [PATCH 24/32] block: sed-opal: Add support to read/write opal tables generically This feature gives the user RW access to any opal table with admin1 authority. The flags described in the new structure determines if the user wants to read/write the data. Flags are checked for valid values in order to allow future features to be added to the ioctl. The user can provide the desired table's UID. Also, the ioctl provides a size and offset field and internally will loop data accesses to return the full data block. Read overrun is prevented by the initiator's sec_send_recv() backend. The ioctl provides a private field with the intention to accommodate any future expansions to the ioctl. Reviewed-by: Scott Bauer Reviewed-by: Jon Derrick Signed-off-by: Revanth Rajashekar Signed-off-by: Jens Axboe --- block/opal_proto.h | 1 - block/sed-opal.c | 172 ++++++++++++++++++++++++++++++++++ include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 20 ++++ 4 files changed, 193 insertions(+), 1 deletion(-) diff --git a/block/opal_proto.h b/block/opal_proto.h index 5532412d567c0..710911864e1d4 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -76,7 +76,6 @@ enum opal_response_token { * Derived from: TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 * Section: 6.3 Assigned UIDs */ -#define OPAL_UID_LENGTH 8 #define OPAL_METHOD_LENGTH 8 #define OPAL_MSID_KEYLEN 15 #define OPAL_UID_LENGTH_HALF 4 diff --git a/block/sed-opal.c b/block/sed-opal.c index d6e2ec0d8a3a6..1d5afaaf08267 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1967,6 +1967,113 @@ static int get_msid_cpin_pin(struct opal_dev *dev, void *data) return 0; } +static int write_table_data(struct opal_dev *dev, void *data) +{ + struct opal_read_write_table *write_tbl = data; + + return generic_table_write_data(dev, write_tbl->data, write_tbl->offset, + write_tbl->size, write_tbl->table_uid); +} + +static int read_table_data_cont(struct opal_dev *dev) +{ + int err; + const char *data_read; + + err = parse_and_check_status(dev); + if (err) + return err; + + dev->prev_d_len = response_get_string(&dev->parsed, 1, &data_read); + dev->prev_data = (void *)data_read; + if (!dev->prev_data) { + pr_debug("%s: Couldn't read data from the table.\n", __func__); + return OPAL_INVAL_PARAM; + } + + return 0; +} + +/* + * IO_BUFFER_LENGTH = 2048 + * sizeof(header) = 56 + * No. of Token Bytes in the Response = 11 + * MAX size of data that can be carried in response buffer + * at a time is : 2048 - (56 + 11) = 1981 = 0x7BD. 
+ */ +#define OPAL_MAX_READ_TABLE (0x7BD) + +static int read_table_data(struct opal_dev *dev, void *data) +{ + struct opal_read_write_table *read_tbl = data; + int err; + size_t off = 0, max_read_size = OPAL_MAX_READ_TABLE; + u64 table_len, len; + u64 offset = read_tbl->offset, read_size = read_tbl->size - 1; + u8 __user *dst; + + err = generic_get_table_info(dev, read_tbl->table_uid, OPAL_TABLE_ROWS); + if (err) { + pr_debug("Couldn't get the table size\n"); + return err; + } + + table_len = response_get_u64(&dev->parsed, 4); + + /* Check if the user is trying to read from the table limits */ + if (read_size > table_len || offset > table_len - read_size) { + pr_debug("Read size exceeds the Table size limits (%llu vs. %llu)\n", + offset + read_size, table_len); + return -EINVAL; + } + + while (off < read_size) { + err = cmd_start(dev, read_tbl->table_uid, opalmethod[OPAL_GET]); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_STARTROW); + add_token_u64(&err, dev, offset + off); /* start row value */ + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_ENDROW); + + len = min(max_read_size, (size_t)(read_size - off)); + add_token_u64(&err, dev, offset + off + len); /* end row value + */ + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_debug("Error building read table data command.\n"); + break; + } + + err = finalize_and_send(dev, read_table_data_cont); + if (err) + break; + + /* len+1: This includes the NULL terminator at the end*/ + if (dev->prev_d_len > len + 1) { + err = -EOVERFLOW; + break; + } + + dst = (u8 __user *)(uintptr_t)read_tbl->data; + if (copy_to_user(dst + off, dev->prev_data, dev->prev_d_len)) { + pr_debug("Error copying data to userspace\n"); + err = -EFAULT; + break; + } + dev->prev_data = NULL; + + off += len; + } + + return err; +} + static int end_opal_session(struct opal_dev *dev, void *data) { int err = 0; @@ -2453,6 +2560,68 @@ bool opal_unlock_from_suspend(struct opal_dev *dev) } EXPORT_SYMBOL(opal_unlock_from_suspend); +static int opal_read_table(struct opal_dev *dev, + struct opal_read_write_table *rw_tbl) +{ + const struct opal_step read_table_steps[] = { + { start_admin1LSP_opal_session, &rw_tbl->key }, + { read_table_data, rw_tbl }, + { end_opal_session, } + }; + int ret = 0; + + if (!rw_tbl->size) + return ret; + + return execute_steps(dev, read_table_steps, + ARRAY_SIZE(read_table_steps)); +} + +static int opal_write_table(struct opal_dev *dev, + struct opal_read_write_table *rw_tbl) +{ + const struct opal_step write_table_steps[] = { + { start_admin1LSP_opal_session, &rw_tbl->key }, + { write_table_data, rw_tbl }, + { end_opal_session, } + }; + int ret = 0; + + if (!rw_tbl->size) + return ret; + + return execute_steps(dev, write_table_steps, + ARRAY_SIZE(write_table_steps)); +} + +static int opal_generic_read_write_table(struct opal_dev *dev, + struct opal_read_write_table *rw_tbl) +{ + int ret, bit_set; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + + bit_set = fls64(rw_tbl->flags) - 1; + switch (bit_set) { + case OPAL_READ_TABLE: + ret = opal_read_table(dev, rw_tbl); + break; + case OPAL_WRITE_TABLE: + ret = opal_write_table(dev, rw_tbl); + break; + default: + pr_debug("Invalid bit set in the flag (%016llx).\n", + rw_tbl->flags); + ret = -EINVAL; + break; + } + + mutex_unlock(&dev->dev_lock); + + return ret; +} + int sed_ioctl(struct opal_dev *dev, unsigned int 
cmd, void __user *arg) { void *p; @@ -2515,6 +2684,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_PSID_REVERT_TPR: ret = opal_reverttper(dev, p, true); break; + case IOC_OPAL_GENERIC_TABLE_RW: + ret = opal_generic_read_write_table(dev, p); + break; default: break; } diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 53c28d750a45b..1ac0d712a9c36 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -42,6 +42,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_PSID_REVERT_TPR: case IOC_OPAL_MBR_DONE: case IOC_OPAL_WRITE_SHADOW_MBR: + case IOC_OPAL_GENERIC_TABLE_RW: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index c6d035fa1b6ca..6f5af1a84213b 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -113,6 +113,25 @@ struct opal_shadow_mbr { __u64 size; }; +/* Opal table operations */ +enum opal_table_ops { + OPAL_READ_TABLE, + OPAL_WRITE_TABLE, +}; + +#define OPAL_UID_LENGTH 8 +struct opal_read_write_table { + struct opal_key key; + const __u64 data; + const __u8 table_uid[OPAL_UID_LENGTH]; + __u64 offset; + __u64 size; +#define OPAL_TABLE_READ (1 << OPAL_READ_TABLE) +#define OPAL_TABLE_WRITE (1 << OPAL_WRITE_TABLE) + __u64 flags; + __u64 priv; +}; + #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) #define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) #define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) @@ -128,5 +147,6 @@ struct opal_shadow_mbr { #define IOC_OPAL_PSID_REVERT_TPR _IOW('p', 232, struct opal_key) #define IOC_OPAL_MBR_DONE _IOW('p', 233, struct opal_mbr_done) #define IOC_OPAL_WRITE_SHADOW_MBR _IOW('p', 234, struct opal_shadow_mbr) +#define IOC_OPAL_GENERIC_TABLE_RW _IOW('p', 235, struct opal_read_write_table) #endif /* _UAPI_SED_OPAL_H */ From 62c441c6ae054a0f9ff2944908ed09603b035fd3 Mon Sep 17 00:00:00 2001 From: Revanth Rajashekar Date: Thu, 31 Oct 2019 10:13:22 -0600 Subject: [PATCH 25/32] block: sed-opal: Introduce Opal Datastore UID This patch introduces Opal Datastore UID. The generic read/write table ioctl can use this UID to access the Opal Datastore. 
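To make the intended use concrete, here is a hedged userspace sketch that reads from the Datastore table through the generic table ioctl introduced in the previous patch; the device node, buffer size and key handling are illustrative assumptions, while the structure, flag and ioctl names come from the patches themselves:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/sed-opal.h>

int main(void)
{
	uint8_t buf[256];
	struct opal_read_write_table rw_tbl = {
		/* .key must carry the Admin1 credential, filled in as for the
		 * other sed-opal ioctls (omitted in this sketch). */
		.table_uid = { 0x00, 0x00, 0x10, 0x01,
			       0x00, 0x00, 0x00, 0x00 },	/* Datastore table UID */
		.data      = (uintptr_t)buf,
		.offset    = 0,
		.size      = sizeof(buf),
		.flags     = OPAL_TABLE_READ,
	};
	int fd = open("/dev/nvme0n1", O_RDWR);	/* assumed OPAL-capable device */

	if (fd < 0)
		return 1;
	if (ioctl(fd, IOC_OPAL_GENERIC_TABLE_RW, &rw_tbl))
		perror("IOC_OPAL_GENERIC_TABLE_RW");
	close(fd);
	return 0;
}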
Reviewed-by: Scott Bauer Reviewed-by: Jon Derrick Signed-off-by: Revanth Rajashekar Signed-off-by: Jens Axboe --- block/opal_proto.h | 1 + block/sed-opal.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/block/opal_proto.h b/block/opal_proto.h index 710911864e1d4..736e67c3e7c53 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -107,6 +107,7 @@ enum opal_uid { OPAL_C_PIN_TABLE, OPAL_LOCKING_INFO_TABLE, OPAL_ENTERPRISE_LOCKING_INFO_TABLE, + OPAL_DATASTORE, /* C_PIN_TABLE object ID's */ OPAL_C_PIN_MSID, OPAL_C_PIN_SID, diff --git a/block/sed-opal.c b/block/sed-opal.c index 1d5afaaf08267..b2cacc9ddd114 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -149,6 +149,8 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = { { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x01 }, [OPAL_ENTERPRISE_LOCKING_INFO_TABLE] = { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x00 }, + [OPAL_DATASTORE] = + { 0x00, 0x00, 0x10, 0x01, 0x00, 0x00, 0x00, 0x00 }, /* C_PIN_TABLE object ID's */ [OPAL_C_PIN_MSID] = From d2c9be89f8ebe7ebcc97676ac40f8dec1cf9b43a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 4 Nov 2019 16:26:53 +0800 Subject: [PATCH 26/32] blk-mq: make sure that line break can be printed 8962842ca5ab ("blk-mq: avoid sysfs buffer overflow with too many CPU cores") avoids sysfs buffer overflow, and reserves one character for line break. However, the last snprintf() doesn't get correct 'size' parameter passed in, so fixed it. Fixes: 8962842ca5ab ("blk-mq: avoid sysfs buffer overflow with too many CPU cores") Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 4caa56d1bc857..062229395a507 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -175,7 +175,7 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) pos += ret; } - ret = snprintf(pos + page, size - pos, "\n"); + ret = snprintf(pos + page, size + 1 - pos, "\n"); return pos + ret; } From fa53228721876515adabc7bc74368490bd97aa3b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Nov 2019 10:05:43 -0800 Subject: [PATCH 27/32] block: avoid blk_bio_segment_split for small I/O operations __blk_queue_split() adds significant overhead for small I/O operations. Add a shortcut to avoid it for cases where we know we never need to split. Based on a patch from Ming Lei. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-merge.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 48e6725b32eea..f22cb6251d062 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -293,7 +293,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, void __blk_queue_split(struct request_queue *q, struct bio **bio, unsigned int *nr_segs) { - struct bio *split; + struct bio *split = NULL; switch (bio_op(*bio)) { case REQ_OP_DISCARD: @@ -309,6 +309,20 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio, nr_segs); break; default: + /* + * All drivers must accept single-segments bios that are <= + * PAGE_SIZE. This is a quick and dirty check that relies on + * the fact that bi_io_vec[0] is always valid if a bio has data. + * The check might lead to occasional false negatives when bios + * are cloned, but compared to the performance impact of cloned + * bios themselves the loop below doesn't matter anyway. 
+ */ + if (!q->limits.chunk_sectors && + (*bio)->bi_vcnt == 1 && + (*bio)->bi_io_vec[0].bv_len <= PAGE_SIZE) { + *nr_segs = 1; + break; + } split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs); break; } From f8db383507d658c5a729b062c97710efda876cd4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 6 Nov 2019 11:48:57 +0100 Subject: [PATCH 28/32] block: Warn if elevator= parameter is used With transition to blk-mq, the elevator= kernel argument was removed as it makes less and less sense with the current variety of devices. Since this may surprise some users and there are advices on the Internet that still suggest to use it, let's at least warn if the parameter is used. Reviewed-by: Jeff Moyer Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- block/elevator.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/block/elevator.c b/block/elevator.c index 5437059c92615..0b1db9afb5867 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -831,3 +831,12 @@ struct request *elv_rb_latter_request(struct request_queue *q, return NULL; } EXPORT_SYMBOL(elv_rb_latter_request); + +static int __init elevator_setup(char *str) +{ + pr_warn("Kernel parameter elevator= does not have any effect anymore.\n" + "Please use sysfs to set IO scheduler for individual devices.\n"); + return 1; +} + +__setup("elevator=", elevator_setup); From a84324d2ed05c06b46fb6079d39a0736bde6e16a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sun, 27 Oct 2019 23:05:42 +0900 Subject: [PATCH 29/32] block: Remove REQ_OP_ZONE_RESET plugging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit REQ_OP_ZONE_RESET operations cannot be merged as these bios and requests do not have a size and are never sequential due to the zone start sector position required for their execution. As a result, there is no point in using a plug around blkdev_reset_zones() bio issuing loop. This patch removes this unnecessary plugging. Reviewed-by: Chaitanya Kulkarni Reviewed-by: Javier González Reviewed-by: Christoph Hellwig Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/blk-zoned.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 4bc5f260248a6..7fe376eede86a 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -258,7 +258,6 @@ int blkdev_reset_zones(struct block_device *bdev, sector_t zone_sectors; sector_t end_sector = sector + nr_sectors; struct bio *bio = NULL; - struct blk_plug plug; int ret; if (!blk_queue_is_zoned(q)) @@ -283,7 +282,6 @@ int blkdev_reset_zones(struct block_device *bdev, end_sector != bdev->bd_part->nr_sects) return -EINVAL; - blk_start_plug(&plug); while (sector < end_sector) { bio = blk_next_bio(bio, 0, gfp_mask); @@ -301,8 +299,6 @@ int blkdev_reset_zones(struct block_device *bdev, ret = submit_bio_wait(bio); bio_put(bio); - blk_finish_plug(&plug); - return ret; } EXPORT_SYMBOL_GPL(blkdev_reset_zones); From c7a1d926dc4076aadad187614500afcd8de78818 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sun, 27 Oct 2019 23:05:43 +0900 Subject: [PATCH 30/32] block: Simplify REQ_OP_ZONE_RESET_ALL handling There is no need for the function __blkdev_reset_all_zones() as REQ_OP_ZONE_RESET_ALL can be handled directly in blkdev_reset_zones() bio loop with an early break from the loop. This patch removes this function and modifies blkdev_reset_zones(), simplifying the code. 
Reviewed-by: Christoph Hellwig Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/blk-zoned.c | 40 +++++++++++++--------------------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 7fe376eede86a..14785011e7984 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -202,32 +202,14 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(blkdev_report_zones); -/* - * Special case of zone reset operation to reset all zones in one command, - * useful for applications like mkfs. - */ -static int __blkdev_reset_all_zones(struct block_device *bdev, gfp_t gfp_mask) -{ - struct bio *bio = bio_alloc(gfp_mask, 0); - int ret; - - /* across the zones operations, don't need any sectors */ - bio_set_dev(bio, bdev); - bio_set_op_attrs(bio, REQ_OP_ZONE_RESET_ALL, 0); - - ret = submit_bio_wait(bio); - bio_put(bio); - - return ret; -} - static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev, + sector_t sector, sector_t nr_sectors) { if (!blk_queue_zone_resetall(bdev_get_queue(bdev))) return false; - if (nr_sectors != part_nr_sects_read(bdev->bd_part)) + if (sector || nr_sectors != part_nr_sects_read(bdev->bd_part)) return false; /* * REQ_OP_ZONE_RESET_ALL can be executed only if the block device is @@ -270,9 +252,6 @@ int blkdev_reset_zones(struct block_device *bdev, /* Out of range */ return -EINVAL; - if (blkdev_allow_reset_all_zones(bdev, nr_sectors)) - return __blkdev_reset_all_zones(bdev, gfp_mask); - /* Check alignment (handle eventual smaller last zone) */ zone_sectors = blk_queue_zone_sectors(q); if (sector & (zone_sectors - 1)) @@ -283,17 +262,24 @@ int blkdev_reset_zones(struct block_device *bdev, return -EINVAL; while (sector < end_sector) { - bio = blk_next_bio(bio, 0, gfp_mask); - bio->bi_iter.bi_sector = sector; bio_set_dev(bio, bdev); - bio_set_op_attrs(bio, REQ_OP_ZONE_RESET, 0); + /* + * Special case for the zone reset operation that reset all + * zones, this is useful for applications like mkfs. + */ + if (blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) { + bio->bi_opf = REQ_OP_ZONE_RESET_ALL; + break; + } + + bio->bi_opf = REQ_OP_ZONE_RESET; + bio->bi_iter.bi_sector = sector; sector += zone_sectors; /* This may take a while, so be nice to others */ cond_resched(); - } ret = submit_bio_wait(bio); From 6c1b1da58f8c7a697a88ae35afeba196fc7b701e Mon Sep 17 00:00:00 2001 From: Ajay Joshi Date: Sun, 27 Oct 2019 23:05:45 +0900 Subject: [PATCH 31/32] block: add zone open, close and finish operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Zoned block devices (ZBC and ZAC devices) allow an explicit control over the condition (state) of zones. The operations allowed are: * Open a zone: Transition to open condition to indicate that a zone will actively be written * Close a zone: Transition to closed condition to release the drive resources used for writing to a zone * Finish a zone: Transition an open or closed zone to the full condition to prevent write operations To enable this control for in-kernel zoned block device users, define the new request operations REQ_OP_ZONE_OPEN, REQ_OP_ZONE_CLOSE and REQ_OP_ZONE_FINISH as well as the generic function blkdev_zone_mgmt() for submitting these operations on a range of zones. This results in blkdev_reset_zones() removal and replacement with this new zone magement function. Users of blkdev_reset_zones() (f2fs and dm-zoned) are updated accordingly. 
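As a rough illustration of how an in-kernel user is expected to call the new helper (a sketch only, not taken from any driver in this series; the caller is assumed to already know the zone's start sector and size):

#include <linux/blkdev.h>

/* Illustrative sketch: explicitly finish (fill) one zone using the new
 * zone management helper. Zone geometry is assumed to be known already. */
static int example_finish_zone(struct block_device *bdev,
			       sector_t zone_start, sector_t zone_sectors)
{
	int ret;

	ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_FINISH,
			       zone_start, zone_sectors, GFP_NOFS);
	if (ret)
		pr_warn("finishing zone at sector %llu failed: %d\n",
			(unsigned long long)zone_start, ret);
	return ret;
}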
Contains contributions from Matias Bjorling, Hans Holmberg, Dmitry Fomichev, Keith Busch, Damien Le Moal and Christoph Hellwig. Reviewed-by: Javier González Reviewed-by: Christoph Hellwig Signed-off-by: Ajay Joshi Signed-off-by: Matias Bjorling Signed-off-by: Hans Holmberg Signed-off-by: Dmitry Fomichev Signed-off-by: Keith Busch Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/blk-core.c | 12 +++++++++--- block/blk-zoned.c | 35 ++++++++++++++++++++-------------- drivers/md/dm-zoned-metadata.c | 6 +++--- fs/f2fs/segment.c | 3 ++- include/linux/blk_types.h | 25 ++++++++++++++++++++++++ include/linux/blkdev.h | 5 +++-- 6 files changed, 63 insertions(+), 23 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index ae506ac2dd489..f0d82227a2fc5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -132,6 +132,9 @@ static const char *const blk_op_name[] = { REQ_OP_NAME(SECURE_ERASE), REQ_OP_NAME(ZONE_RESET), REQ_OP_NAME(ZONE_RESET_ALL), + REQ_OP_NAME(ZONE_OPEN), + REQ_OP_NAME(ZONE_CLOSE), + REQ_OP_NAME(ZONE_FINISH), REQ_OP_NAME(WRITE_SAME), REQ_OP_NAME(WRITE_ZEROES), REQ_OP_NAME(SCSI_IN), @@ -849,10 +852,10 @@ static inline int blk_partition_remap(struct bio *bio) goto out; /* - * Zone reset does not include bi_size so bio_sectors() is always 0. - * Include a test for the reset op code and perform the remap if needed. + * Zone management bios do not have a sector count but they do have + * a start sector filled out and need to be remapped. */ - if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) { + if (bio_sectors(bio) || op_is_zone_mgmt(bio_op(bio))) { if (bio_check_eod(bio, part_nr_sects_read(p))) goto out; bio->bi_iter.bi_sector += p->start_sect; @@ -936,6 +939,9 @@ generic_make_request_checks(struct bio *bio) goto not_supported; break; case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_OPEN: + case REQ_OP_ZONE_CLOSE: + case REQ_OP_ZONE_FINISH: if (!blk_queue_is_zoned(q)) goto not_supported; break; diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 14785011e7984..dab34dc48fb6a 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -221,23 +221,27 @@ static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev, } /** - * blkdev_reset_zones - Reset zones write pointer + * blkdev_zone_mgmt - Execute a zone management operation on a range of zones * @bdev: Target block device - * @sector: Start sector of the first zone to reset - * @nr_sectors: Number of sectors, at least the length of one zone + * @op: Operation to be performed on the zones + * @sector: Start sector of the first zone to operate on + * @nr_sectors: Number of sectors, should be at least the length of one zone and + * must be zone size aligned. * @gfp_mask: Memory allocation flags (for bio_alloc) * * Description: - * Reset the write pointer of the zones contained in the range + * Perform the specified operation on the range of zones specified by * @sector..@sector+@nr_sectors. Specifying the entire disk sector range * is valid, but the specified range should not contain conventional zones. + * The operation to execute on each zone can be a zone reset, open, close + * or finish request. 
*/ -int blkdev_reset_zones(struct block_device *bdev, - sector_t sector, sector_t nr_sectors, - gfp_t gfp_mask) +int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, + sector_t sector, sector_t nr_sectors, + gfp_t gfp_mask) { struct request_queue *q = bdev_get_queue(bdev); - sector_t zone_sectors; + sector_t zone_sectors = blk_queue_zone_sectors(q); sector_t end_sector = sector + nr_sectors; struct bio *bio = NULL; int ret; @@ -248,12 +252,14 @@ int blkdev_reset_zones(struct block_device *bdev, if (bdev_read_only(bdev)) return -EPERM; + if (!op_is_zone_mgmt(op)) + return -EOPNOTSUPP; + if (!nr_sectors || end_sector > bdev->bd_part->nr_sects) /* Out of range */ return -EINVAL; /* Check alignment (handle eventual smaller last zone) */ - zone_sectors = blk_queue_zone_sectors(q); if (sector & (zone_sectors - 1)) return -EINVAL; @@ -269,12 +275,13 @@ int blkdev_reset_zones(struct block_device *bdev, * Special case for the zone reset operation that reset all * zones, this is useful for applications like mkfs. */ - if (blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) { + if (op == REQ_OP_ZONE_RESET && + blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) { bio->bi_opf = REQ_OP_ZONE_RESET_ALL; break; } - bio->bi_opf = REQ_OP_ZONE_RESET; + bio->bi_opf = op; bio->bi_iter.bi_sector = sector; sector += zone_sectors; @@ -287,7 +294,7 @@ int blkdev_reset_zones(struct block_device *bdev, return ret; } -EXPORT_SYMBOL_GPL(blkdev_reset_zones); +EXPORT_SYMBOL_GPL(blkdev_zone_mgmt); /* * BLKREPORTZONE ioctl processing. @@ -379,8 +386,8 @@ int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range))) return -EFAULT; - return blkdev_reset_zones(bdev, zrange.sector, zrange.nr_sectors, - GFP_KERNEL); + return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, + zrange.sector, zrange.nr_sectors, GFP_KERNEL); } static inline unsigned long *blk_alloc_zone_bitmap(int node, diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 595a73110e173..feb4718ce6a6e 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1312,9 +1312,9 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone) if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) { struct dmz_dev *dev = zmd->dev; - ret = blkdev_reset_zones(dev->bdev, - dmz_start_sect(zmd, zone), - dev->zone_nr_sectors, GFP_NOIO); + ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET, + dmz_start_sect(zmd, zone), + dev->zone_nr_sectors, GFP_NOIO); if (ret) { dmz_dev_err(dev, "Reset zone %u failed %d", dmz_id(zmd, zone), ret); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8087095814819..2c997f94a3b29 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1771,7 +1771,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, return -EIO; } trace_f2fs_issue_reset_zone(bdev, blkstart); - return blkdev_reset_zones(bdev, sector, nr_sects, GFP_NOFS); + return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, + sector, nr_sects, GFP_NOFS); } /* For conventional zones, use regular discard if supported */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1e7eeec16458a..23a2fd534817c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -290,6 +290,12 @@ enum req_opf { REQ_OP_ZONE_RESET_ALL = 8, /* write the zero filled sector many times */ REQ_OP_WRITE_ZEROES = 9, + /* Open a zone */ + REQ_OP_ZONE_OPEN = 10, + /* Close a zone */ + REQ_OP_ZONE_CLOSE = 11, + /* Transition a 
zone to full */ + REQ_OP_ZONE_FINISH = 12, /* SCSI passthrough using struct scsi_request */ REQ_OP_SCSI_IN = 32, @@ -417,6 +423,25 @@ static inline bool op_is_discard(unsigned int op) return (op & REQ_OP_MASK) == REQ_OP_DISCARD; } +/* + * Check if a bio or request operation is a zone management operation, with + * the exception of REQ_OP_ZONE_RESET_ALL which is treated as a special case + * due to its different handling in the block layer and device response in + * case of command failure. + */ +static inline bool op_is_zone_mgmt(enum req_opf op) +{ + switch (op & REQ_OP_MASK) { + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_OPEN: + case REQ_OP_ZONE_CLOSE: + case REQ_OP_ZONE_FINISH: + return true; + default: + return false; + } +} + static inline int op_stat_group(unsigned int op) { if (op_is_discard(op)) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d4051acb92a17..9cfafff86f66b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -360,8 +360,9 @@ extern unsigned int blkdev_nr_zones(struct block_device *bdev); extern int blkdev_report_zones(struct block_device *bdev, sector_t sector, struct blk_zone *zones, unsigned int *nr_zones); -extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors, - sector_t nr_sectors, gfp_t gfp_mask); +extern int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, + sector_t sectors, sector_t nr_sectors, + gfp_t gfp_mask); extern int blk_revalidate_disk_zones(struct gendisk *disk); extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, From e876df1fe0ad1b191284ee6ed2db7960bd322d00 Mon Sep 17 00:00:00 2001 From: Ajay Joshi Date: Sun, 27 Oct 2019 23:05:46 +0900 Subject: [PATCH 32/32] block: add zone open, close and finish ioctl support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce three new ioctl commands BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE to allow applications to control the condition of zones on a zoned block device through the execution of the REQ_OP_ZONE_OPEN, REQ_OP_ZONE_CLOSE and REQ_OP_ZONE_FINISH operations. Contains contributions from Matias Bjorling, Hans Holmberg, Dmitry Fomichev, Keith Busch, Damien Le Moal and Christoph Hellwig. Reviewed-by: Javier González Reviewed-by: Christoph Hellwig Signed-off-by: Ajay Joshi Signed-off-by: Matias Bjorling Signed-off-by: Hans Holmberg Signed-off-by: Dmitry Fomichev Signed-off-by: Keith Busch Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/blk-zoned.c | 28 +++++++++++++++++++++++----- block/ioctl.c | 5 ++++- include/linux/blkdev.h | 10 +++++----- include/uapi/linux/blkzoned.h | 17 ++++++++++++++--- 4 files changed, 46 insertions(+), 14 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index dab34dc48fb6a..481eaf7d04d45 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -357,15 +357,16 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, } /* - * BLKRESETZONE ioctl processing. + * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing. * Called from blkdev_ioctl. 
*/ -int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) +int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; struct request_queue *q; struct blk_zone_range zrange; + enum req_opf op; if (!argp) return -EINVAL; @@ -386,8 +387,25 @@ int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range))) return -EFAULT; - return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - zrange.sector, zrange.nr_sectors, GFP_KERNEL); + switch (cmd) { + case BLKRESETZONE: + op = REQ_OP_ZONE_RESET; + break; + case BLKOPENZONE: + op = REQ_OP_ZONE_OPEN; + break; + case BLKCLOSEZONE: + op = REQ_OP_ZONE_CLOSE; + break; + case BLKFINISHZONE: + op = REQ_OP_ZONE_FINISH; + break; + default: + return -ENOTTY; + } + + return blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors, + GFP_KERNEL); } static inline unsigned long *blk_alloc_zone_bitmap(int node, diff --git a/block/ioctl.c b/block/ioctl.c index 15a0eb80ada90..8756efb1419e4 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -532,7 +532,10 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKREPORTZONE: return blkdev_report_zones_ioctl(bdev, mode, cmd, arg); case BLKRESETZONE: - return blkdev_reset_zones_ioctl(bdev, mode, cmd, arg); + case BLKOPENZONE: + case BLKCLOSEZONE: + case BLKFINISHZONE: + return blkdev_zone_mgmt_ioctl(bdev, mode, cmd, arg); case BLKGETZONESZ: return put_uint(arg, bdev_zone_sectors(bdev)); case BLKGETNRZONES: diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9cfafff86f66b..6a4f7abbdcf72 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -367,8 +367,8 @@ extern int blk_revalidate_disk_zones(struct gendisk *disk); extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); -extern int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg); +extern int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); #else /* CONFIG_BLK_DEV_ZONED */ @@ -389,9 +389,9 @@ static inline int blkdev_report_zones_ioctl(struct block_device *bdev, return -ENOTTY; } -static inline int blkdev_reset_zones_ioctl(struct block_device *bdev, - fmode_t mode, unsigned int cmd, - unsigned long arg) +static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev, + fmode_t mode, unsigned int cmd, + unsigned long arg) { return -ENOTTY; } diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index 498eec813494c..0cdef67135f05 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -120,9 +120,11 @@ struct blk_zone_report { }; /** - * struct blk_zone_range - BLKRESETZONE ioctl request - * @sector: starting sector of the first zone to issue reset write pointer - * @nr_sectors: Total number of sectors of 1 or more zones to reset + * struct blk_zone_range - BLKRESETZONE/BLKOPENZONE/ + * BLKCLOSEZONE/BLKFINISHZONE ioctl + * requests + * @sector: Starting sector of the first zone to operate on. + * @nr_sectors: Total number of sectors of all zones to operate on. */ struct blk_zone_range { __u64 sector; @@ -139,10 +141,19 @@ struct blk_zone_range { * sector range. The sector range must be zone aligned. * @BLKGETZONESZ: Get the device zone size in number of 512 B sectors. 
* @BLKGETNRZONES: Get the total number of zones of the device. + * @BLKOPENZONE: Open the zones in the specified sector range. + * The 512 B sector range must be zone aligned. + * @BLKCLOSEZONE: Close the zones in the specified sector range. + * The 512 B sector range must be zone aligned. + * @BLKFINISHZONE: Mark the zones as full in the specified sector range. + * The 512 B sector range must be zone aligned. */ #define BLKREPORTZONE _IOWR(0x12, 130, struct blk_zone_report) #define BLKRESETZONE _IOW(0x12, 131, struct blk_zone_range) #define BLKGETZONESZ _IOR(0x12, 132, __u32) #define BLKGETNRZONES _IOR(0x12, 133, __u32) +#define BLKOPENZONE _IOW(0x12, 134, struct blk_zone_range) +#define BLKCLOSEZONE _IOW(0x12, 135, struct blk_zone_range) +#define BLKFINISHZONE _IOW(0x12, 136, struct blk_zone_range) #endif /* _UAPI_BLKZONED_H */
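To round off the series, a hedged userspace sketch of the new zone ioctls; the device node and zone geometry below are assumptions, while struct blk_zone_range and the ioctl names come from this patch:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/blkzoned.h>

int main(void)
{
	/* Zone start and length in 512 B sectors; 256 MiB zones assumed. */
	struct blk_zone_range range = {
		.sector     = 0,
		.nr_sectors = 524288,
	};
	int fd = open("/dev/nullb0", O_RDWR);	/* assumed zoned block device */

	if (fd < 0)
		return 1;
	if (ioctl(fd, BLKOPENZONE, &range))	/* or BLKCLOSEZONE / BLKFINISHZONE */
		perror("BLKOPENZONE");
	close(fd);
	return 0;
}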