From 3c75635f8ed482300931327847c50068a865a648 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Tue, 28 Jan 2025 14:39:20 +0100 Subject: [PATCH 001/108] io_uring/io-wq: eliminate redundant io_work_get_acct() calls Instead of calling io_work_get_acct() again, pass acct to io_wq_insert_work() and io_wq_remove_pending(). This atomic access in io_work_get_acct() was done under the `acct->lock`, and optimizing it away reduces lock contention a bit. Signed-off-by: Max Kellermann Link: https://lore.kernel.org/r/20250128133927.3989681-2-max.kellermann@ionos.com Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 5d0928f37471e..6d26f6f068af8 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -903,9 +903,8 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq) } while (work); } -static void io_wq_insert_work(struct io_wq *wq, struct io_wq_work *work) +static void io_wq_insert_work(struct io_wq *wq, struct io_wq_acct *acct, struct io_wq_work *work) { - struct io_wq_acct *acct = io_work_get_acct(wq, work); unsigned int hash; struct io_wq_work *tail; @@ -951,7 +950,7 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) } raw_spin_lock(&acct->lock); - io_wq_insert_work(wq, work); + io_wq_insert_work(wq, acct, work); clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); raw_spin_unlock(&acct->lock); @@ -1021,10 +1020,10 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) } static inline void io_wq_remove_pending(struct io_wq *wq, + struct io_wq_acct *acct, struct io_wq_work *work, struct io_wq_work_node *prev) { - struct io_wq_acct *acct = io_work_get_acct(wq, work); unsigned int hash = io_get_work_hash(work); struct io_wq_work *prev_work = NULL; @@ -1051,7 +1050,7 @@ static bool io_acct_cancel_pending_work(struct io_wq *wq, work = container_of(node, struct io_wq_work, list); if (!match->fn(work, match->data)) continue; - io_wq_remove_pending(wq, work, prev); + io_wq_remove_pending(wq, acct, work, prev); raw_spin_unlock(&acct->lock); io_run_cancel(work, wq); match->nr_pending++; From 3d3bafd35fb422eb36cfc5709473cef7400588e7 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Tue, 28 Jan 2025 14:39:21 +0100 Subject: [PATCH 002/108] io_uring/io-wq: add io_worker.acct pointer This replaces the `IO_WORKER_F_BOUND` flag. All code that checks this flag is not interested in knowing whether this is a "bound" worker; all it does with this flag is determine the `io_wq_acct` pointer. At the cost of an extra pointer field, we can eliminate some fragile pointer arithmetic. In turn, the `create_index` and `index` fields are not needed anymore. 
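As a rough userspace-only sketch of the pattern (the struct and function names below are invented for illustration, they are not the io-wq code): deriving the accounting struct from a boolean flag plus array indexing is replaced by storing the pointer once when the worker is created.

    #include <stdbool.h>
    #include <stdio.h>

    struct acct { int nr_workers; };

    struct wq { struct acct accts[2]; };      /* [0] = bound, [1] = unbound */

    struct worker_old { bool bound; };        /* flag only used to pick an acct slot */
    struct worker_new { struct acct *acct; }; /* resolved once, at creation time */

    /* old scheme: every lookup re-derives the pointer from the flag */
    static struct acct *get_acct_old(struct wq *wq, const struct worker_old *w)
    {
        return &wq->accts[w->bound ? 0 : 1];
    }

    /* new scheme: the worker carries the pointer directly */
    static struct acct *get_acct_new(const struct worker_new *w)
    {
        return w->acct;
    }

    int main(void)
    {
        struct wq wq = { .accts = { { .nr_workers = 1 }, { .nr_workers = 2 } } };
        struct worker_old wo = { .bound = false };
        struct worker_new wn = { .acct = &wq.accts[1] };

        printf("%d %d\n", get_acct_old(&wq, &wo)->nr_workers,
               get_acct_new(&wn)->nr_workers);
        return 0;
    }

The trade-off mirrors the commit message: one extra pointer per worker, in exchange for dropping the flag and the index bookkeeping.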
Signed-off-by: Max Kellermann Link: https://lore.kernel.org/r/20250128133927.3989681-3-max.kellermann@ionos.com Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 6d26f6f068af8..197352ef78c7a 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -30,7 +30,6 @@ enum { IO_WORKER_F_UP = 0, /* up and active */ IO_WORKER_F_RUNNING = 1, /* account as running */ IO_WORKER_F_FREE = 2, /* worker on free list */ - IO_WORKER_F_BOUND = 3, /* is doing bounded work */ }; enum { @@ -46,12 +45,12 @@ enum { */ struct io_worker { refcount_t ref; - int create_index; unsigned long flags; struct hlist_nulls_node nulls_node; struct list_head all_list; struct task_struct *task; struct io_wq *wq; + struct io_wq_acct *acct; struct io_wq_work *cur_work; raw_spinlock_t lock; @@ -79,7 +78,6 @@ struct io_worker { struct io_wq_acct { unsigned nr_workers; unsigned max_workers; - int index; atomic_t nr_running; raw_spinlock_t lock; struct io_wq_work_list work_list; @@ -135,7 +133,7 @@ struct io_cb_cancel_data { bool cancel_all; }; -static bool create_io_worker(struct io_wq *wq, int index); +static bool create_io_worker(struct io_wq *wq, struct io_wq_acct *acct); static void io_wq_dec_running(struct io_worker *worker); static bool io_acct_cancel_pending_work(struct io_wq *wq, struct io_wq_acct *acct, @@ -167,7 +165,7 @@ static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq, static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker) { - return io_get_acct(worker->wq, test_bit(IO_WORKER_F_BOUND, &worker->flags)); + return worker->acct; } static void io_worker_ref_put(struct io_wq *wq) @@ -323,7 +321,7 @@ static bool io_wq_create_worker(struct io_wq *wq, struct io_wq_acct *acct) raw_spin_unlock(&wq->lock); atomic_inc(&acct->nr_running); atomic_inc(&wq->worker_refs); - return create_io_worker(wq, acct->index); + return create_io_worker(wq, acct); } static void io_wq_inc_running(struct io_worker *worker) @@ -343,7 +341,7 @@ static void create_worker_cb(struct callback_head *cb) worker = container_of(cb, struct io_worker, create_work); wq = worker->wq; - acct = &wq->acct[worker->create_index]; + acct = worker->acct; raw_spin_lock(&wq->lock); if (acct->nr_workers < acct->max_workers) { @@ -352,7 +350,7 @@ static void create_worker_cb(struct callback_head *cb) } raw_spin_unlock(&wq->lock); if (do_create) { - create_io_worker(wq, worker->create_index); + create_io_worker(wq, acct); } else { atomic_dec(&acct->nr_running); io_worker_ref_put(wq); @@ -384,7 +382,6 @@ static bool io_queue_worker_create(struct io_worker *worker, atomic_inc(&wq->worker_refs); init_task_work(&worker->create_work, func); - worker->create_index = acct->index; if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) { /* * EXIT may have been set after checking it above, check after @@ -821,9 +818,8 @@ static void io_workqueue_create(struct work_struct *work) kfree(worker); } -static bool create_io_worker(struct io_wq *wq, int index) +static bool create_io_worker(struct io_wq *wq, struct io_wq_acct *acct) { - struct io_wq_acct *acct = &wq->acct[index]; struct io_worker *worker; struct task_struct *tsk; @@ -842,12 +838,10 @@ static bool create_io_worker(struct io_wq *wq, int index) refcount_set(&worker->ref, 1); worker->wq = wq; + worker->acct = acct; raw_spin_lock_init(&worker->lock); init_completion(&worker->ref_done); - if (index == IO_WQ_ACCT_BOUND) - set_bit(IO_WORKER_F_BOUND, 
&worker->flags); - tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { io_init_new_worker(wq, worker, tsk); @@ -1176,7 +1170,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) for (i = 0; i < IO_WQ_ACCT_NR; i++) { struct io_wq_acct *acct = &wq->acct[i]; - acct->index = i; atomic_set(&acct->nr_running, 0); INIT_WQ_LIST(&acct->work_list); raw_spin_lock_init(&acct->lock); From 751eedc4b4b79332ecf1a78c0dbeb47d573a8f59 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Tue, 28 Jan 2025 14:39:22 +0100 Subject: [PATCH 003/108] io_uring/io-wq: move worker lists to struct io_wq_acct Have separate linked lists for bounded and unbounded workers. This way, io_acct_activate_free_worker() sees only workers relevant to it and doesn't need to skip irrelevant ones. This speeds up the linked list traversal (under acct->lock). The `io_wq.lock` field is moved to `io_wq_acct.workers_lock`. It did not actually protect "access to elements below", that is, not all of them; it only protected access to the worker lists. By having two locks instead of one, contention on this lock is reduced. Signed-off-by: Max Kellermann Link: https://lore.kernel.org/r/20250128133927.3989681-4-max.kellermann@ionos.com Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 162 ++++++++++++++++++++++++++++------------------- 1 file changed, 96 insertions(+), 66 deletions(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 197352ef78c7a..dfdd45ebe4bb4 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -76,9 +76,27 @@ struct io_worker { #define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER) struct io_wq_acct { + /** + * Protects access to the worker lists. + */ + raw_spinlock_t workers_lock; + unsigned nr_workers; unsigned max_workers; atomic_t nr_running; + + /** + * The list of free workers. Protected by #workers_lock + * (write) and RCU (read). + */ + struct hlist_nulls_head free_list; + + /** + * The list of all workers. Protected by #workers_lock + * (write) and RCU (read). 
+ */ + struct list_head all_list; + raw_spinlock_t lock; struct io_wq_work_list work_list; unsigned long flags; @@ -110,12 +128,6 @@ struct io_wq { struct io_wq_acct acct[IO_WQ_ACCT_NR]; - /* lock protects access to elements below */ - raw_spinlock_t lock; - - struct hlist_nulls_head free_list; - struct list_head all_list; - struct wait_queue_entry wait; struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; @@ -190,9 +202,9 @@ static void io_worker_cancel_cb(struct io_worker *worker) struct io_wq *wq = worker->wq; atomic_dec(&acct->nr_running); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); acct->nr_workers--; - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); io_worker_ref_put(wq); clear_bit_unlock(0, &worker->create_state); io_worker_release(worker); @@ -211,6 +223,7 @@ static bool io_task_worker_match(struct callback_head *cb, void *data) static void io_worker_exit(struct io_worker *worker) { struct io_wq *wq = worker->wq; + struct io_wq_acct *acct = io_wq_get_acct(worker); while (1) { struct callback_head *cb = task_work_cancel_match(wq->task, @@ -224,11 +237,11 @@ static void io_worker_exit(struct io_worker *worker) io_worker_release(worker); wait_for_completion(&worker->ref_done); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); if (test_bit(IO_WORKER_F_FREE, &worker->flags)) hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); io_wq_dec_running(worker); /* * this worker is a goner, clear ->worker_private to avoid any @@ -267,8 +280,7 @@ static inline bool io_acct_run_queue(struct io_wq_acct *acct) * Check head of free list for an available worker. If one isn't available, * caller must create one. */ -static bool io_wq_activate_free_worker(struct io_wq *wq, - struct io_wq_acct *acct) +static bool io_acct_activate_free_worker(struct io_wq_acct *acct) __must_hold(RCU) { struct hlist_nulls_node *n; @@ -279,13 +291,9 @@ static bool io_wq_activate_free_worker(struct io_wq *wq, * activate. If a given worker is on the free_list but in the process * of exiting, keep trying. */ - hlist_nulls_for_each_entry_rcu(worker, n, &wq->free_list, nulls_node) { + hlist_nulls_for_each_entry_rcu(worker, n, &acct->free_list, nulls_node) { if (!io_worker_get(worker)) continue; - if (io_wq_get_acct(worker) != acct) { - io_worker_release(worker); - continue; - } /* * If the worker is already running, it's either already * starting work or finishing work. 
In either case, if it does @@ -312,13 +320,13 @@ static bool io_wq_create_worker(struct io_wq *wq, struct io_wq_acct *acct) if (unlikely(!acct->max_workers)) pr_warn_once("io-wq is not configured for unbound workers"); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); if (acct->nr_workers >= acct->max_workers) { - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); return true; } acct->nr_workers++; - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); atomic_inc(&acct->nr_running); atomic_inc(&wq->worker_refs); return create_io_worker(wq, acct); @@ -342,13 +350,13 @@ static void create_worker_cb(struct callback_head *cb) worker = container_of(cb, struct io_worker, create_work); wq = worker->wq; acct = worker->acct; - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); if (acct->nr_workers < acct->max_workers) { acct->nr_workers++; do_create = true; } - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); if (do_create) { create_io_worker(wq, acct); } else { @@ -427,25 +435,25 @@ static void io_wq_dec_running(struct io_worker *worker) * Worker will start processing some work. Move it to the busy list, if * it's currently on the freelist */ -static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker) +static void __io_worker_busy(struct io_wq_acct *acct, struct io_worker *worker) { if (test_bit(IO_WORKER_F_FREE, &worker->flags)) { clear_bit(IO_WORKER_F_FREE, &worker->flags); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); hlist_nulls_del_init_rcu(&worker->nulls_node); - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); } } /* * No work, worker going to sleep. Move to freelist. */ -static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker) - __must_hold(wq->lock) +static void __io_worker_idle(struct io_wq_acct *acct, struct io_worker *worker) + __must_hold(acct->workers_lock) { if (!test_bit(IO_WORKER_F_FREE, &worker->flags)) { set_bit(IO_WORKER_F_FREE, &worker->flags); - hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); + hlist_nulls_add_head_rcu(&worker->nulls_node, &acct->free_list); } } @@ -580,7 +588,7 @@ static void io_worker_handle_work(struct io_wq_acct *acct, if (!work) break; - __io_worker_busy(wq, worker); + __io_worker_busy(acct, worker); io_assign_current_work(worker, work); __set_current_state(TASK_RUNNING); @@ -651,20 +659,20 @@ static int io_wq_worker(void *data) while (io_acct_run_queue(acct)) io_worker_handle_work(acct, worker); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); /* * Last sleep timed out. Exit if we're not the last worker, * or if someone modified our affinity. 
*/ if (last_timeout && (exit_mask || acct->nr_workers > 1)) { acct->nr_workers--; - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); __set_current_state(TASK_RUNNING); break; } last_timeout = false; - __io_worker_idle(wq, worker); - raw_spin_unlock(&wq->lock); + __io_worker_idle(acct, worker); + raw_spin_unlock(&acct->workers_lock); if (io_run_task_work()) continue; ret = schedule_timeout(WORKER_IDLE_TIMEOUT); @@ -725,18 +733,18 @@ void io_wq_worker_sleeping(struct task_struct *tsk) io_wq_dec_running(worker); } -static void io_init_new_worker(struct io_wq *wq, struct io_worker *worker, +static void io_init_new_worker(struct io_wq *wq, struct io_wq_acct *acct, struct io_worker *worker, struct task_struct *tsk) { tsk->worker_private = worker; worker->task = tsk; set_cpus_allowed_ptr(tsk, wq->cpu_mask); - raw_spin_lock(&wq->lock); - hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); - list_add_tail_rcu(&worker->all_list, &wq->all_list); + raw_spin_lock(&acct->workers_lock); + hlist_nulls_add_head_rcu(&worker->nulls_node, &acct->free_list); + list_add_tail_rcu(&worker->all_list, &acct->all_list); set_bit(IO_WORKER_F_FREE, &worker->flags); - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); wake_up_new_task(tsk); } @@ -772,20 +780,20 @@ static void create_worker_cont(struct callback_head *cb) struct io_worker *worker; struct task_struct *tsk; struct io_wq *wq; + struct io_wq_acct *acct; worker = container_of(cb, struct io_worker, create_work); clear_bit_unlock(0, &worker->create_state); wq = worker->wq; + acct = io_wq_get_acct(worker); tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { - io_init_new_worker(wq, worker, tsk); + io_init_new_worker(wq, acct, worker, tsk); io_worker_release(worker); return; } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { - struct io_wq_acct *acct = io_wq_get_acct(worker); - atomic_dec(&acct->nr_running); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); acct->nr_workers--; if (!acct->nr_workers) { struct io_cb_cancel_data match = { @@ -793,11 +801,11 @@ static void create_worker_cont(struct callback_head *cb) .cancel_all = true, }; - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); while (io_acct_cancel_pending_work(wq, acct, &match)) ; } else { - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); } io_worker_ref_put(wq); kfree(worker); @@ -829,9 +837,9 @@ static bool create_io_worker(struct io_wq *wq, struct io_wq_acct *acct) if (!worker) { fail: atomic_dec(&acct->nr_running); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); acct->nr_workers--; - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); io_worker_ref_put(wq); return false; } @@ -844,7 +852,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wq_acct *acct) tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { - io_init_new_worker(wq, worker, tsk); + io_init_new_worker(wq, acct, worker, tsk); } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { kfree(worker); goto fail; @@ -860,14 +868,14 @@ static bool create_io_worker(struct io_wq *wq, struct io_wq_acct *acct) * Iterate the passed in list and call the specific function for each * worker that isn't exiting */ -static bool io_wq_for_each_worker(struct io_wq *wq, - bool (*func)(struct io_worker *, void *), - void *data) +static bool io_acct_for_each_worker(struct io_wq_acct *acct, + bool (*func)(struct io_worker *, void *), + void *data) { 
struct io_worker *worker; bool ret = false; - list_for_each_entry_rcu(worker, &wq->all_list, all_list) { + list_for_each_entry_rcu(worker, &acct->all_list, all_list) { if (io_worker_get(worker)) { /* no task if node is/was offline */ if (worker->task) @@ -881,6 +889,18 @@ static bool io_wq_for_each_worker(struct io_wq *wq, return ret; } +static bool io_wq_for_each_worker(struct io_wq *wq, + bool (*func)(struct io_worker *, void *), + void *data) +{ + for (int i = 0; i < IO_WQ_ACCT_NR; i++) { + if (!io_acct_for_each_worker(&wq->acct[i], func, data)) + return false; + } + + return true; +} + static bool io_wq_worker_wake(struct io_worker *worker, void *data) { __set_notify_signal(worker->task); @@ -949,7 +969,7 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) raw_spin_unlock(&acct->lock); rcu_read_lock(); - do_create = !io_wq_activate_free_worker(wq, acct); + do_create = !io_acct_activate_free_worker(acct); rcu_read_unlock(); if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) || @@ -960,12 +980,12 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) if (likely(did_create)) return; - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); if (acct->nr_workers) { - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); return; } - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); /* fatal condition, failed to create the first worker */ io_acct_cancel_pending_work(wq, acct, &match); @@ -1072,11 +1092,22 @@ static void io_wq_cancel_pending_work(struct io_wq *wq, } } +static void io_acct_cancel_running_work(struct io_wq_acct *acct, + struct io_cb_cancel_data *match) +{ + raw_spin_lock(&acct->workers_lock); + io_acct_for_each_worker(acct, io_wq_worker_cancel, match); + raw_spin_unlock(&acct->workers_lock); +} + static void io_wq_cancel_running_work(struct io_wq *wq, struct io_cb_cancel_data *match) { rcu_read_lock(); - io_wq_for_each_worker(wq, io_wq_worker_cancel, match); + + for (int i = 0; i < IO_WQ_ACCT_NR; i++) + io_acct_cancel_running_work(&wq->acct[i], match); + rcu_read_unlock(); } @@ -1099,16 +1130,14 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, * as an indication that we attempt to signal cancellation. The * completion will run normally in this case. * - * Do both of these while holding the wq->lock, to ensure that + * Do both of these while holding the acct->workers_lock, to ensure that * we'll find a work item regardless of state. 
*/ io_wq_cancel_pending_work(wq, &match); if (match.nr_pending && !match.cancel_all) return IO_WQ_CANCEL_OK; - raw_spin_lock(&wq->lock); io_wq_cancel_running_work(wq, &match); - raw_spin_unlock(&wq->lock); if (match.nr_running && !match.cancel_all) return IO_WQ_CANCEL_RUNNING; @@ -1132,7 +1161,7 @@ static int io_wq_hash_wake(struct wait_queue_entry *wait, unsigned mode, struct io_wq_acct *acct = &wq->acct[i]; if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags)) - io_wq_activate_free_worker(wq, acct); + io_acct_activate_free_worker(acct); } rcu_read_unlock(); return 1; @@ -1171,14 +1200,15 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) struct io_wq_acct *acct = &wq->acct[i]; atomic_set(&acct->nr_running, 0); + + raw_spin_lock_init(&acct->workers_lock); + INIT_HLIST_NULLS_HEAD(&acct->free_list, 0); + INIT_LIST_HEAD(&acct->all_list); + INIT_WQ_LIST(&acct->work_list); raw_spin_lock_init(&acct->lock); } - raw_spin_lock_init(&wq->lock); - INIT_HLIST_NULLS_HEAD(&wq->free_list, 0); - INIT_LIST_HEAD(&wq->all_list); - wq->task = get_task_struct(data->task); atomic_set(&wq->worker_refs, 1); init_completion(&wq->worker_done); @@ -1364,14 +1394,14 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count) rcu_read_lock(); - raw_spin_lock(&wq->lock); for (i = 0; i < IO_WQ_ACCT_NR; i++) { acct = &wq->acct[i]; + raw_spin_lock(&acct->workers_lock); prev[i] = max_t(int, acct->max_workers, prev[i]); if (new_count[i]) acct->max_workers = new_count[i]; + raw_spin_unlock(&acct->workers_lock); } - raw_spin_unlock(&wq->lock); rcu_read_unlock(); for (i = 0; i < IO_WQ_ACCT_NR; i++) From 6ee78354eaa602002448f098b34678396d99043d Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Tue, 28 Jan 2025 14:39:23 +0100 Subject: [PATCH 004/108] io_uring/io-wq: cache work->flags in variable This eliminates several redundant atomic reads and therefore reduces the duration the surrounding spinlocks are held. 
In several io_uring benchmarks, this reduced the CPU time spent in queued_spin_lock_slowpath() considerably: io_uring benchmark with a flood of `IORING_OP_NOP` and `IOSQE_ASYNC`: 38.86% -1.49% [kernel.kallsyms] [k] queued_spin_lock_slowpath 6.75% +0.36% [kernel.kallsyms] [k] io_worker_handle_work 2.60% +0.19% [kernel.kallsyms] [k] io_nop 3.92% +0.18% [kernel.kallsyms] [k] io_req_task_complete 6.34% -0.18% [kernel.kallsyms] [k] io_wq_submit_work HTTP server, static file: 42.79% -2.77% [kernel.kallsyms] [k] queued_spin_lock_slowpath 2.08% +0.23% [kernel.kallsyms] [k] io_wq_submit_work 1.19% +0.20% [kernel.kallsyms] [k] amd_iommu_iotlb_sync_map 1.46% +0.15% [kernel.kallsyms] [k] ep_poll_callback 1.80% +0.15% [kernel.kallsyms] [k] io_worker_handle_work HTTP server, PHP: 35.03% -1.80% [kernel.kallsyms] [k] queued_spin_lock_slowpath 0.84% +0.21% [kernel.kallsyms] [k] amd_iommu_iotlb_sync_map 1.39% +0.12% [kernel.kallsyms] [k] _copy_to_iter 0.21% +0.10% [kernel.kallsyms] [k] update_sd_lb_stats Signed-off-by: Max Kellermann Link: https://lore.kernel.org/r/20250128133927.3989681-5-max.kellermann@ionos.com Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 33 +++++++++++++++++++++------------ io_uring/io-wq.h | 7 ++++++- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index dfdd45ebe4bb4..ba9974e6f5213 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -170,9 +170,9 @@ static inline struct io_wq_acct *io_get_acct(struct io_wq *wq, bool bound) } static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq, - struct io_wq_work *work) + unsigned int work_flags) { - return io_get_acct(wq, !(atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)); + return io_get_acct(wq, !(work_flags & IO_WQ_WORK_UNBOUND)); } static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker) @@ -457,9 +457,14 @@ static void __io_worker_idle(struct io_wq_acct *acct, struct io_worker *worker) } } +static inline unsigned int __io_get_work_hash(unsigned int work_flags) +{ + return work_flags >> IO_WQ_HASH_SHIFT; +} + static inline unsigned int io_get_work_hash(struct io_wq_work *work) { - return atomic_read(&work->flags) >> IO_WQ_HASH_SHIFT; + return __io_get_work_hash(atomic_read(&work->flags)); } static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash) @@ -489,17 +494,19 @@ static struct io_wq_work *io_get_next_work(struct io_wq_acct *acct, struct io_wq *wq = worker->wq; wq_list_for_each(node, prev, &acct->work_list) { + unsigned int work_flags; unsigned int hash; work = container_of(node, struct io_wq_work, list); /* not hashed, can run anytime */ - if (!io_wq_is_hashed(work)) { + work_flags = atomic_read(&work->flags); + if (!__io_wq_is_hashed(work_flags)) { wq_list_del(&acct->work_list, node, prev); return work; } - hash = io_get_work_hash(work); + hash = __io_get_work_hash(work_flags); /* all items with this hash lie in [work, tail] */ tail = wq->hash_tail[hash]; @@ -596,12 +603,13 @@ static void io_worker_handle_work(struct io_wq_acct *acct, /* handle a whole dependent link */ do { struct io_wq_work *next_hashed, *linked; - unsigned int hash = io_get_work_hash(work); + unsigned int work_flags = atomic_read(&work->flags); + unsigned int hash = __io_get_work_hash(work_flags); next_hashed = wq_next_work(work); if (do_kill && - (atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)) + (work_flags & IO_WQ_WORK_UNBOUND)) atomic_or(IO_WQ_WORK_CANCEL, &work->flags); wq->do_work(work); io_assign_current_work(worker, NULL); @@ -917,18 +925,19 @@ static void 
io_run_cancel(struct io_wq_work *work, struct io_wq *wq) } while (work); } -static void io_wq_insert_work(struct io_wq *wq, struct io_wq_acct *acct, struct io_wq_work *work) +static void io_wq_insert_work(struct io_wq *wq, struct io_wq_acct *acct, + struct io_wq_work *work, unsigned int work_flags) { unsigned int hash; struct io_wq_work *tail; - if (!io_wq_is_hashed(work)) { + if (!__io_wq_is_hashed(work_flags)) { append: wq_list_add_tail(&work->list, &acct->work_list); return; } - hash = io_get_work_hash(work); + hash = __io_get_work_hash(work_flags); tail = wq->hash_tail[hash]; wq->hash_tail[hash] = work; if (!tail) @@ -944,8 +953,8 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data) void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) { - struct io_wq_acct *acct = io_work_get_acct(wq, work); unsigned int work_flags = atomic_read(&work->flags); + struct io_wq_acct *acct = io_work_get_acct(wq, work_flags); struct io_cb_cancel_data match = { .fn = io_wq_work_match_item, .data = work, @@ -964,7 +973,7 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) } raw_spin_lock(&acct->lock); - io_wq_insert_work(wq, acct, work); + io_wq_insert_work(wq, acct, work, work_flags); clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); raw_spin_unlock(&acct->lock); diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index b3b004a7b6252..d4fb2940e435f 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -54,9 +54,14 @@ int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask); int io_wq_max_workers(struct io_wq *wq, int *new_count); bool io_wq_worker_stopped(void); +static inline bool __io_wq_is_hashed(unsigned int work_flags) +{ + return work_flags & IO_WQ_WORK_HASHED; +} + static inline bool io_wq_is_hashed(struct io_wq_work *work) { - return atomic_read(&work->flags) & IO_WQ_WORK_HASHED; + return __io_wq_is_hashed(atomic_read(&work->flags)); } typedef bool (work_cancel_fn)(struct io_wq_work *, void *); From 486ba4d84d62e92716cd395c4b1612b8ce70a257 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Tue, 28 Jan 2025 14:39:24 +0100 Subject: [PATCH 005/108] io_uring/io-wq: do not use bogus hash value Previously, the `hash` variable was initialized with `-1` and only updated by io_get_next_work() if the current work was hashed. Commit 60cf46ae6054 ("io-wq: hash dependent work") changed this to always call io_get_work_hash() even if the work was not hashed. This caused the `hash != -1U` check to always be true, adding some overhead for the `hash->wait` code. This patch fixes the regression by checking the `IO_WQ_WORK_HASHED` flag. 
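A minimal standalone sketch of the sentinel convention being restored here (names invented, not the io-wq code): the hash is only computed when the hashed flag is set, so a "-1U means not hashed" check stays meaningful instead of being always false.

    #include <stdio.h>

    #define WORK_HASHED (1u << 0)
    #define HASH_SHIFT  24

    struct work { unsigned int flags; };

    /* the hash is only meaningful when WORK_HASHED is set; -1U means "not hashed" */
    static unsigned int work_hash(const struct work *w)
    {
        return (w->flags & WORK_HASHED) ? (w->flags >> HASH_SHIFT) : -1U;
    }

    int main(void)
    {
        struct work plain  = { .flags = 0 };
        struct work hashed = { .flags = WORK_HASHED | (5u << HASH_SHIFT) };

        printf("plain has hash? %d\n", work_hash(&plain) != -1U);
        printf("hashed bucket:  %u\n", work_hash(&hashed));
        return 0;
    }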
Perf diff for a flood of `IORING_OP_NOP` with `IOSQE_ASYNC`: 38.55% -1.57% [kernel.kallsyms] [k] queued_spin_lock_slowpath 6.86% -0.72% [kernel.kallsyms] [k] io_worker_handle_work 0.10% +0.67% [kernel.kallsyms] [k] put_prev_entity 1.96% +0.59% [kernel.kallsyms] [k] io_nop_prep 3.31% -0.51% [kernel.kallsyms] [k] try_to_wake_up 7.18% -0.47% [kernel.kallsyms] [k] io_wq_free_work Fixes: 60cf46ae6054 ("io-wq: hash dependent work") Cc: Pavel Begunkov Signed-off-by: Max Kellermann Link: https://lore.kernel.org/r/20250128133927.3989681-6-max.kellermann@ionos.com Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index ba9974e6f5213..6e31f312b61a1 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -604,7 +604,9 @@ static void io_worker_handle_work(struct io_wq_acct *acct, do { struct io_wq_work *next_hashed, *linked; unsigned int work_flags = atomic_read(&work->flags); - unsigned int hash = __io_get_work_hash(work_flags); + unsigned int hash = __io_wq_is_hashed(work_flags) + ? __io_get_work_hash(work_flags) + : -1U; next_hashed = wq_next_work(work); From 7d568502ef90e645e3f1afe4e10467d5952ddf87 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Tue, 28 Jan 2025 14:39:25 +0100 Subject: [PATCH 006/108] io_uring/io-wq: pass io_wq to io_get_next_work() The only caller has already determined this pointer, so let's skip the redundant dereference. Signed-off-by: Max Kellermann Link: https://lore.kernel.org/r/20250128133927.3989681-7-max.kellermann@ionos.com Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 6e31f312b61a1..f7d328feb7225 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -485,13 +485,12 @@ static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash) } static struct io_wq_work *io_get_next_work(struct io_wq_acct *acct, - struct io_worker *worker) + struct io_wq *wq) __must_hold(acct->lock) { struct io_wq_work_node *node, *prev; struct io_wq_work *work, *tail; unsigned int stall_hash = -1U; - struct io_wq *wq = worker->wq; wq_list_for_each(node, prev, &acct->work_list) { unsigned int work_flags; @@ -576,7 +575,7 @@ static void io_worker_handle_work(struct io_wq_acct *acct, * can't make progress, any work completion or insertion will * clear the stalled flag. */ - work = io_get_next_work(acct, worker); + work = io_get_next_work(acct, wq); if (work) { /* * Make sure cancelation can find this, even before From 40b991837f3293317c9845b549e10600e9d54611 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 31 Jan 2025 17:27:02 +0000 Subject: [PATCH 007/108] io_uring: deduplicate caches deallocation Add a function that frees all ring caches since we already have two spots repeating the same thing and it's easy to miss it and change only one of them. 
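A generic, hedged sketch of the refactoring pattern rather than the io_uring code itself: when the same teardown sequence is needed from both an error path and the destructor, a single helper keeps the two call sites from drifting apart.

    #include <stdlib.h>

    struct ctx {
        void *cache_a;
        void *cache_b;
    };

    /* the one place that knows how to free every cache */
    static void ctx_free_caches(struct ctx *c)
    {
        free(c->cache_a);
        free(c->cache_b);
    }

    static struct ctx *ctx_alloc(void)
    {
        struct ctx *c = calloc(1, sizeof(*c));

        if (!c)
            return NULL;
        c->cache_a = malloc(64);
        c->cache_b = malloc(64);
        if (!c->cache_a || !c->cache_b) {
            ctx_free_caches(c);   /* error path reuses the helper */
            free(c);
            return NULL;
        }
        return c;
    }

    static void ctx_destroy(struct ctx *c)
    {
        ctx_free_caches(c);       /* ...and so does normal teardown */
        free(c);
    }

    int main(void)
    {
        struct ctx *c = ctx_alloc();

        if (c)
            ctx_destroy(c);
        return 0;
    }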
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b6b0125677c58bdff99eda91ab320137406e8562.1738342562.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ceacf6230e342..9fade03ce0d97 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -282,6 +282,16 @@ static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits) return 0; } +static void io_free_alloc_caches(struct io_ring_ctx *ctx) +{ + io_alloc_cache_free(&ctx->apoll_cache, kfree); + io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); + io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); + io_alloc_cache_free(&ctx->uring_cache, kfree); + io_alloc_cache_free(&ctx->msg_cache, kfree); + io_futex_cache_free(ctx); +} + static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; @@ -360,12 +370,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) free_ref: percpu_ref_exit(&ctx->refs); err: - io_alloc_cache_free(&ctx->apoll_cache, kfree); - io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); - io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); - io_alloc_cache_free(&ctx->uring_cache, kfree); - io_alloc_cache_free(&ctx->msg_cache, kfree); - io_futex_cache_free(ctx); + io_free_alloc_caches(ctx); kvfree(ctx->cancel_table.hbs); xa_destroy(&ctx->io_bl_xa); kfree(ctx); @@ -2702,12 +2707,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_sqe_files_unregister(ctx); io_cqring_overflow_kill(ctx); io_eventfd_unregister(ctx); - io_alloc_cache_free(&ctx->apoll_cache, kfree); - io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); - io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); - io_alloc_cache_free(&ctx->uring_cache, kfree); - io_alloc_cache_free(&ctx->msg_cache, kfree); - io_futex_cache_free(ctx); + io_free_alloc_caches(ctx); io_destroy_buffers(ctx); io_free_region(ctx, &ctx->param_region); mutex_unlock(&ctx->uring_lock); From 7215469659cb9751a9bf80e43b24a48749004d26 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 31 Jan 2025 17:28:21 +0000 Subject: [PATCH 008/108] io_uring: check for iowq alloc_workqueue failure alloc_workqueue() can fail even during init in io_uring_init(), check the result and panic if anything went wrong. Fixes: 73eaa2b583493 ("io_uring: use private workqueue for exit work") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3a046063902f888f66151f89fa42f84063b9727b.1738343083.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9fade03ce0d97..7fff5d612201b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3920,6 +3920,7 @@ static int __init io_uring_init(void) SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); iou_wq = alloc_workqueue("iou_exit", WQ_UNBOUND, 64); + BUG_ON(!iou_wq); #ifdef CONFIG_SYSCTL register_sysctl_init("kernel", kernel_io_uring_disabled_table); From 92a3bac9a57c39728226ab191859c85f5e2829c0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 31 Jan 2025 17:31:03 +0000 Subject: [PATCH 009/108] io_uring: sanitise ring params earlier Do all struct io_uring_params validation early on before allocating the context. 
That makes initialisation easier, especially by having fewer places where we need to care about partial de-initialisation. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/363ba90b83ff78eefdc88b60e1b2c4a39d182247.1738344646.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 77 ++++++++++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 33 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7fff5d612201b..e34a92c73a5d8 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3535,6 +3535,44 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) O_RDWR | O_CLOEXEC, NULL); } +static int io_uring_sanitise_params(struct io_uring_params *p) +{ + unsigned flags = p->flags; + + /* There is no way to mmap rings without a real fd */ + if ((flags & IORING_SETUP_REGISTERED_FD_ONLY) && + !(flags & IORING_SETUP_NO_MMAP)) + return -EINVAL; + + if (flags & IORING_SETUP_SQPOLL) { + /* IPI related flags don't make sense with SQPOLL */ + if (flags & (IORING_SETUP_COOP_TASKRUN | + IORING_SETUP_TASKRUN_FLAG | + IORING_SETUP_DEFER_TASKRUN)) + return -EINVAL; + } + + if (flags & IORING_SETUP_TASKRUN_FLAG) { + if (!(flags & (IORING_SETUP_COOP_TASKRUN | + IORING_SETUP_DEFER_TASKRUN))) + return -EINVAL; + } + + /* HYBRID_IOPOLL only valid with IOPOLL */ + if ((flags & IORING_SETUP_HYBRID_IOPOLL) && !(flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + /* + * For DEFER_TASKRUN we require the completion task to be the same as + * the submission task. This implies that there is only one submitter. + */ + if ((flags & IORING_SETUP_DEFER_TASKRUN) && + !(flags & IORING_SETUP_SINGLE_ISSUER)) + return -EINVAL; + + return 0; +} + int io_uring_fill_params(unsigned entries, struct io_uring_params *p) { if (!entries) @@ -3545,10 +3583,6 @@ int io_uring_fill_params(unsigned entries, struct io_uring_params *p) entries = IORING_MAX_ENTRIES; } - if ((p->flags & IORING_SETUP_REGISTERED_FD_ONLY) - && !(p->flags & IORING_SETUP_NO_MMAP)) - return -EINVAL; - /* * Use twice as many entries for the CQ ring. It's possible for the * application to drive a higher depth than the size of the SQ ring, @@ -3610,6 +3644,10 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, struct file *file; int ret; + ret = io_uring_sanitise_params(p); + if (ret) + return ret; + ret = io_uring_fill_params(entries, p); if (unlikely(ret)) return ret; @@ -3657,37 +3695,10 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if * COOP_TASKRUN is set, then IPIs are never needed by the app. */ - ret = -EINVAL; - if (ctx->flags & IORING_SETUP_SQPOLL) { - /* IPI related flags don't make sense with SQPOLL */ - if (ctx->flags & (IORING_SETUP_COOP_TASKRUN | - IORING_SETUP_TASKRUN_FLAG | - IORING_SETUP_DEFER_TASKRUN)) - goto err; + if (ctx->flags & (IORING_SETUP_SQPOLL|IORING_SETUP_COOP_TASKRUN)) ctx->notify_method = TWA_SIGNAL_NO_IPI; - } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) { - ctx->notify_method = TWA_SIGNAL_NO_IPI; - } else { - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG && - !(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) - goto err; + else ctx->notify_method = TWA_SIGNAL; - } - - /* HYBRID_IOPOLL only valid with IOPOLL */ - if ((ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_HYBRID_IOPOLL)) == - IORING_SETUP_HYBRID_IOPOLL) - goto err; - - /* - * For DEFER_TASKRUN we require the completion task to be the same as the - * submission task. 
This implies that there is only one submitter, so enforce - * that. - */ - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN && - !(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) { - goto err; - } /* * This is just grabbed for accounting purposes. When a process exits, From 7919292a961421bfdb22f83c16657684c96076b3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 5 Feb 2025 11:36:42 +0000 Subject: [PATCH 010/108] io_uring/kbuf: remove legacy kbuf bulk allocation Legacy provided buffers are slow and discouraged in favour of the ring variant. Remove the bulk allocation to keep it simpler as we don't care about performance. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a064d70370e590efed8076e9501ae4cfc20fe0ca.1738724373.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 30 +++++------------------------- 1 file changed, 5 insertions(+), 25 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 8e72de7712ac9..f152afdf0bc74 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -501,12 +501,9 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return 0; } -#define IO_BUFFER_ALLOC_BATCH 64 - static int io_refill_buffer_cache(struct io_ring_ctx *ctx) { - struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH]; - int allocated; + struct io_buffer *buf; /* * Completions that don't happen inline (eg not under uring_lock) will @@ -524,27 +521,10 @@ static int io_refill_buffer_cache(struct io_ring_ctx *ctx) spin_unlock(&ctx->completion_lock); } - /* - * No free buffers and no completion entries either. Allocate a new - * batch of buffer entries and add those to our freelist. - */ - - allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT, - ARRAY_SIZE(bufs), (void **) bufs); - if (unlikely(!allocated)) { - /* - * Bulk alloc is all-or-nothing. If we fail to get a batch, - * retry single alloc to be on the safe side. - */ - bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL); - if (!bufs[0]) - return -ENOMEM; - allocated = 1; - } - - while (allocated) - list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache); - + buf = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL); + if (!buf) + return -ENOMEM; + list_add_tail(&buf->list, &ctx->io_buffers_cache); return 0; } From 9afe6847cff78e7f3aa8f4c920265cf298033251 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 5 Feb 2025 11:36:43 +0000 Subject: [PATCH 011/108] io_uring/kbuf: remove legacy kbuf kmem cache Remove the kmem cache used by legacy provided buffers. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8195c207d8524d94e972c0c82de99282289f7f5c.1738724373.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 -- io_uring/io_uring.h | 1 - io_uring/kbuf.c | 6 ++---- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e34a92c73a5d8..6fa1e88e40fbe 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3927,8 +3927,6 @@ static int __init io_uring_init(void) req_cachep = kmem_cache_create("io_kiocb", sizeof(struct io_kiocb), &kmem_args, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); - io_buf_cachep = KMEM_CACHE(io_buffer, - SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); iou_wq = alloc_workqueue("iou_exit", WQ_UNBOUND, 64); BUG_ON(!iou_wq); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index ab619e63ef39c..85bc8f76ca190 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -418,7 +418,6 @@ static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) } extern struct kmem_cache *req_cachep; -extern struct kmem_cache *io_buf_cachep; static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx) { diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index f152afdf0bc74..2e1561c9220fe 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -20,8 +20,6 @@ /* BIDs are addressed by a 16-bit field in a CQE */ #define MAX_BIDS_PER_BGID (1 << 16) -struct kmem_cache *io_buf_cachep; - struct io_provide_buf { struct file *file; __u64 addr; @@ -411,7 +409,7 @@ void io_destroy_buffers(struct io_ring_ctx *ctx) list_for_each_safe(item, tmp, &ctx->io_buffers_cache) { buf = list_entry(item, struct io_buffer, list); - kmem_cache_free(io_buf_cachep, buf); + kfree(buf); } } @@ -521,7 +519,7 @@ static int io_refill_buffer_cache(struct io_ring_ctx *ctx) spin_unlock(&ctx->completion_lock); } - buf = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL); + buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); if (!buf) return -ENOMEM; list_add_tail(&buf->list, &ctx->io_buffers_cache); From dd4fbb11e7ccc15dbb197a5bbfb2ca8bfda89fcd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 5 Feb 2025 11:36:44 +0000 Subject: [PATCH 012/108] io_uring/kbuf: move locking into io_kbuf_drop() Move the burden of locking out of the caller into io_kbuf_drop(), that will help with furher refactoring. 
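A small userspace analogy of the move (a pthread mutex stands in for the completion spinlock; all names are invented): the helper takes the lock itself, so callers no longer wrap every call site in lock/unlock and the locking rule lives in one place.

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static int buffers_outstanding = 2;

    /* before: callers had to hold the lock around this */
    static void drop_buffer_unlocked(void)
    {
        buffers_outstanding--;
    }

    /* after: the helper owns the locking, callers just call it */
    static void drop_buffer(void)
    {
        pthread_mutex_lock(&lock);
        drop_buffer_unlocked();
        pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
        drop_buffer();
        drop_buffer();
        printf("outstanding: %d\n", buffers_outstanding);
        return 0;
    }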
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/530f0cf1f06963029399f819a9a58b1a34bebef3.1738724373.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 +---- io_uring/kbuf.h | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 6fa1e88e40fbe..ed7c9081352a4 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -398,11 +398,8 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) static void io_clean_op(struct io_kiocb *req) { - if (req->flags & REQ_F_BUFFER_SELECTED) { - spin_lock(&req->ctx->completion_lock); + if (unlikely(req->flags & REQ_F_BUFFER_SELECTED)) io_kbuf_drop(req); - spin_unlock(&req->ctx->completion_lock); - } if (req->flags & REQ_F_NEED_CLEANUP) { const struct io_cold_def *def = &io_cold_defs[req->opcode]; diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index bd80c44c5af1e..310f94a0727a6 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -174,13 +174,13 @@ static inline void __io_put_kbuf_list(struct io_kiocb *req, int len, static inline void io_kbuf_drop(struct io_kiocb *req) { - lockdep_assert_held(&req->ctx->completion_lock); - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return; + spin_lock(&req->ctx->completion_lock); /* len == 0 is fine here, non-ring will always drop all of it */ __io_put_kbuf_list(req, 0, &req->ctx->io_buffers_comp); + spin_unlock(&req->ctx->completion_lock); } static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int len, From dc39fb1093ea33019f192c93b77b863282e10162 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 5 Feb 2025 11:36:45 +0000 Subject: [PATCH 013/108] io_uring/kbuf: simplify __io_put_kbuf As a preparation step remove an optimisation from __io_put_kbuf() trying to use the locked cache. With that __io_put_kbuf_list() is only used with ->io_buffers_comp, and we remove the explicit list argument. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1b7f1394ec4afc7f96b35a61f5992e27c49fd067.1738724373.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 26 +++----------------------- io_uring/kbuf.h | 7 +++---- 2 files changed, 6 insertions(+), 27 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 2e1561c9220fe..3a43af9f7061f 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -70,29 +70,9 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags) { - /* - * We can add this buffer back to two lists: - * - * 1) The io_buffers_cache list. This one is protected by the - * ctx->uring_lock. If we already hold this lock, add back to this - * list as we can grab it from issue as well. - * 2) The io_buffers_comp list. This one is protected by the - * ctx->completion_lock. - * - * We migrate buffers from the comp_list to the issue cache list - * when we need one. 
- */ - if (issue_flags & IO_URING_F_UNLOCKED) { - struct io_ring_ctx *ctx = req->ctx; - - spin_lock(&ctx->completion_lock); - __io_put_kbuf_list(req, len, &ctx->io_buffers_comp); - spin_unlock(&ctx->completion_lock); - } else { - lockdep_assert_held(&req->ctx->uring_lock); - - __io_put_kbuf_list(req, len, &req->ctx->io_buffers_cache); - } + spin_lock(&req->ctx->completion_lock); + __io_put_kbuf_list(req, len); + spin_unlock(&req->ctx->completion_lock); } static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 310f94a0727a6..1f28770648298 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -160,14 +160,13 @@ static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) return ret; } -static inline void __io_put_kbuf_list(struct io_kiocb *req, int len, - struct list_head *list) +static inline void __io_put_kbuf_list(struct io_kiocb *req, int len) { if (req->flags & REQ_F_BUFFER_RING) { __io_put_kbuf_ring(req, len, 1); } else { req->buf_index = req->kbuf->bgid; - list_add(&req->kbuf->list, list); + list_add(&req->kbuf->list, &req->ctx->io_buffers_comp); req->flags &= ~REQ_F_BUFFER_SELECTED; } } @@ -179,7 +178,7 @@ static inline void io_kbuf_drop(struct io_kiocb *req) spin_lock(&req->ctx->completion_lock); /* len == 0 is fine here, non-ring will always drop all of it */ - __io_put_kbuf_list(req, 0, &req->ctx->io_buffers_comp); + __io_put_kbuf_list(req, 0); spin_unlock(&req->ctx->completion_lock); } From 13ee854e7c04236a47a5beaacdcf51eb0bc7a8fa Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 5 Feb 2025 11:36:46 +0000 Subject: [PATCH 014/108] io_uring/kbuf: remove legacy kbuf caching Remove all struct io_buffer caches. It makes it a fair bit simpler. Apart from from killing a bunch of lines and juggling between lists, __io_put_kbuf_list() doesn't need ->completion_lock locking now. 
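A hedged userspace sketch of the simplification (malloc/free standing in for kmalloc/kfree; names invented): once the reuse caches are gone, dropping a legacy buffer is a plain free and no longer needs a lock.

    #include <pthread.h>
    #include <stdlib.h>

    struct buffer { struct buffer *next; };

    static pthread_mutex_t comp_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct buffer *comp_list;  /* the old reuse cache */

    /* old scheme: park the buffer on a cached list under the lock */
    static void put_buffer_cached(struct buffer *buf)
    {
        pthread_mutex_lock(&comp_lock);
        buf->next = comp_list;
        comp_list = buf;
        pthread_mutex_unlock(&comp_lock);
    }

    /* new scheme: no cache, no lock, just free it */
    static void put_buffer(struct buffer *buf)
    {
        free(buf);
    }

    int main(void)
    {
        struct buffer *a = malloc(sizeof(*a));
        struct buffer *b = malloc(sizeof(*b));

        if (a)
            put_buffer_cached(a);
        if (b)
            put_buffer(b);
        /* drain the demo cache */
        while (comp_list) {
            struct buffer *next = comp_list->next;
            free(comp_list);
            comp_list = next;
        }
        return 0;
    }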
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/18287217466ee2576ea0b1e72daccf7b22c7e856.1738724373.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 -- io_uring/io_uring.c | 2 -- io_uring/kbuf.c | 57 +++++----------------------------- io_uring/kbuf.h | 5 ++- 4 files changed, 9 insertions(+), 58 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3def525a1da37..e2fef264ff8b8 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -360,7 +360,6 @@ struct io_ring_ctx { spinlock_t completion_lock; - struct list_head io_buffers_comp; struct list_head cq_overflow_list; struct hlist_head waitid_list; @@ -379,8 +378,6 @@ struct io_ring_ctx { unsigned int file_alloc_start; unsigned int file_alloc_end; - struct list_head io_buffers_cache; - /* Keep this last, we don't need it for the fast path */ struct wait_queue_head poll_wq; struct io_restriction restrictions; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ed7c9081352a4..969caaccce9d8 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -323,7 +323,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->sqo_sq_wait); INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); - INIT_LIST_HEAD(&ctx->io_buffers_cache); ret = io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX, sizeof(struct async_poll), 0); ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, @@ -348,7 +347,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) spin_lock_init(&ctx->completion_lock); raw_spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); - INIT_LIST_HEAD(&ctx->io_buffers_comp); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 3a43af9f7061f..caf5b9bb2aecc 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -70,9 +70,7 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags) { - spin_lock(&req->ctx->completion_lock); __io_put_kbuf_list(req, len); - spin_unlock(&req->ctx->completion_lock); } static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, @@ -345,7 +343,9 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *nxt; nxt = list_first_entry(&bl->buf_list, struct io_buffer, list); - list_move(&nxt->list, &ctx->io_buffers_cache); + list_del(&nxt->list); + kfree(nxt); + if (++i == nbufs) return i; cond_resched(); @@ -363,8 +363,6 @@ static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) void io_destroy_buffers(struct io_ring_ctx *ctx) { struct io_buffer_list *bl; - struct list_head *item, *tmp; - struct io_buffer *buf; while (1) { unsigned long index = 0; @@ -378,19 +376,6 @@ void io_destroy_buffers(struct io_ring_ctx *ctx) break; io_put_bl(ctx, bl); } - - /* - * Move deferred locked entries to cache before pruning - */ - spin_lock(&ctx->completion_lock); - if (!list_empty(&ctx->io_buffers_comp)) - list_splice_init(&ctx->io_buffers_comp, &ctx->io_buffers_cache); - spin_unlock(&ctx->completion_lock); - - list_for_each_safe(item, tmp, &ctx->io_buffers_cache) { - buf = list_entry(item, struct io_buffer, list); - kfree(buf); - } } static void io_destroy_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) @@ -479,33 
+464,6 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return 0; } -static int io_refill_buffer_cache(struct io_ring_ctx *ctx) -{ - struct io_buffer *buf; - - /* - * Completions that don't happen inline (eg not under uring_lock) will - * add to ->io_buffers_comp. If we don't have any free buffers, check - * the completion list and splice those entries first. - */ - if (!list_empty_careful(&ctx->io_buffers_comp)) { - spin_lock(&ctx->completion_lock); - if (!list_empty(&ctx->io_buffers_comp)) { - list_splice_init(&ctx->io_buffers_comp, - &ctx->io_buffers_cache); - spin_unlock(&ctx->completion_lock); - return 0; - } - spin_unlock(&ctx->completion_lock); - } - - buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); - if (!buf) - return -ENOMEM; - list_add_tail(&buf->list, &ctx->io_buffers_cache); - return 0; -} - static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, struct io_buffer_list *bl) { @@ -514,12 +472,11 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, int i, bid = pbuf->bid; for (i = 0; i < pbuf->nbufs; i++) { - if (list_empty(&ctx->io_buffers_cache) && - io_refill_buffer_cache(ctx)) + buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); + if (!buf) break; - buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer, - list); - list_move_tail(&buf->list, &bl->buf_list); + + list_add_tail(&buf->list, &bl->buf_list); buf->addr = addr; buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); buf->bid = bid; diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 1f28770648298..c0b9636c5c4ae 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -166,8 +166,9 @@ static inline void __io_put_kbuf_list(struct io_kiocb *req, int len) __io_put_kbuf_ring(req, len, 1); } else { req->buf_index = req->kbuf->bgid; - list_add(&req->kbuf->list, &req->ctx->io_buffers_comp); req->flags &= ~REQ_F_BUFFER_SELECTED; + kfree(req->kbuf); + req->kbuf = NULL; } } @@ -176,10 +177,8 @@ static inline void io_kbuf_drop(struct io_kiocb *req) if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return; - spin_lock(&req->ctx->completion_lock); /* len == 0 is fine here, non-ring will always drop all of it */ __io_put_kbuf_list(req, 0); - spin_unlock(&req->ctx->completion_lock); } static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int len, From e150e70fce425e1cdfc227974893cad9fb90a0d3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 5 Feb 2025 11:36:47 +0000 Subject: [PATCH 015/108] io_uring/kbuf: open code __io_put_kbuf() __io_put_kbuf() is a trivial wrapper, open code it into __io_put_kbufs(). 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9dc17380272b48d56c95992c6f9eaacd5546e1d3.1738724373.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 5 ----- io_uring/kbuf.h | 4 +--- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index caf5b9bb2aecc..d612e4c15b0ed 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -68,11 +68,6 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) return true; } -void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags) -{ - __io_put_kbuf_list(req, len); -} - static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, struct io_buffer_list *bl) { diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index c0b9636c5c4ae..055b7a672f2e0 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -74,8 +74,6 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); -void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags); - bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, @@ -194,7 +192,7 @@ static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int len, if (!__io_put_kbuf_ring(req, len, nbufs)) ret |= IORING_CQE_F_BUF_MORE; } else { - __io_put_kbuf(req, len, issue_flags); + __io_put_kbuf_list(req, len); } return ret; } From 54e00d9a612ab93f37f612a5ccd7c0c4f8a31cea Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 5 Feb 2025 11:36:48 +0000 Subject: [PATCH 016/108] io_uring/kbuf: introduce io_kbuf_drop_legacy() io_kbuf_drop() is only used for legacy provided buffers, and so __io_put_kbuf_list() is never called for REQ_F_BUFFER_RING. Remove the dead branch out of __io_put_kbuf_list(), rename it into io_kbuf_drop_legacy() and use it directly instead of io_kbuf_drop(). 
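A small illustrative sketch, not the kernel code: once every remaining caller guarantees one of the two cases can no longer occur, the branch for it is dead and the helper can be renamed after the case it actually handles. All names below are invented.

    #include <assert.h>
    #include <stdio.h>

    #define BUF_SELECTED (1u << 0)
    #define BUF_RING     (1u << 1)

    struct req { unsigned int flags; int kbuf; };

    /* callers only reach this for legacy buffers, so the ring branch is
     * gone and the name says what is left */
    static void kbuf_drop_legacy(struct req *req)
    {
        assert(!(req->flags & BUF_RING));
        if (!(req->flags & BUF_SELECTED))
            return;
        req->flags &= ~BUF_SELECTED;
        req->kbuf = 0;
    }

    int main(void)
    {
        struct req r = { .flags = BUF_SELECTED, .kbuf = 42 };

        kbuf_drop_legacy(&r);
        printf("flags=%u kbuf=%d\n", r.flags, r.kbuf);
        return 0;
    }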
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c8cc73e2272f09a86ecbdad9ebdd8304f8e583c0.1738724373.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- io_uring/kbuf.c | 10 ++++++++++ io_uring/kbuf.h | 24 ++---------------------- 3 files changed, 13 insertions(+), 23 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 969caaccce9d8..ec98a0ec6f34e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -397,7 +397,7 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) static void io_clean_op(struct io_kiocb *req) { if (unlikely(req->flags & REQ_F_BUFFER_SELECTED)) - io_kbuf_drop(req); + io_kbuf_drop_legacy(req); if (req->flags & REQ_F_NEED_CLEANUP) { const struct io_cold_def *def = &io_cold_defs[req->opcode]; diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index d612e4c15b0ed..815fb58da3ba6 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -50,6 +50,16 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx, return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } +void io_kbuf_drop_legacy(struct io_kiocb *req) +{ + if (WARN_ON_ONCE(!(req->flags & REQ_F_BUFFER_SELECTED))) + return; + req->buf_index = req->kbuf->bgid; + req->flags &= ~REQ_F_BUFFER_SELECTED; + kfree(req->kbuf); + req->kbuf = NULL; +} + bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 055b7a672f2e0..3e18c916afc60 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -75,6 +75,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); +void io_kbuf_drop_legacy(struct io_kiocb *req); struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, unsigned int bgid); @@ -158,27 +159,6 @@ static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) return ret; } -static inline void __io_put_kbuf_list(struct io_kiocb *req, int len) -{ - if (req->flags & REQ_F_BUFFER_RING) { - __io_put_kbuf_ring(req, len, 1); - } else { - req->buf_index = req->kbuf->bgid; - req->flags &= ~REQ_F_BUFFER_SELECTED; - kfree(req->kbuf); - req->kbuf = NULL; - } -} - -static inline void io_kbuf_drop(struct io_kiocb *req) -{ - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return; - - /* len == 0 is fine here, non-ring will always drop all of it */ - __io_put_kbuf_list(req, 0); -} - static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs, unsigned issue_flags) { @@ -192,7 +172,7 @@ static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int len, if (!__io_put_kbuf_ring(req, len, nbufs)) ret |= IORING_CQE_F_BUF_MORE; } else { - __io_put_kbuf_list(req, len); + io_kbuf_drop_legacy(req); } return ret; } From 5d3e51240d89678b87b5dc6987ea572048a0f0eb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 5 Feb 2025 11:36:49 +0000 Subject: [PATCH 017/108] io_uring/kbuf: uninline __io_put_kbufs __io_put_kbufs() and other helper functions are too large to be inlined, compilers would normally refuse to do so. Uninline it and move together with io_kbuf_commit into kbuf.c. 
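A minimal sketch of the mechanics with invented names, shown as a single translation unit for brevity: the header keeps only a declaration, while the one out-of-line definition moves to the .c file instead of being duplicated as a large static inline into every includer.

    #include <stdio.h>

    struct req { unsigned int flags; int buf_index; };

    /* what the header would carry after the change: a declaration only */
    unsigned int put_bufs(struct req *req, int len, int nbufs);

    /* what the .c file would carry: the single out-of-line definition;
     * before the change this body lived in the header as a static inline */
    unsigned int put_bufs(struct req *req, int len, int nbufs)
    {
        (void)len;
        (void)nbufs;
        return req->flags | (unsigned int)req->buf_index;
    }

    int main(void)
    {
        struct req r = { .flags = 1, .buf_index = 4 };

        printf("%u\n", put_bufs(&r, 0, 1));
        return 0;
    }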
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3dade7f55ad590e811aff83b1ec55c9c04e17b2b.1738724373.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 60 +++++++++++++++++++++++++++++++++++++++ io_uring/kbuf.h | 74 +++++++------------------------------------------ 2 files changed, 70 insertions(+), 64 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 815fb58da3ba6..3478be6d02abb 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -20,6 +20,9 @@ /* BIDs are addressed by a 16-bit field in a CQE */ #define MAX_BIDS_PER_BGID (1 << 16) +/* Mapped buffer ring, return io_uring_buf from head */ +#define io_ring_head_to_buf(br, head, mask) &(br)->bufs[(head) & (mask)] + struct io_provide_buf { struct file *file; __u64 addr; @@ -29,6 +32,34 @@ struct io_provide_buf { __u16 bid; }; +bool io_kbuf_commit(struct io_kiocb *req, + struct io_buffer_list *bl, int len, int nr) +{ + if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT))) + return true; + + req->flags &= ~REQ_F_BUFFERS_COMMIT; + + if (unlikely(len < 0)) + return true; + + if (bl->flags & IOBL_INC) { + struct io_uring_buf *buf; + + buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask); + if (WARN_ON_ONCE(len > buf->len)) + len = buf->len; + buf->len -= len; + if (buf->len) { + buf->addr += len; + return false; + } + } + + bl->head += nr; + return true; +} + static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, unsigned int bgid) { @@ -323,6 +354,35 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg) return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs); } +static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) +{ + struct io_buffer_list *bl = req->buf_list; + bool ret = true; + + if (bl) { + ret = io_kbuf_commit(req, bl, len, nr); + req->buf_index = bl->bgid; + } + req->flags &= ~REQ_F_BUFFER_RING; + return ret; +} + +unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs) +{ + unsigned int ret; + + ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); + + if (unlikely(!(req->flags & REQ_F_BUFFER_RING))) { + io_kbuf_drop_legacy(req); + return ret; + } + + if (!__io_put_kbuf_ring(req, len, nbufs)) + ret |= IORING_CQE_F_BUF_MORE; + return ret; +} + static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer_list *bl, unsigned nbufs) { diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 3e18c916afc60..2ec0b983ce243 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -77,6 +77,10 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); void io_kbuf_drop_legacy(struct io_kiocb *req); +unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs); +bool io_kbuf_commit(struct io_kiocb *req, + struct io_buffer_list *bl, int len, int nr); + struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, unsigned int bgid); @@ -115,77 +119,19 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) return false; } -/* Mapped buffer ring, return io_uring_buf from head */ -#define io_ring_head_to_buf(br, head, mask) &(br)->bufs[(head) & (mask)] - -static inline bool io_kbuf_commit(struct io_kiocb *req, - struct io_buffer_list *bl, int len, int nr) -{ - if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT))) - return true; - - req->flags &= ~REQ_F_BUFFERS_COMMIT; - - if (unlikely(len < 0)) - return true; - 
- if (bl->flags & IOBL_INC) { - struct io_uring_buf *buf; - - buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask); - if (WARN_ON_ONCE(len > buf->len)) - len = buf->len; - buf->len -= len; - if (buf->len) { - buf->addr += len; - return false; - } - } - - bl->head += nr; - return true; -} - -static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) -{ - struct io_buffer_list *bl = req->buf_list; - bool ret = true; - - if (bl) { - ret = io_kbuf_commit(req, bl, len, nr); - req->buf_index = bl->bgid; - } - req->flags &= ~REQ_F_BUFFER_RING; - return ret; -} - -static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int len, - int nbufs, unsigned issue_flags) -{ - unsigned int ret; - - if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) - return 0; - - ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); - if (req->flags & REQ_F_BUFFER_RING) { - if (!__io_put_kbuf_ring(req, len, nbufs)) - ret |= IORING_CQE_F_BUF_MORE; - } else { - io_kbuf_drop_legacy(req); - } - return ret; -} - static inline unsigned int io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags) { - return __io_put_kbufs(req, len, 1, issue_flags); + if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) + return 0; + return __io_put_kbufs(req, len, 1); } static inline unsigned int io_put_kbufs(struct io_kiocb *req, int len, int nbufs, unsigned issue_flags) { - return __io_put_kbufs(req, len, nbufs, issue_flags); + if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) + return 0; + return __io_put_kbufs(req, len, nbufs); } #endif From 1533376b131f5d76f8739e89efc78c4687d96bd3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 5 Feb 2025 12:48:56 -0700 Subject: [PATCH 018/108] io_uring/cancel: add generic remove_all helper Any opcode that is cancelable ends up defining its own remove all helper, which iterates the pending list and cancels matches. Add a generic helper for it, which can be used by them. 
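The next two patches convert futex and waitid over to this helper; the shape of such a conversion is sketched below. The "foo" names are placeholders, not real io_uring symbols, and this is illustration rather than part of the patch.

/* an opcode's remove_all collapses to a one-liner once it has a
 * per-request cancel callback that only needs the request itself */
static bool __io_foo_cancel(struct io_kiocb *req);      /* hypothetical opcode hook */

bool io_foo_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx,
                       bool cancel_all)
{
        return io_cancel_remove_all(ctx, tctx, &ctx->foo_list, cancel_all,
                                    __io_foo_cancel);
}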
Signed-off-by: Jens Axboe --- io_uring/cancel.c | 21 +++++++++++++++++++++ io_uring/cancel.h | 4 ++++ 2 files changed, 25 insertions(+) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 4841935678394..4feacc57be63b 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -341,3 +341,24 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg) fput(file); return ret; } + +bool io_cancel_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, + struct hlist_head *list, bool cancel_all, + bool (*cancel)(struct io_kiocb *)) +{ + struct hlist_node *tmp; + struct io_kiocb *req; + bool found = false; + + lockdep_assert_held(&ctx->uring_lock); + + hlist_for_each_entry_safe(req, tmp, list, hash_node) { + if (!io_match_task_safe(req, tctx, cancel_all)) + continue; + hlist_del_init(&req->hash_node); + if (cancel(req)) + found = true; + } + + return found; +} diff --git a/io_uring/cancel.h b/io_uring/cancel.h index bbfea2cd00eaf..80734a0a2b268 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -24,6 +24,10 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg); bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd); +bool io_cancel_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, + struct hlist_head *list, bool cancel_all, + bool (*cancel)(struct io_kiocb *)); + static inline bool io_cancel_match_sequence(struct io_kiocb *req, int sequence) { if (req->cancel_seq_set && sequence == req->work.cancel_seq) From e855b9138470da9c1d2fe340acf653bd2af03922 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 5 Feb 2025 12:51:26 -0700 Subject: [PATCH 019/108] io_uring/futex: convert to io_cancel_remove_all() Use the generic helper for cancelations. 
Signed-off-by: Jens Axboe --- io_uring/futex.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/io_uring/futex.c b/io_uring/futex.c index 43e2143255f57..47b8c229a2e3f 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -90,7 +90,7 @@ static bool io_futexv_claim(struct io_futex *iof) return true; } -static bool __io_futex_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) +static bool __io_futex_cancel(struct io_kiocb *req) { /* futex wake already done or in progress */ if (req->opcode == IORING_OP_FUTEX_WAIT) { @@ -128,7 +128,7 @@ int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, if (req->cqe.user_data != cd->data && !(cd->flags & IORING_ASYNC_CANCEL_ANY)) continue; - if (__io_futex_cancel(ctx, req)) + if (__io_futex_cancel(req)) nr++; if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) break; @@ -144,21 +144,7 @@ int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, bool io_futex_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all) { - struct hlist_node *tmp; - struct io_kiocb *req; - bool found = false; - - lockdep_assert_held(&ctx->uring_lock); - - hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) { - if (!io_match_task_safe(req, tctx, cancel_all)) - continue; - hlist_del_init(&req->hash_node); - __io_futex_cancel(ctx, req); - found = true; - } - - return found; + return io_cancel_remove_all(ctx, tctx, &ctx->futex_list, cancel_all, __io_futex_cancel); } int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) From 7d9944f5061e49cab5ee0e1c9507c2e8f94d41b8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 5 Feb 2025 12:52:46 -0700 Subject: [PATCH 020/108] io_uring/waitid: convert to io_cancel_remove_all() Use the generic helper for cancelations. 
Signed-off-by: Jens Axboe --- io_uring/waitid.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/io_uring/waitid.c b/io_uring/waitid.c index 15a7daf3ff4f3..87d19710d68a3 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -132,7 +132,7 @@ static void io_waitid_complete(struct io_kiocb *req, int ret) io_req_set_res(req, ret, 0); } -static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) +static bool __io_waitid_cancel(struct io_kiocb *req) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct io_waitid_async *iwa = req->async_data; @@ -170,7 +170,7 @@ int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, if (req->cqe.user_data != cd->data && !(cd->flags & IORING_ASYNC_CANCEL_ANY)) continue; - if (__io_waitid_cancel(ctx, req)) + if (__io_waitid_cancel(req)) nr++; if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) break; @@ -186,21 +186,7 @@ int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all) { - struct hlist_node *tmp; - struct io_kiocb *req; - bool found = false; - - lockdep_assert_held(&ctx->uring_lock); - - hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { - if (!io_match_task_safe(req, tctx, cancel_all)) - continue; - hlist_del_init(&req->hash_node); - __io_waitid_cancel(ctx, req); - found = true; - } - - return found; + return io_cancel_remove_all(ctx, tctx, &ctx->waitid_list, cancel_all, __io_waitid_cancel); } static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req) From 8fa374f90b721ae5f56605ff67fc94a8b583e10c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 5 Feb 2025 13:13:58 -0700 Subject: [PATCH 021/108] io_uring/cancel: add generic cancel helper Any opcode that is cancelable ends up defining its own cancel helper for finding and canceling a specific request. Add a generic helper that can be used for this purpose. 
Signed-off-by: Jens Axboe --- io_uring/cancel.c | 21 +++++++++++++++++++++ io_uring/cancel.h | 4 ++++ 2 files changed, 25 insertions(+) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 4feacc57be63b..0870060bac7ca 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -362,3 +362,24 @@ bool io_cancel_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, return found; } + +int io_cancel_remove(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + unsigned int issue_flags, struct hlist_head *list, + bool (*cancel)(struct io_kiocb *)) +{ + struct hlist_node *tmp; + struct io_kiocb *req; + int nr = 0; + + io_ring_submit_lock(ctx, issue_flags); + hlist_for_each_entry_safe(req, tmp, list, hash_node) { + if (!io_cancel_req_match(req, cd)) + continue; + if (cancel(req)) + nr++; + if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) + break; + } + io_ring_submit_unlock(ctx, issue_flags); + return nr ?: -ENOENT; +} diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 80734a0a2b268..43e9bb74e9d19 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -28,6 +28,10 @@ bool io_cancel_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, struct hlist_head *list, bool cancel_all, bool (*cancel)(struct io_kiocb *)); +int io_cancel_remove(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + unsigned int issue_flags, struct hlist_head *list, + bool (*cancel)(struct io_kiocb *)); + static inline bool io_cancel_match_sequence(struct io_kiocb *req, int sequence) { if (req->cancel_seq_set && sequence == req->work.cancel_seq) From 2eaa2fac4704d59f2fd07e496114c39303e54a18 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 5 Feb 2025 13:15:57 -0700 Subject: [PATCH 022/108] io_uring/futex: use generic io_cancel_remove() helper Don't implement our own loop rolling and checking, just use the generic helper to find and cancel requests. Signed-off-by: Jens Axboe --- io_uring/futex.c | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/io_uring/futex.c b/io_uring/futex.c index 47b8c229a2e3f..ede6279cadc6a 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -116,29 +116,7 @@ static bool __io_futex_cancel(struct io_kiocb *req) int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags) { - struct hlist_node *tmp; - struct io_kiocb *req; - int nr = 0; - - if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED)) - return -ENOENT; - - io_ring_submit_lock(ctx, issue_flags); - hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) { - if (req->cqe.user_data != cd->data && - !(cd->flags & IORING_ASYNC_CANCEL_ANY)) - continue; - if (__io_futex_cancel(req)) - nr++; - if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) - break; - } - io_ring_submit_unlock(ctx, issue_flags); - - if (nr) - return nr; - - return -ENOENT; + return io_cancel_remove(ctx, cd, issue_flags, &ctx->futex_list, __io_futex_cancel); } bool io_futex_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, From 932de5e35fda2844aa258c640d198dbbce74bb8f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 5 Feb 2025 13:16:29 -0700 Subject: [PATCH 023/108] io_uring/waitid: use generic io_cancel_remove() helper Don't implement our own loop rolling and checking, just use the generic helper to find and cancel requests. 
Signed-off-by: Jens Axboe --- io_uring/waitid.c | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/io_uring/waitid.c b/io_uring/waitid.c index 87d19710d68a3..5c443e5f6d928 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -158,29 +158,7 @@ static bool __io_waitid_cancel(struct io_kiocb *req) int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags) { - struct hlist_node *tmp; - struct io_kiocb *req; - int nr = 0; - - if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED)) - return -ENOENT; - - io_ring_submit_lock(ctx, issue_flags); - hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { - if (req->cqe.user_data != cd->data && - !(cd->flags & IORING_ASYNC_CANCEL_ANY)) - continue; - if (__io_waitid_cancel(req)) - nr++; - if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) - break; - } - io_ring_submit_unlock(ctx, issue_flags); - - if (nr) - return nr; - - return -ENOENT; + return io_cancel_remove(ctx, cd, issue_flags, &ctx->waitid_list, __io_waitid_cancel); } bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, From 7c71a0af81ba72de9b2c501065e4e718aba9a271 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 8 Feb 2025 10:50:34 -0700 Subject: [PATCH 024/108] io_uring/net: improve recv bundles Current recv bundles are only supported for multishot receives, and additionally they also always post at least 2 CQEs if more data is available than what a buffer will hold. This happens because the initial bundle recv will do a single buffer, and then do the rest of what is in the socket as a followup receive. As shown in a test program, if 1k buffers are available and 32k is available to receive in the socket, you'd get the following completions: bundle=1, mshot=0 cqe res 1024 cqe res 1024 [...] cqe res 1024 bundle=1, mshot=1 cqe res 1024 cqe res 31744 where bundle=1 && mshot=0 will post 32 1k completions, and bundle=1 && mshot=1 will post a 1k completion and then a 31k completion. To support bundle recv without multishot, it's possible to simply retry the recv immediately and post a single completion, rather than split it into two completions. With the below patch, the same test looks as follows: bundle=1, mshot=0 cqe res 32768 bundle=1, mshot=1 cqe res 32768 where mshot=0 works fine for bundles, and both of them post just a single 32k completion rather than split it into separate completions. Posting fewer completions is always a nice win, and not needing multishot for proper bundle efficiency is nice for cases that can't necessarily use multishot. 
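For reference, the userspace side of the test described above looks roughly like the sketch below: a ring-provided buffer group of 1k buffers plus a single non-multishot bundle recv. This is illustrative only; it assumes a liburing recent enough to expose IORING_RECVSEND_BUNDLE and io_uring_setup_buf_ring(), omits error handling, and sock_fd stands for a connected socket obtained elsewhere.

#include <liburing.h>

#define BGID    0
#define BUF_SZ  1024
#define NR_BUFS 32

static char bufs[NR_BUFS][BUF_SZ];

static void arm_bundle_recv(struct io_uring *ring, int sock_fd)
{
        struct io_uring_buf_ring *br;
        struct io_uring_sqe *sqe;
        int i, err;

        /* register a buffer ring with NR_BUFS 1k buffers in group BGID */
        br = io_uring_setup_buf_ring(ring, NR_BUFS, BGID, 0, &err);
        for (i = 0; i < NR_BUFS; i++)
                io_uring_buf_ring_add(br, bufs[i], BUF_SZ, i,
                                      io_uring_buf_ring_mask(NR_BUFS), i);
        io_uring_buf_ring_advance(br, NR_BUFS);

        /* one bundle recv, no multishot */
        sqe = io_uring_get_sqe(ring);
        io_uring_prep_recv(sqe, sock_fd, NULL, 0, 0);
        sqe->flags |= IOSQE_BUFFER_SELECT;
        sqe->buf_group = BGID;
        sqe->ioprio |= IORING_RECVSEND_BUNDLE;
        io_uring_submit(ring);
}

With the patch, 32k of pending socket data then completes as a single CQE with res == 32768 spanning multiple buffers, matching the bundle=1, mshot=0 output shown above.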
Reported-by: Norman Maurer Link: https://lore.kernel.org/r/184f9f92-a682-4205-a15d-89e18f664502@kernel.dk Fixes: 2f9c9515bdfd ("io_uring/net: support bundles for recv") Signed-off-by: Jens Axboe --- io_uring/net.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/io_uring/net.c b/io_uring/net.c index 17852a6616ffe..10344b3a6d89c 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -76,6 +76,7 @@ struct io_sr_msg { /* initialised and used only by !msg send variants */ u16 buf_group; u16 buf_index; + bool retry; void __user *msg_control; /* used only for send zerocopy */ struct io_kiocb *notif; @@ -187,6 +188,7 @@ static inline void io_mshot_prep_retry(struct io_kiocb *req, req->flags &= ~REQ_F_BL_EMPTY; sr->done_io = 0; + sr->retry = false; sr->len = 0; /* get from the provided buffer */ req->buf_index = sr->buf_group; } @@ -402,6 +404,7 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); sr->done_io = 0; + sr->retry = false; if (req->opcode != IORING_OP_SEND) { if (sqe->addr2 || sqe->file_index) @@ -785,6 +788,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); sr->done_io = 0; + sr->retry = false; if (unlikely(sqe->file_index || sqe->addr2)) return -EINVAL; @@ -833,6 +837,9 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_recvmsg_prep_setup(req); } +/* bits to clear in old and inherit in new cflags on bundle retry */ +#define CQE_F_MASK (IORING_CQE_F_SOCK_NONEMPTY|IORING_CQE_F_MORE) + /* * Finishes io_recv and io_recvmsg. * @@ -852,9 +859,19 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, if (sr->flags & IORING_RECVSEND_BUNDLE) { cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), issue_flags); + if (sr->retry) + cflags = req->cqe.flags | (cflags & CQE_F_MASK); /* bundle with no more immediate buffers, we're done */ if (req->flags & REQ_F_BL_EMPTY) goto finish; + /* if more is available, retry and append to this one */ + if (!sr->retry && kmsg->msg.msg_inq > 0 && *ret > 0) { + req->cqe.flags = cflags & ~CQE_F_MASK; + sr->len = kmsg->msg.msg_inq; + sr->done_io += *ret; + sr->retry = true; + return false; + } } else { cflags |= io_put_kbuf(req, *ret, issue_flags); } @@ -1233,6 +1250,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_kiocb *notif; zc->done_io = 0; + zc->retry = false; req->flags |= REQ_F_POLL_NO_LAZY; if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) From 0e8934724f78602635d6e11c97ef48caa693cb65 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Tue, 11 Feb 2025 13:19:56 -0700 Subject: [PATCH 025/108] io_uring: use IO_REQ_LINK_FLAGS more Replace the 2 instances of REQ_F_LINK | REQ_F_HARDLINK with the more commonly used IO_REQ_LINK_FLAGS. 
Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250211202002.3316324-1-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ec98a0ec6f34e..8bb8c099c3e12 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -110,11 +110,13 @@ #define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \ IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS) +#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) + #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ REQ_F_ASYNC_DATA) -#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ +#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | IO_REQ_LINK_FLAGS | \ REQ_F_REISSUE | IO_REQ_CLEAN_FLAGS) #define IO_TCTX_REFS_CACHE_NR (1U << 10) @@ -131,7 +133,6 @@ struct io_defer_entry { /* requests with any of those set should undergo io_disarm_next() */ #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) -#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) /* * No waiters. It's larger than any valid value of the tw counter @@ -1157,7 +1158,7 @@ static inline void io_req_local_work_add(struct io_kiocb *req, * We don't know how many reuqests is there in the link and whether * they can even be queued lazily, fall back to non-lazy. */ - if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) + if (req->flags & IO_REQ_LINK_FLAGS) flags &= ~IOU_F_TWQ_LAZY_WAKE; guard(rcu)(); From 60e6ce746bfcbe7541c205085c11ce0ff2ffd014 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 12 Feb 2025 09:48:05 -0700 Subject: [PATCH 026/108] io_uring: pass ctx instead of req to io_init_req_drain() io_init_req_drain() takes a struct io_kiocb *req argument but only uses it to get struct io_ring_ctx *ctx. The caller already knows the ctx, so pass it instead. Drop "req" from the function name since it operates on the ctx rather than a specific req. 
Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250212164807.3681036-1-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8bb8c099c3e12..4a0944a57d963 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1997,9 +1997,8 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, return true; } -static void io_init_req_drain(struct io_kiocb *req) +static void io_init_drain(struct io_ring_ctx *ctx) { - struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *head = ctx->submit_state.link.head; ctx->drain_active = true; @@ -2061,7 +2060,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (sqe_flags & IOSQE_IO_DRAIN) { if (ctx->drain_disabled) return io_init_fail_req(req, -EOPNOTSUPP); - io_init_req_drain(req); + io_init_drain(ctx); } } if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { From 496f56bf9f1acf11ce14489f34d81ba6e4023f42 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Sun, 16 Feb 2025 15:58:59 -0700 Subject: [PATCH 027/108] io_uring/rsrc: avoid NULL check in io_put_rsrc_node() Most callers of io_put_rsrc_node() already check that node is non-NULL: - io_rsrc_data_free() - io_sqe_buffer_register() - io_reset_rsrc_node() - io_req_put_rsrc_nodes() (REQ_F_BUF_NODE indicates non-NULL buf_node) Only io_splice_cleanup() can call io_put_rsrc_node() with a NULL node. So move the NULL check there. Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250216225900.1075446-1-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/rsrc.h | 2 +- io_uring/splice.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 190f7ee45de93..a6d883c62b221 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -83,7 +83,7 @@ static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { lockdep_assert_held(&ctx->uring_lock); - if (node && !--node->refs) + if (!--node->refs) io_free_rsrc_node(ctx, node); } diff --git a/io_uring/splice.c b/io_uring/splice.c index 5b84f16306116..7b89bd84d486d 100644 --- a/io_uring/splice.c +++ b/io_uring/splice.c @@ -51,7 +51,8 @@ void io_splice_cleanup(struct io_kiocb *req) { struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice); - io_put_rsrc_node(req->ctx, sp->rsrc_node); + if (sp->rsrc_node) + io_put_rsrc_node(req->ctx, sp->rsrc_node); } static struct file *io_splice_get_file(struct io_kiocb *req, From bcf8a0293a019bb0c4aebafdebe9a1e7a923249a Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Sun, 16 Feb 2025 19:25:04 -0700 Subject: [PATCH 028/108] io_uring: introduce type alias for io_tw_state In preparation for changing how io_tw_state is passed, introduce a type alias io_tw_token_t for struct io_tw_state *. This allows for changing the representation in one place, without having to update the many functions that just forward their struct io_tw_state * argument. Also add a comment to struct io_tw_state to explain its purpose. 
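The pattern is easiest to see outside the kernel. The standalone sketch below is plain GNU C, not kernel code: an empty struct acts as a capability token ("you are in task-work context") while a typedef keeps the choice of representation in one place, so functions that merely forward the token never need touching when that representation changes, as the follow-up patch in this series does.

#include <stdio.h>

struct tw_state { };                    /* empty: carries no data, only meaning (GNU C extension) */
typedef struct tw_state *tw_token_t;    /* representation chosen in exactly one place */

static void complete_request(int req_id, tw_token_t tw)
{
        (void)tw;       /* the token is proof of calling context, nothing more */
        printf("completing request %d from task-work context\n", req_id);
}

/* forwarders like this are untouched if the typedef switches to by-value */
static void handle_poll_event(int req_id, tw_token_t tw)
{
        complete_request(req_id, tw);
}

int main(void)
{
        struct tw_state ts = {};        /* only the core loop mints the token */

        handle_poll_event(42, &ts);
        return 0;
}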
Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250217022511.1150145-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 9 ++++++++- io_uring/futex.c | 16 ++++++++-------- io_uring/io_uring.c | 28 ++++++++++++++-------------- io_uring/io_uring.h | 8 ++++---- io_uring/msg_ring.c | 2 +- io_uring/notif.c | 4 ++-- io_uring/poll.c | 18 +++++++++--------- io_uring/poll.h | 4 +++- io_uring/rw.c | 4 ++-- io_uring/rw.h | 3 ++- io_uring/timeout.c | 16 ++++++++-------- io_uring/uring_cmd.c | 2 +- io_uring/waitid.c | 8 ++++---- 13 files changed, 66 insertions(+), 56 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index e2fef264ff8b8..ea4694ee9d199 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -436,8 +436,15 @@ struct io_ring_ctx { struct io_mapped_region param_region; }; +/* + * Token indicating function is called in task work context: + * ctx->uring_lock is held and any completions generated will be flushed. + * ONLY core io_uring.c should instantiate this struct. + */ struct io_tw_state { }; +/* Alias to use in code that doesn't instantiate struct io_tw_state */ +typedef struct io_tw_state *io_tw_token_t; enum { REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, @@ -563,7 +570,7 @@ enum { REQ_F_HAS_METADATA = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT), }; -typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); +typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw); struct io_task_work { struct llist_node node; diff --git a/io_uring/futex.c b/io_uring/futex.c index ede6279cadc6a..b7581766406ce 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -44,30 +44,30 @@ void io_futex_cache_free(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->futex_cache, kfree); } -static void __io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) +static void __io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) { req->async_data = NULL; hlist_del_init(&req->hash_node); - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); } -static void io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) +static void io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) { struct io_futex_data *ifd = req->async_data; struct io_ring_ctx *ctx = req->ctx; - io_tw_lock(ctx, ts); + io_tw_lock(ctx, tw); if (!io_alloc_cache_put(&ctx->futex_cache, ifd)) kfree(ifd); - __io_futex_complete(req, ts); + __io_futex_complete(req, tw); } -static void io_futexv_complete(struct io_kiocb *req, struct io_tw_state *ts) +static void io_futexv_complete(struct io_kiocb *req, io_tw_token_t tw) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); struct futex_vector *futexv = req->async_data; - io_tw_lock(req->ctx, ts); + io_tw_lock(req->ctx, tw); if (!iof->futexv_unqueued) { int res; @@ -79,7 +79,7 @@ static void io_futexv_complete(struct io_kiocb *req, struct io_tw_state *ts) kfree(req->async_data); req->flags &= ~REQ_F_ASYNC_DATA; - __io_futex_complete(req, ts); + __io_futex_complete(req, tw); } static bool io_futexv_claim(struct io_futex *iof) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4a0944a57d963..b44ff88717258 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -543,7 +543,7 @@ static void io_queue_iowq(struct io_kiocb *req) io_queue_linked_timeout(link); } -static void io_req_queue_iowq_tw(struct io_kiocb *req, struct io_tw_state *ts) +static void io_req_queue_iowq_tw(struct io_kiocb *req, io_tw_token_t 
tw) { io_queue_iowq(req); } @@ -1022,7 +1022,7 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) return nxt; } -static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts) +static void ctx_flush_and_put(struct io_ring_ctx *ctx, io_tw_token_t tw) { if (!ctx) return; @@ -1277,7 +1277,7 @@ static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, } static int __io_run_local_work_loop(struct llist_node **node, - struct io_tw_state *ts, + io_tw_token_t tw, int events) { int ret = 0; @@ -1288,7 +1288,7 @@ static int __io_run_local_work_loop(struct llist_node **node, io_task_work.node); INDIRECT_CALL_2(req->io_task_work.func, io_poll_task_func, io_req_rw_complete, - req, ts); + req, tw); *node = next; if (++ret >= events) break; @@ -1297,7 +1297,7 @@ static int __io_run_local_work_loop(struct llist_node **node, return ret; } -static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts, +static int __io_run_local_work(struct io_ring_ctx *ctx, io_tw_token_t tw, int min_events, int max_events) { struct llist_node *node; @@ -1310,7 +1310,7 @@ static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts, atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); again: min_events -= ret; - ret = __io_run_local_work_loop(&ctx->retry_llist.first, ts, max_events); + ret = __io_run_local_work_loop(&ctx->retry_llist.first, tw, max_events); if (ctx->retry_llist.first) goto retry_done; @@ -1319,7 +1319,7 @@ static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts, * running the pending items. */ node = llist_reverse_order(llist_del_all(&ctx->work_llist)); - ret += __io_run_local_work_loop(&node, ts, max_events - ret); + ret += __io_run_local_work_loop(&node, tw, max_events - ret); ctx->retry_llist.first = node; loops++; @@ -1357,15 +1357,15 @@ static int io_run_local_work(struct io_ring_ctx *ctx, int min_events, return ret; } -static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts) +static void io_req_task_cancel(struct io_kiocb *req, io_tw_token_t tw) { - io_tw_lock(req->ctx, ts); + io_tw_lock(req->ctx, tw); io_req_defer_failed(req, req->cqe.res); } -void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts) +void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw) { - io_tw_lock(req->ctx, ts); + io_tw_lock(req->ctx, tw); if (unlikely(io_should_terminate_tw())) io_req_defer_failed(req, -EFAULT); else if (req->flags & REQ_F_FORCE_ASYNC) @@ -1583,7 +1583,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) return 0; } -void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) +void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw) { io_req_complete_defer(req); } @@ -1763,9 +1763,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) return ret; } -int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts) +int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw) { - io_tw_lock(req->ctx, ts); + io_tw_lock(req->ctx, tw); return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT| IO_URING_F_COMPLETE_DEFER); } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 85bc8f76ca190..6c46d9cdd7aa8 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -90,9 +90,9 @@ void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, unsigned flags); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); -void 
io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); +void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw); void io_req_task_queue_fail(struct io_kiocb *req, int ret); -void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); +void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw); struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries); struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); void tctx_task_work(struct callback_head *cb); @@ -104,7 +104,7 @@ int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, int start, int end); void io_req_queue_iowq(struct io_kiocb *req); -int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); +int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); void __io_submit_flush_completions(struct io_ring_ctx *ctx); @@ -376,7 +376,7 @@ static inline bool io_task_work_pending(struct io_ring_ctx *ctx) return task_work_pending(current) || io_local_work_pending(ctx); } -static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts) +static inline void io_tw_lock(struct io_ring_ctx *ctx, io_tw_token_t tw) { lockdep_assert_held(&ctx->uring_lock); } diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 7e6f68e911f10..0bbcbbcdebfd9 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -71,7 +71,7 @@ static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx) return target_ctx->task_complete; } -static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts) +static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw) { struct io_ring_ctx *ctx = req->ctx; diff --git a/io_uring/notif.c b/io_uring/notif.c index ee3a33510b3c2..7bd92538dccbc 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -11,7 +11,7 @@ static const struct ubuf_info_ops io_ubuf_ops; -static void io_notif_tw_complete(struct io_kiocb *notif, struct io_tw_state *ts) +static void io_notif_tw_complete(struct io_kiocb *notif, io_tw_token_t tw) { struct io_notif_data *nd = io_notif_to_data(notif); @@ -29,7 +29,7 @@ static void io_notif_tw_complete(struct io_kiocb *notif, struct io_tw_state *ts) } nd = nd->next; - io_req_task_complete(notif, ts); + io_req_task_complete(notif, tw); } while (nd); } diff --git a/io_uring/poll.c b/io_uring/poll.c index bb1c0cd4f809a..176854882ba66 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -220,7 +220,7 @@ static inline void io_poll_execute(struct io_kiocb *req, int res) * req->cqe.res. IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot * poll and that the result is stored in req->cqe. 
*/ -static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) +static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) { int v; @@ -288,7 +288,7 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) return IOU_POLL_REMOVE_POLL_USE_RES; } } else { - int ret = io_poll_issue(req, ts); + int ret = io_poll_issue(req, tw); if (ret == IOU_STOP_MULTISHOT) return IOU_POLL_REMOVE_POLL_USE_RES; else if (ret == IOU_REQUEUE) @@ -311,11 +311,11 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) return IOU_POLL_NO_ACTION; } -void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) +void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw) { int ret; - ret = io_poll_check_events(req, ts); + ret = io_poll_check_events(req, tw); if (ret == IOU_POLL_NO_ACTION) { io_kbuf_recycle(req, 0); return; @@ -335,7 +335,7 @@ void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) poll = io_kiocb_to_cmd(req, struct io_poll); req->cqe.res = mangle_poll(req->cqe.res & poll->events); } else if (ret == IOU_POLL_REISSUE) { - io_req_task_submit(req, ts); + io_req_task_submit(req, tw); return; } else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) { req->cqe.res = ret; @@ -343,14 +343,14 @@ void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) } io_req_set_res(req, req->cqe.res, 0); - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); } else { - io_tw_lock(req->ctx, ts); + io_tw_lock(req->ctx, tw); if (ret == IOU_POLL_REMOVE_POLL_USE_RES) - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE) - io_req_task_submit(req, ts); + io_req_task_submit(req, tw); else io_req_defer_failed(req, ret); } diff --git a/io_uring/poll.h b/io_uring/poll.h index 04ede93113dc7..27e2db2ed4aee 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#include + #define IO_POLL_ALLOC_CACHE_MAX 32 enum { @@ -43,4 +45,4 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); -void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts); +void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw); diff --git a/io_uring/rw.c b/io_uring/rw.c index 7aa1e4c9f64a3..16f12f94943f7 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -511,7 +511,7 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res) return res; } -void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) +void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct kiocb *kiocb = &rw->kiocb; @@ -528,7 +528,7 @@ void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) req->cqe.flags |= io_put_kbuf(req, req->cqe.res, 0); io_req_rw_cleanup(req, 0); - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); } static void io_complete_rw(struct kiocb *kiocb, long res) diff --git a/io_uring/rw.h b/io_uring/rw.h index eaa59bd648709..a45e0c71b59d6 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include struct io_meta_state { @@ -39,7 +40,7 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags); int io_write(struct io_kiocb *req, unsigned int issue_flags); void io_readv_writev_cleanup(struct io_kiocb *req); void io_rw_fail(struct io_kiocb *req); -void 
io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts); +void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw); int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags); void io_rw_cache_free(const void *entry); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 48fc8cf707843..fec6ec7beb62e 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -65,7 +65,7 @@ static inline bool io_timeout_finish(struct io_timeout *timeout, static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer); -static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) +static void io_timeout_complete(struct io_kiocb *req, io_tw_token_t tw) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_timeout_data *data = req->async_data; @@ -82,7 +82,7 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) } } - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); } static __cold bool io_flush_killed_timeouts(struct list_head *list, int err) @@ -154,9 +154,9 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) io_flush_killed_timeouts(&list, 0); } -static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) +static void io_req_tw_fail_links(struct io_kiocb *link, io_tw_token_t tw) { - io_tw_lock(link->ctx, ts); + io_tw_lock(link->ctx, tw); while (link) { struct io_kiocb *nxt = link->link; long res = -ECANCELED; @@ -165,7 +165,7 @@ static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) res = link->cqe.res; link->link = NULL; io_req_set_res(link, res, 0); - io_req_task_complete(link, ts); + io_req_task_complete(link, tw); link = nxt; } } @@ -312,7 +312,7 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) return 0; } -static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *ts) +static void io_req_task_link_timeout(struct io_kiocb *req, io_tw_token_t tw) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_kiocb *prev = timeout->prev; @@ -330,11 +330,11 @@ static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *t ret = -ECANCELED; } io_req_set_res(req, ret ?: -ETIME, 0); - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); io_put_req(prev); } else { io_req_set_res(req, -ETIME, 0); - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); } } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index e6701b7aa1474..8bdf2c9b3fef9 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -102,7 +102,7 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, } EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); -static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) +static void io_uring_cmd_work(struct io_kiocb *req, io_tw_token_t tw) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); unsigned int flags = IO_URING_F_COMPLETE_DEFER; diff --git a/io_uring/waitid.c b/io_uring/waitid.c index 5c443e5f6d928..347b8f53efa7d 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -16,7 +16,7 @@ #include "waitid.h" #include "../kernel/exit.h" -static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts); +static void io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw); #define IO_WAITID_CANCEL_FLAG BIT(31) #define IO_WAITID_REF_MASK GENMASK(30, 0) @@ -185,13 +185,13 @@ static inline bool 
io_waitid_drop_issue_ref(struct io_kiocb *req) return true; } -static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts) +static void io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw) { struct io_waitid_async *iwa = req->async_data; struct io_ring_ctx *ctx = req->ctx; int ret; - io_tw_lock(ctx, ts); + io_tw_lock(ctx, tw); ret = __do_wait(&iwa->wo); @@ -221,7 +221,7 @@ static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts) } io_waitid_complete(req, ret); - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); } static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, From 94a4274bb6ebc5b4293559304d0f00928de0d8c0 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Sun, 16 Feb 2025 19:25:05 -0700 Subject: [PATCH 029/108] io_uring: pass struct io_tw_state by value 8e5b3b89ecaf ("io_uring: remove struct io_tw_state::locked") removed the only field of io_tw_state but kept it as a task work callback argument to "forc[e] users not to invoke them carelessly out of a wrong context". Passing the struct io_tw_state * argument adds a few instructions to all callers that can't inline the functions and see the argument is unused. So pass struct io_tw_state by value instead. Since it's a 0-sized value, it can be passed without any instructions needed to initialize it. Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250217022511.1150145-2-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/io_uring.c | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index ea4694ee9d199..123e693687305 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -444,7 +444,7 @@ struct io_ring_ctx { struct io_tw_state { }; /* Alias to use in code that doesn't instantiate struct io_tw_state */ -typedef struct io_tw_state *io_tw_token_t; +typedef struct io_tw_state io_tw_token_t; enum { REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b44ff88717258..b688953d1de8e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -255,7 +255,7 @@ static __cold void io_fallback_req_func(struct work_struct *work) percpu_ref_get(&ctx->refs); mutex_lock(&ctx->uring_lock); llist_for_each_entry_safe(req, tmp, node, io_task_work.node) - req->io_task_work.func(req, &ts); + req->io_task_work.func(req, ts); io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); @@ -1052,24 +1052,24 @@ struct llist_node *io_handle_tw_list(struct llist_node *node, io_task_work.node); if (req->ctx != ctx) { - ctx_flush_and_put(ctx, &ts); + ctx_flush_and_put(ctx, ts); ctx = req->ctx; mutex_lock(&ctx->uring_lock); percpu_ref_get(&ctx->refs); } INDIRECT_CALL_2(req->io_task_work.func, io_poll_task_func, io_req_rw_complete, - req, &ts); + req, ts); node = next; (*count)++; if (unlikely(need_resched())) { - ctx_flush_and_put(ctx, &ts); + ctx_flush_and_put(ctx, ts); ctx = NULL; cond_resched(); } } while (node && *count < max_entries); - ctx_flush_and_put(ctx, &ts); + ctx_flush_and_put(ctx, ts); return node; } @@ -1341,7 +1341,7 @@ static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, if (!io_local_work_pending(ctx)) return 0; - return __io_run_local_work(ctx, &ts, min_events, + return __io_run_local_work(ctx, ts, min_events, max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); } @@ -1352,7 +1352,7 @@ static int 
io_run_local_work(struct io_ring_ctx *ctx, int min_events, int ret; mutex_lock(&ctx->uring_lock); - ret = __io_run_local_work(ctx, &ts, min_events, max_events); + ret = __io_run_local_work(ctx, ts, min_events, max_events); mutex_unlock(&ctx->uring_lock); return ret; } From 62aa9805d123165102273eb277f776aaca908e0e Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Tue, 11 Feb 2025 17:51:18 -0700 Subject: [PATCH 030/108] io_uring: use lockless_cq flag in io_req_complete_post() io_uring_create() computes ctx->lockless_cq as: ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL) So use it to simplify that expression in io_req_complete_post(). Signed-off-by: Caleb Sander Mateos Reviewed-by: Li Zetao Link: https://lore.kernel.org/r/20250212005119.3433005-1-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b688953d1de8e..bd1ab21ed5393 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -900,7 +900,7 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires * the submitter task context, IOPOLL protects with uring_lock. */ - if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) { + if (ctx->lockless_cq) { req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); return; From 3035deac0cd5bd9c8cacdcf5a1c488cbc87abc2d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Feb 2025 12:42:19 +0000 Subject: [PATCH 031/108] io_uring: introduce io_is_compat() A preparation patch adding a simple helper for gauging the compat state. It'll help us to optimise and compile out more code in the following commits. Signed-off-by: Pavel Begunkov Reviewed-by: Anuj Gupta Link: https://lore.kernel.org/r/1a87a640265196a67bc38300128e0bfd7839ab1f.1740400452.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 6c46d9cdd7aa8..d5c9b7a6911d0 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -147,6 +147,11 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) #endif } +static inline bool io_is_compat(struct io_ring_ctx *ctx) +{ + return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->compat); +} + static inline void io_req_task_work_add(struct io_kiocb *req) { __io_req_task_work_add(req, 0); From 0bba6fccbdcb28d284debc31150f84ef14f7e252 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Feb 2025 12:42:20 +0000 Subject: [PATCH 032/108] io_uring/cmd: optimise !CONFIG_COMPAT flags setting Use io_is_compat() to avoid extra overhead in io_uring_cmd() for flag setting when compat is compiled out. 
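The win here comes from IS_ENABLED() folding to a constant: with CONFIG_COMPAT off, io_is_compat() is compile-time false and every branch guarded by it disappears. A standalone model of the effect, ordinary userspace C with a plain macro standing in for the kconfig-generated IS_ENABLED():

#include <stdbool.h>
#include <stdio.h>

#define ENABLE_COMPAT 0         /* models a !CONFIG_COMPAT build */

struct ctx {
        bool compat;
};

static inline bool ctx_is_compat(const struct ctx *ctx)
{
        /* constant 0 short-circuits: the load and the branch fold away */
        return ENABLE_COMPAT && ctx->compat;
}

int main(void)
{
        struct ctx c = { .compat = true };

        if (ctx_is_compat(&c))
                printf("compat path\n");        /* dead code when ENABLE_COMPAT is 0 */
        else
                printf("native path\n");
        return 0;
}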
Signed-off-by: Pavel Begunkov Reviewed-by: Anuj Gupta Link: https://lore.kernel.org/r/f4d74c62d7cbddc386c0a9138ecd2b2ed6d3f146.1740400452.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 8bdf2c9b3fef9..14086a2664611 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -237,7 +237,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) issue_flags |= IO_URING_F_SQE128; if (ctx->flags & IORING_SETUP_CQE32) issue_flags |= IO_URING_F_CQE32; - if (ctx->compat) + if (io_is_compat(ctx)) issue_flags |= IO_URING_F_COMPAT; if (ctx->flags & IORING_SETUP_IOPOLL) { if (!file->f_op->uring_cmd_iopoll) From 82d187d356dcc257ecaa659e57e6c0546ec1cd2d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Feb 2025 12:42:21 +0000 Subject: [PATCH 033/108] io_uring/rw: compile out compat param passing Even when COMPAT is compiled out, we still have to pass ctx->compat to __import_iovec(). Replace the read with an indirection with a constant when the kernel doesn't support compat. Signed-off-by: Pavel Begunkov Reviewed-by: Anuj Gupta Link: https://lore.kernel.org/r/2819df9c8533c36b46d7baccbb317a0ec89da6cd.1740400452.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 16f12f94943f7..7133029b43966 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -46,7 +46,6 @@ static bool io_file_supports_nowait(struct io_kiocb *req, __poll_t mask) return false; } -#ifdef CONFIG_COMPAT static int io_iov_compat_buffer_select_prep(struct io_rw *rw) { struct compat_iovec __user *uiov; @@ -63,7 +62,6 @@ static int io_iov_compat_buffer_select_prep(struct io_rw *rw) rw->len = clen; return 0; } -#endif static int io_iov_buffer_select_prep(struct io_kiocb *req) { @@ -74,10 +72,8 @@ static int io_iov_buffer_select_prep(struct io_kiocb *req) if (rw->len != 1) return -EINVAL; -#ifdef CONFIG_COMPAT - if (req->ctx->compat) + if (io_is_compat(req->ctx)) return io_iov_compat_buffer_select_prep(rw); -#endif uiov = u64_to_user_ptr(rw->addr); if (copy_from_user(&iov, uiov, sizeof(*uiov))) @@ -120,7 +116,7 @@ static int __io_import_iovec(int ddir, struct io_kiocb *req, nr_segs = 1; } ret = __import_iovec(ddir, buf, sqe_len, nr_segs, &iov, &io->iter, - req->ctx->compat); + io_is_compat(req->ctx)); if (unlikely(ret < 0)) return ret; if (iov) { From 52524b281d5746cf9dbd53a7dffce9576e8ddd30 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Feb 2025 12:42:22 +0000 Subject: [PATCH 034/108] io_uring/rw: shrink io_iov_compat_buffer_select_prep Compat performance is not important and simplicity is more appreciated. Let's not be smart about it and use simpler copy_from_user() instead of access + __get_user pair. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b334a3a5040efa424ded58e4d8a6ef2554324266.1740400452.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 7133029b43966..22612a956e75a 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -48,18 +48,12 @@ static bool io_file_supports_nowait(struct io_kiocb *req, __poll_t mask) static int io_iov_compat_buffer_select_prep(struct io_rw *rw) { - struct compat_iovec __user *uiov; - compat_ssize_t clen; + struct compat_iovec __user *uiov = u64_to_user_ptr(rw->addr); + struct compat_iovec iov; - uiov = u64_to_user_ptr(rw->addr); - if (!access_ok(uiov, sizeof(*uiov))) - return -EFAULT; - if (__get_user(clen, &uiov->iov_len)) + if (copy_from_user(&iov, uiov, sizeof(iov))) return -EFAULT; - if (clen < 0) - return -EINVAL; - - rw->len = clen; + rw->len = iov.iov_len; return 0; } From 0cd64345c4ba127d27fa07a133d108ea92d38361 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Feb 2025 12:42:23 +0000 Subject: [PATCH 035/108] io_uring/waitid: use io_is_compat() Use io_is_compat() for consistency. Signed-off-by: Pavel Begunkov Reviewed-by: Anuj Gupta Link: https://lore.kernel.org/r/28c5b5f1f1bf7f4d18869dafe6e4147ce1bbf0f5.1740400452.git.asml.silence@gmail.com Link: https://lore.kernel.org/r/20250224172337.2009871-1-csander@purestorage.com [axboe: fold in improvement from Caleb, see link] Signed-off-by: Jens Axboe --- io_uring/waitid.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/io_uring/waitid.c b/io_uring/waitid.c index 347b8f53efa7d..54e69984cd8a8 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -42,7 +42,6 @@ static void io_waitid_free(struct io_kiocb *req) req->flags &= ~REQ_F_ASYNC_DATA; } -#ifdef CONFIG_COMPAT static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo) { struct compat_siginfo __user *infop; @@ -67,7 +66,6 @@ static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo) ret = false; goto done; } -#endif static bool io_waitid_copy_si(struct io_kiocb *req, int signo) { @@ -77,10 +75,8 @@ static bool io_waitid_copy_si(struct io_kiocb *req, int signo) if (!iw->infop) return true; -#ifdef CONFIG_COMPAT - if (req->ctx->compat) + if (io_is_compat(req->ctx)) return io_waitid_compat_copy_si(iw, signo); -#endif if (!user_write_access_begin(iw->infop, sizeof(*iw->infop))) return false; From 91864064622b17e74f49fd42689a052eaac4f08e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Feb 2025 12:42:24 +0000 Subject: [PATCH 036/108] io_uring/net: use io_is_compat() Use io_is_compat() for consistency. 
Signed-off-by: Pavel Begunkov Reviewed-by: Anuj Gupta Link: https://lore.kernel.org/r/fff93d9d08243284c5db5d546be766a82e85c130.1740400452.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 10344b3a6d89c..1d1107fd5beb0 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -317,7 +317,7 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req, iomsg->msg.msg_iter.nr_segs = 0; #ifdef CONFIG_COMPAT - if (unlikely(req->ctx->compat)) { + if (io_is_compat(req->ctx)) { struct compat_msghdr cmsg; ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE); @@ -428,10 +428,9 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->buf_list = NULL; } -#ifdef CONFIG_COMPAT - if (req->ctx->compat) + if (io_is_compat(req->ctx)) sr->msg_flags |= MSG_CMSG_COMPAT; -#endif + if (unlikely(!io_msg_alloc_async(req))) return -ENOMEM; if (req->opcode != IORING_OP_SENDMSG) @@ -717,7 +716,7 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, iomsg->msg.msg_iter.nr_segs = 0; #ifdef CONFIG_COMPAT - if (unlikely(req->ctx->compat)) { + if (io_is_compat(req->ctx)) { struct compat_msghdr cmsg; ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST); @@ -829,10 +828,9 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; } -#ifdef CONFIG_COMPAT - if (req->ctx->compat) + if (io_is_compat(req->ctx)) sr->msg_flags |= MSG_CMSG_COMPAT; -#endif + sr->nr_multishot_loops = 0; return io_recvmsg_prep_setup(req); } @@ -1293,10 +1291,9 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (zc->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; -#ifdef CONFIG_COMPAT - if (req->ctx->compat) + if (io_is_compat(req->ctx)) zc->msg_flags |= MSG_CMSG_COMPAT; -#endif + if (unlikely(!io_msg_alloc_async(req))) return -ENOMEM; if (req->opcode != IORING_OP_SENDMSG_ZC) From f6a89bf5278d6e15016a736db67043560d1b50d5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 23 Feb 2025 17:22:29 +0000 Subject: [PATCH 037/108] io_uring/net: fix accept multishot handling REQ_F_APOLL_MULTISHOT doesn't guarantee it's executed from the multishot context, so a multishot accept may get executed inline, fail io_req_post_cqe(), and ask the core code to kill the request with -ECANCELED by returning IOU_STOP_MULTISHOT even when a socket has been accepted and installed. Cc: stable@vger.kernel.org Fixes: 390ed29b5e425 ("io_uring: add IORING_ACCEPT_MULTISHOT for accept") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/51c6deb01feaa78b08565ca8f24843c017f5bc80.1740331076.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/net.c b/io_uring/net.c index 1d1107fd5beb0..926cdb8d3350c 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1641,6 +1641,8 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags) } io_req_set_res(req, ret, cflags); + if (!(issue_flags & IO_URING_F_MULTISHOT)) + return IOU_OK; return IOU_STOP_MULTISHOT; } From 185523ebc85342ed90c5a80034f281ac09fd29db Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 23 Feb 2025 17:22:30 +0000 Subject: [PATCH 038/108] io_uring/net: canonise accept mshot handling Use a more recognisable pattern for mshot accept, first try to post an mshot cqe if needed and after do terminating handling. 
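Seen from userspace, the contract both of these accept patches preserve is: keep consuming CQEs while IORING_CQE_F_MORE is set, and re-arm (or give up on a fatal error) once a CQE arrives without it. A sketch of that consumer loop, assuming liburing and a listening socket set up elsewhere; handle_new_connection() is a placeholder and error handling is kept minimal.

#include <liburing.h>
#include <unistd.h>

static void handle_new_connection(int fd)
{
        /* placeholder: a real server would register or serve the socket */
        close(fd);
}

static void accept_loop(struct io_uring *ring, int listen_fd)
{
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_multishot_accept(sqe, listen_fd, NULL, NULL, 0);
        io_uring_submit(ring);

        for (;;) {
                io_uring_wait_cqe(ring, &cqe);
                if (cqe->res >= 0)
                        handle_new_connection(cqe->res);
                if (!(cqe->flags & IORING_CQE_F_MORE)) {
                        /* multishot terminated (cqe->res < 0 on error); a real
                         * server would decide whether to re-arm or bail, here
                         * we simply re-arm for brevity */
                        sqe = io_uring_get_sqe(ring);
                        io_uring_prep_multishot_accept(sqe, listen_fd, NULL, NULL, 0);
                        io_uring_submit(ring);
                }
                io_uring_cqe_seen(ring, cqe);
        }
}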
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/daf5c0df7e2966deb0a115021c065fc6161a52d7.1740331076.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 926cdb8d3350c..8283a1f55192f 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1612,7 +1612,6 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags) } if (ret == -ERESTARTSYS) ret = -EINTR; - req_set_fail(req); } else if (!fixed) { fd_install(fd, file); ret = fd; @@ -1625,14 +1624,8 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags) if (!arg.is_empty) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { - io_req_set_res(req, ret, cflags); - return IOU_OK; - } - - if (ret < 0) - return ret; - if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { + if (ret >= 0 && (req->flags & REQ_F_APOLL_MULTISHOT) && + io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1) goto retry; if (issue_flags & IO_URING_F_MULTISHOT) @@ -1641,6 +1634,8 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags) } io_req_set_res(req, ret, cflags); + if (ret < 0) + req_set_fail(req); if (!(issue_flags & IO_URING_F_MULTISHOT)) return IOU_OK; return IOU_STOP_MULTISHOT; From c457eed55d80bc06c2c55cd5d7a4646f102db5d4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 23 Feb 2025 17:22:31 +0000 Subject: [PATCH 039/108] io_uring: make io_poll_issue() sturdier io_poll_issue() forwards the call to io_issue_sqe() and thus inherits some of the handling. That's not particularly failure resistant, as for example returning an innocently looking IOU_OK from a multishot issue will lead to severe bugs. Reimplement io_poll_issue() without io_issue_sqe()'s request completion logic. Remove extra checks as we know that req->file is already set, linked timeout are armed, and iopoll is not supported. Also cover it with warnings for now. The patch should be useful by itself, but it's also preparing the codebase for other future clean ups. 
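For readers unfamiliar with the return-code contract: IOU_OK means "the result is stored in the request and the caller performs the completion", while IOU_ISSUE_SKIP_COMPLETE means the completion has already been posted or deferred elsewhere. The poll task-work path does not run the generic single-shot completion step, so an IOU_OK escaping from a multishot issue could complete a request the poll machinery still owns, which is the kind of severe bug the message alludes to; hence the WARN_ON_ONCE() and the explicit translation in the new io_poll_issue(). A condensed sketch of the contract being enforced (see the diff below for the full function):

    ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]);
    WARN_ON_ONCE(ret == IOU_OK);            /* multishot must not return this */
    if (ret == IOU_ISSUE_SKIP_COMPLETE)     /* already completed elsewhere */
            ret = 0;
    return ret;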
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3096d7b1026d9a52426a598bdfc8d9d324555545.1740331076.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index bd1ab21ed5393..d169043fc35a9 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1720,15 +1720,13 @@ static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, return !!req->file; } -static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) +static inline int __io_issue_sqe(struct io_kiocb *req, + unsigned int issue_flags, + const struct io_issue_def *def) { - const struct io_issue_def *def = &io_issue_defs[req->opcode]; const struct cred *creds = NULL; int ret; - if (unlikely(!io_assign_file(req, def, issue_flags))) - return -EBADF; - if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) creds = override_creds(req->creds); @@ -1743,6 +1741,19 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (creds) revert_creds(creds); + return ret; +} + +static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) +{ + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + int ret; + + if (unlikely(!io_assign_file(req, def, issue_flags))) + return -EBADF; + + ret = __io_issue_sqe(req, issue_flags, def); + if (ret == IOU_OK) { if (issue_flags & IO_URING_F_COMPLETE_DEFER) io_req_complete_defer(req); @@ -1765,9 +1776,24 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw) { + const unsigned int issue_flags = IO_URING_F_NONBLOCK | + IO_URING_F_MULTISHOT | + IO_URING_F_COMPLETE_DEFER; + int ret; + io_tw_lock(req->ctx, tw); - return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT| - IO_URING_F_COMPLETE_DEFER); + + WARN_ON_ONCE(!req->file); + if (WARN_ON_ONCE(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EFAULT; + + ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]); + + WARN_ON_ONCE(ret == IOU_OK); + + if (ret == IOU_ISSUE_SKIP_COMPLETE) + ret = 0; + return ret; } struct io_wq_work *io_wq_free_work(struct io_wq_work *work) From c72282dd865ee66bc1b8fbc843deefe53beb426c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Feb 2025 19:45:03 +0000 Subject: [PATCH 040/108] io_uring/rw: allocate async data in io_prep_rw() rw always allocates async_data, so instead of doing that deeper in prep calls inside of io_prep_rw_setup(), be a bit more explicit and do that early on in io_prep_rw(). 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/5ead621051bc3374d1e8d96f816454906a6afd71.1740425922.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 22612a956e75a..7efc2337c5a0c 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -203,9 +203,6 @@ static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) { struct io_async_rw *rw; - if (io_rw_alloc_async(req)) - return -ENOMEM; - if (!do_import || io_do_buffer_select(req)) return 0; @@ -262,6 +259,9 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, u64 attr_type_mask; int ret; + if (io_rw_alloc_async(req)) + return -ENOMEM; + rw->kiocb.ki_pos = READ_ONCE(sqe->off); /* used for fixed read/write too - just read unconditionally */ req->buf_index = READ_ONCE(sqe->buf_index); From 74c942499917e5d011ae414a026dda00a995a09b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Feb 2025 19:45:04 +0000 Subject: [PATCH 041/108] io_uring/rw: rename io_import_iovec() io_import_iovec() is not limited to iovecs but also imports buffers for normal reads and selected buffers, rename it for clarity. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/91cea59340b61a8f52dc7b8e720274577a25188c.1740425922.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 7efc2337c5a0c..e636be4850a7a 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -76,7 +76,7 @@ static int io_iov_buffer_select_prep(struct io_kiocb *req) return 0; } -static int __io_import_iovec(int ddir, struct io_kiocb *req, +static int __io_import_rw_buffer(int ddir, struct io_kiocb *req, struct io_async_rw *io, unsigned int issue_flags) { @@ -122,13 +122,13 @@ static int __io_import_iovec(int ddir, struct io_kiocb *req, return 0; } -static inline int io_import_iovec(int rw, struct io_kiocb *req, - struct io_async_rw *io, - unsigned int issue_flags) +static inline int io_import_rw_buffer(int rw, struct io_kiocb *req, + struct io_async_rw *io, + unsigned int issue_flags) { int ret; - ret = __io_import_iovec(rw, req, io, issue_flags); + ret = __io_import_rw_buffer(rw, req, io, issue_flags); if (unlikely(ret < 0)) return ret; @@ -207,7 +207,7 @@ static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) return 0; rw = req->async_data; - return io_import_iovec(ddir, req, rw, 0); + return io_import_rw_buffer(ddir, req, rw, 0); } static inline void io_meta_save_state(struct io_async_rw *io) @@ -845,7 +845,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) loff_t *ppos; if (io_do_buffer_select(req)) { - ret = io_import_iovec(ITER_DEST, req, io, issue_flags); + ret = io_import_rw_buffer(ITER_DEST, req, io, issue_flags); if (unlikely(ret < 0)) return ret; } From 99fab04778da20d2b7e224cb6932eb2ad532f5d8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Feb 2025 19:45:05 +0000 Subject: [PATCH 042/108] io_uring/rw: extract helper for iovec import Split out a helper out of __io_import_rw_buffer() that handles vectored buffers. I'll need it for registered vectored buffers, but it also looks cleaner, especially with parameters being properly named. 
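One detail worth spelling out: the new helper keeps the existing small-vector optimisation, where a single segment uses the struct iovec embedded in io_async_rw (fast_iov) while anything larger uses an allocated array cached in free_iovec/free_iov_nr, so a retry or io-wq punt can reuse it instead of re-allocating. A reduced sketch of that selection, assuming the field names used in the diff below:

    struct iovec *iov;
    int nr_segs;

    if (io->free_iovec) {           /* previously allocated array exists, reuse it */
            nr_segs = io->free_iov_nr;
            iov = io->free_iovec;
    } else {                        /* common case: one inline segment */
            nr_segs = 1;
            iov = &io->fast_iov;
    }
    /* __import_iovec() may replace 'iov' with a bigger allocation, which is
     * then stashed back so later imports on this request can reuse it. */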
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/075470cfb24be38709d946815f35ec846d966f41.1740425922.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 57 ++++++++++++++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index e636be4850a7a..4f7fa25208201 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -76,41 +76,24 @@ static int io_iov_buffer_select_prep(struct io_kiocb *req) return 0; } -static int __io_import_rw_buffer(int ddir, struct io_kiocb *req, - struct io_async_rw *io, - unsigned int issue_flags) +static int io_import_vec(int ddir, struct io_kiocb *req, + struct io_async_rw *io, + const struct iovec __user *uvec, + size_t uvec_segs) { - const struct io_issue_def *def = &io_issue_defs[req->opcode]; - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + int ret, nr_segs; struct iovec *iov; - void __user *buf; - int nr_segs, ret; - size_t sqe_len; - - buf = u64_to_user_ptr(rw->addr); - sqe_len = rw->len; - - if (!def->vectored || req->flags & REQ_F_BUFFER_SELECT) { - if (io_do_buffer_select(req)) { - buf = io_buffer_select(req, &sqe_len, issue_flags); - if (!buf) - return -ENOBUFS; - rw->addr = (unsigned long) buf; - rw->len = sqe_len; - } - - return import_ubuf(ddir, buf, sqe_len, &io->iter); - } if (io->free_iovec) { nr_segs = io->free_iov_nr; iov = io->free_iovec; } else { - iov = &io->fast_iov; nr_segs = 1; + iov = &io->fast_iov; } - ret = __import_iovec(ddir, buf, sqe_len, nr_segs, &iov, &io->iter, - io_is_compat(req->ctx)); + + ret = __import_iovec(ddir, uvec, uvec_segs, nr_segs, &iov, &io->iter, + io_is_compat(req->ctx)); if (unlikely(ret < 0)) return ret; if (iov) { @@ -122,6 +105,28 @@ static int __io_import_rw_buffer(int ddir, struct io_kiocb *req, return 0; } +static int __io_import_rw_buffer(int ddir, struct io_kiocb *req, + struct io_async_rw *io, + unsigned int issue_flags) +{ + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + void __user *buf = u64_to_user_ptr(rw->addr); + size_t sqe_len = rw->len; + + if (def->vectored && !(req->flags & REQ_F_BUFFER_SELECT)) + return io_import_vec(ddir, req, io, buf, sqe_len); + + if (io_do_buffer_select(req)) { + buf = io_buffer_select(req, &sqe_len, issue_flags); + if (!buf) + return -ENOBUFS; + rw->addr = (unsigned long) buf; + rw->len = sqe_len; + } + return import_ubuf(ddir, buf, sqe_len, &io->iter); +} + static inline int io_import_rw_buffer(int rw, struct io_kiocb *req, struct io_async_rw *io, unsigned int issue_flags) From 7a9b0d6925b2b13640a94ca6e72a753bb39526b2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Feb 2025 19:45:06 +0000 Subject: [PATCH 043/108] io_uring/rw: open code io_prep_rw_setup() Open code io_prep_rw_setup() into its only caller, it doesn't provide any meaningful abstraction anymore. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/61ba72e2d46119db71f27ab908018e6a6cd6c064.1740425922.git.asml.silence@gmail.com [axboe: fold in 'ret' being unused fix] Signed-off-by: Jens Axboe --- io_uring/rw.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 4f7fa25208201..10a4a95ada028 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -204,17 +204,6 @@ static int io_rw_alloc_async(struct io_kiocb *req) return 0; } -static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) -{ - struct io_async_rw *rw; - - if (!do_import || io_do_buffer_select(req)) - return 0; - - rw = req->async_data; - return io_import_rw_buffer(ddir, req, rw, 0); -} - static inline void io_meta_save_state(struct io_async_rw *io) { io->meta_state.seed = io->meta.seed; @@ -287,10 +276,14 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, rw->addr = READ_ONCE(sqe->addr); rw->len = READ_ONCE(sqe->len); rw->flags = READ_ONCE(sqe->rw_flags); - ret = io_prep_rw_setup(req, ddir, do_import); - if (unlikely(ret)) - return ret; + if (do_import && !io_do_buffer_select(req)) { + struct io_async_rw *io = req->async_data; + + ret = io_import_rw_buffer(ddir, req, io, 0); + if (unlikely(ret)) + return ret; + } attr_type_mask = READ_ONCE(sqe->attr_type_mask); if (attr_type_mask) { @@ -301,9 +294,9 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, return -EINVAL; attr_ptr = READ_ONCE(sqe->attr_ptr); - ret = io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask); + return io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask); } - return ret; + return 0; } int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe) From c5b47d5a8c0d3c657751f803295213284f432672 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 24 Feb 2025 13:31:06 -0800 Subject: [PATCH 044/108] io_uring/rsrc: remove redundant check for valid imu The only caller to io_buffer_unmap already checks if the node's buf is not null, so no need to check again. 
Signed-off-by: Keith Busch Reviewed-by: Pavel Begunkov Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250224213116.3509093-2-kbusch@meta.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index af39b69eb4fde..f3a41132a9668 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -103,19 +103,16 @@ static int io_buffer_validate(struct iovec *iov) static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { + struct io_mapped_ubuf *imu = node->buf; unsigned int i; - if (node->buf) { - struct io_mapped_ubuf *imu = node->buf; - - if (!refcount_dec_and_test(&imu->refs)) - return; - for (i = 0; i < imu->nr_bvecs; i++) - unpin_user_page(imu->bvec[i].bv_page); - if (imu->acct_pages) - io_unaccount_mem(ctx, imu->acct_pages); - kvfree(imu); - } + if (!refcount_dec_and_test(&imu->refs)) + return; + for (i = 0; i < imu->nr_bvecs; i++) + unpin_user_page(imu->bvec[i].bv_page); + if (imu->acct_pages) + io_unaccount_mem(ctx, imu->acct_pages); + kvfree(imu); } struct io_rsrc_node *io_rsrc_node_alloc(int type) From a14ca7a413ec8a9f9184c543691f890a011ed98a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 24 Feb 2025 13:31:07 -0800 Subject: [PATCH 045/108] io_uring/nop: reuse req->buf_index There is already a field in io_kiocb that can store a registered buffer index, use that instead of stashing the value into struct io_nop. Signed-off-by: Keith Busch Reviewed-by: Pavel Begunkov Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250224213116.3509093-3-kbusch@meta.com Signed-off-by: Jens Axboe --- io_uring/nop.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/io_uring/nop.c b/io_uring/nop.c index 5e5196df650a1..ea539531cb5f6 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -16,7 +16,6 @@ struct io_nop { struct file *file; int result; int fd; - int buffer; unsigned int flags; }; @@ -40,9 +39,7 @@ int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) else nop->fd = -1; if (nop->flags & IORING_NOP_FIXED_BUFFER) - nop->buffer = READ_ONCE(sqe->buf_index); - else - nop->buffer = -1; + req->buf_index = READ_ONCE(sqe->buf_index); return 0; } @@ -69,7 +66,7 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags) ret = -EFAULT; io_ring_submit_lock(ctx, issue_flags); - node = io_rsrc_node_lookup(&ctx->buf_table, nop->buffer); + node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); if (node) { io_req_assign_buf_node(req, node); ret = 0; From 81cc96fcb3dcfb3d85df4e0eec56149b5b53c016 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Feb 2025 13:31:08 -0800 Subject: [PATCH 046/108] io_uring/net: reuse req->buf_index for sendzc There is already a field in io_kiocb that can store a registered buffer index, use that instead of stashing the value into struct io_sr_msg. 
Reviewed-by: Keith Busch Signed-off-by: Pavel Begunkov Reviewed-by: Pavel Begunkov Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250224213116.3509093-4-kbusch@meta.com Signed-off-by: Jens Axboe --- io_uring/net.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 8283a1f55192f..22fa1664a6b1d 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -75,7 +75,6 @@ struct io_sr_msg { u16 flags; /* initialised and used only by !msg send variants */ u16 buf_group; - u16 buf_index; bool retry; void __user *msg_control; /* used only for send zerocopy */ @@ -1287,7 +1286,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) zc->len = READ_ONCE(sqe->len); zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY; - zc->buf_index = READ_ONCE(sqe->buf_index); + req->buf_index = READ_ONCE(sqe->buf_index); if (zc->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; @@ -1363,7 +1362,7 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags) ret = -EFAULT; io_ring_submit_lock(ctx, issue_flags); - node = io_rsrc_node_lookup(&ctx->buf_table, sr->buf_index); + node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); if (node) { io_req_assign_buf_node(sr->notif, node); ret = 0; From 69d483d5f43e7a525246090c80f978b827104ad4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Feb 2025 13:31:09 -0800 Subject: [PATCH 047/108] io_uring/nvme: pass issue_flags to io_uring_cmd_import_fixed() io_uring_cmd_import_fixed() will need to know the io_uring execution state in following commits, for now just pass issue_flags into it without actually using. Reviewed-by: Keith Busch Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250224213116.3509093-5-kbusch@meta.com Signed-off-by: Jens Axboe --- drivers/nvme/host/ioctl.c | 10 ++++++---- include/linux/io_uring/cmd.h | 6 ++++-- io_uring/uring_cmd.c | 3 ++- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index e8930146847af..e0876bc9aacde 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -114,7 +114,8 @@ static struct request *nvme_alloc_user_request(struct request_queue *q, static int nvme_map_user_request(struct request *req, u64 ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - struct io_uring_cmd *ioucmd, unsigned int flags) + struct io_uring_cmd *ioucmd, unsigned int flags, + unsigned int iou_issue_flags) { struct request_queue *q = req->q; struct nvme_ns *ns = q->queuedata; @@ -142,7 +143,8 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, if (WARN_ON_ONCE(flags & NVME_IOCTL_VEC)) return -EINVAL; ret = io_uring_cmd_import_fixed(ubuffer, bufflen, - rq_data_dir(req), &iter, ioucmd); + rq_data_dir(req), &iter, ioucmd, + iou_issue_flags); if (ret < 0) goto out; ret = blk_rq_map_user_iov(q, req, NULL, &iter, GFP_KERNEL); @@ -194,7 +196,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, req->timeout = timeout; if (ubuffer && bufflen) { ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer, - meta_len, NULL, flags); + meta_len, NULL, flags, 0); if (ret) return ret; } @@ -514,7 +516,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, if (d.addr && d.data_len) { ret = nvme_map_user_request(req, d.addr, d.data_len, nvme_to_user_ptr(d.metadata), - d.metadata_len, ioucmd, vec); + d.metadata_len, ioucmd, vec, issue_flags); if (ret) 
return ret; } diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index abd0c8bd950ba..87150dc0a07cf 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -39,7 +39,8 @@ static inline void io_uring_cmd_private_sz_check(size_t cmd_sz) #if defined(CONFIG_IO_URING) int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd); + struct iov_iter *iter, void *ioucmd, + unsigned int issue_flags); /* * Completes the request, i.e. posts an io_uring CQE and deallocates @ioucmd @@ -67,7 +68,8 @@ void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd); #else static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd) + struct iov_iter *iter, void *ioucmd, + unsigned int issue_flags) { return -EOPNOTSUPP; } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 14086a2664611..28ed69c40756e 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -257,7 +257,8 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) } int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd) + struct iov_iter *iter, void *ioucmd, + unsigned int issue_flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); struct io_rsrc_node *node = req->buf_node; From 5d309914773370308eb98d1db664eb18f502c5a6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Feb 2025 13:31:10 -0800 Subject: [PATCH 048/108] io_uring: combine buffer lookup and import Registered buffer are currently imported in two steps, first we lookup a rsrc node and then use it to set up the iterator. The first part is usually done at the prep stage, and import happens whenever it's needed. As we want to defer binding to a node so that it works with linked requests, combine both steps into a single helper. 
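In practice this means a consumer of registered buffers no longer does the two-step dance of resolving the rsrc node at prep time and calling io_import_fixed() at issue time; it only records the buffer index at prep and calls the combined helper when the iterator is actually needed, with node binding happening lazily inside. A hedged sketch of the caller side, using the signature added in the diff below:

    /* issue time; only req->buf_index was recorded at prep */
    ret = io_import_reg_buf(req, &io->iter, rw->addr, rw->len,
                            ITER_DEST, issue_flags);
    if (unlikely(ret))      /* e.g. -EFAULT if buf_index resolves to nothing */
            return ret;
    /* the request now holds the buffer node and the iter is ready to use */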
Reviewed-by: Keith Busch Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250224213116.3509093-6-kbusch@meta.com Signed-off-by: Jens Axboe --- io_uring/net.c | 22 ++++------------------ io_uring/rsrc.c | 31 ++++++++++++++++++++++++++++++- io_uring/rsrc.h | 6 +++--- io_uring/rw.c | 9 +-------- io_uring/uring_cmd.c | 25 ++++--------------------- 5 files changed, 42 insertions(+), 51 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 22fa1664a6b1d..3d93af98967a6 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1357,24 +1357,10 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags) int ret; if (sr->flags & IORING_RECVSEND_FIXED_BUF) { - struct io_ring_ctx *ctx = req->ctx; - struct io_rsrc_node *node; - - ret = -EFAULT; - io_ring_submit_lock(ctx, issue_flags); - node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); - if (node) { - io_req_assign_buf_node(sr->notif, node); - ret = 0; - } - io_ring_submit_unlock(ctx, issue_flags); - - if (unlikely(ret)) - return ret; - - ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter, - node->buf, (u64)(uintptr_t)sr->buf, - sr->len); + sr->notif->buf_index = req->buf_index; + ret = io_import_reg_buf(sr->notif, &kmsg->msg.msg_iter, + (u64)(uintptr_t)sr->buf, sr->len, + ITER_SOURCE, issue_flags); if (unlikely(ret)) return ret; kmsg->msg.sg_from_iter = io_sg_from_iter; diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index f3a41132a9668..6cf7dba44d5b1 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -857,7 +857,7 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } -int io_import_fixed(int ddir, struct iov_iter *iter, +static int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, u64 buf_addr, size_t len) { @@ -916,6 +916,35 @@ int io_import_fixed(int ddir, struct iov_iter *iter, return 0; } +static inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, + unsigned issue_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_rsrc_node *node; + + if (req->flags & REQ_F_BUF_NODE) + return req->buf_node; + + io_ring_submit_lock(ctx, issue_flags); + node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); + if (node) + io_req_assign_buf_node(req, node); + io_ring_submit_unlock(ctx, issue_flags); + return node; +} + +int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, + u64 buf_addr, size_t len, int ddir, + unsigned issue_flags) +{ + struct io_rsrc_node *node; + + node = io_find_buf_node(req, issue_flags); + if (!node) + return -EFAULT; + return io_import_fixed(ddir, iter, node->buf, buf_addr, len); +} + /* Lock two rings at once. The rings must be different! 
*/ static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2) { diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index a6d883c62b221..ce199eb0ac9fb 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -50,9 +50,9 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node); void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data); int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr); -int io_import_fixed(int ddir, struct iov_iter *iter, - struct io_mapped_ubuf *imu, - u64 buf_addr, size_t len); +int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, + u64 buf_addr, size_t len, int ddir, + unsigned issue_flags); int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg); int io_sqe_buffers_unregister(struct io_ring_ctx *ctx); diff --git a/io_uring/rw.c b/io_uring/rw.c index 10a4a95ada028..ae44d7e5d200c 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -342,8 +342,6 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe int ddir) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - struct io_ring_ctx *ctx = req->ctx; - struct io_rsrc_node *node; struct io_async_rw *io; int ret; @@ -351,13 +349,8 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe if (unlikely(ret)) return ret; - node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); - if (!node) - return -EFAULT; - io_req_assign_buf_node(req, node); - io = req->async_data; - ret = io_import_fixed(ddir, &io->iter, node->buf, rw->addr, rw->len); + ret = io_import_reg_buf(req, &io->iter, rw->addr, rw->len, ddir, 0); iov_iter_save_state(&io->iter, &io->iter_state); return ret; } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 28ed69c40756e..31d5e0948af14 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -199,21 +199,9 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (ioucmd->flags & ~IORING_URING_CMD_MASK) return -EINVAL; - if (ioucmd->flags & IORING_URING_CMD_FIXED) { - struct io_ring_ctx *ctx = req->ctx; - struct io_rsrc_node *node; - u16 index = READ_ONCE(sqe->buf_index); - - node = io_rsrc_node_lookup(&ctx->buf_table, index); - if (unlikely(!node)) - return -EFAULT; - /* - * Pi node upfront, prior to io_uring_cmd_import_fixed() - * being called. This prevents destruction of the mapped buffer - * we'll need at actual import time. - */ - io_req_assign_buf_node(req, node); - } + if (ioucmd->flags & IORING_URING_CMD_FIXED) + req->buf_index = READ_ONCE(sqe->buf_index); + ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); return io_uring_cmd_prep_setup(req, sqe); @@ -261,13 +249,8 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, unsigned int issue_flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); - struct io_rsrc_node *node = req->buf_node; - - /* Must have had rsrc_node assigned at prep time */ - if (node) - return io_import_fixed(rw, iter, node->buf, ubuf, len); - return -EFAULT; + return io_import_reg_buf(req, iter, ubuf, len, rw, issue_flags); } EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed); From 80b3de7da7d2525c4a5f20fb7248366d42b4e4fb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 26 Feb 2025 11:41:15 +0000 Subject: [PATCH 049/108] io_uring/net: remove unnecessary REQ_F_NEED_CLEANUP REQ_F_NEED_CLEANUP in io_recvmsg_prep_setup() and in io_sendmsg_setup() are relics of the past and don't do anything useful, the flag should be and are set earlier on iovec and async_data allocation. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6aedc3141c1fc027128a4503656cfd686a6980ef.1740569495.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index bce75d64be922..c78edfd5085ee 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -388,14 +388,10 @@ static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg = req->async_data; - int ret; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); - ret = io_sendmsg_copy_hdr(req, kmsg); - if (!ret) - req->flags |= REQ_F_NEED_CLEANUP; - return ret; + return io_sendmsg_copy_hdr(req, kmsg); } #define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE) @@ -774,10 +770,7 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req) return 0; } - ret = io_recvmsg_copy_hdr(req, kmsg); - if (!ret) - req->flags |= REQ_F_NEED_CLEANUP; - return ret; + return io_recvmsg_copy_hdr(req, kmsg); } #define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \ From 0fc5a589aff7f9e613ff94fd3dd0a6686ffcb706 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 26 Feb 2025 11:41:16 +0000 Subject: [PATCH 050/108] io_uring/net: simplify compat selbuf iov parsing Use copy_from_user() instead of open coded access_ok() + get_user(), that's simpler and we don't care about compat that much. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e51f9c323a3cd4ad7c8da656559bdf6237f052fb.1740569495.git.asml.silence@gmail.com [axboe: fold in bogus < 0 check for tmp_iov.iov_len] Signed-off-by: Jens Axboe --- io_uring/net.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index c78edfd5085ee..dc15698647995 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -215,21 +215,17 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req, uiov = compat_ptr(msg->msg_iov); if (req->flags & REQ_F_BUFFER_SELECT) { - compat_ssize_t clen; - if (msg->msg_iovlen == 0) { sr->len = iov->iov_len = 0; iov->iov_base = NULL; } else if (msg->msg_iovlen > 1) { return -EINVAL; } else { - if (!access_ok(uiov, sizeof(*uiov))) - return -EFAULT; - if (__get_user(clen, &uiov->iov_len)) + struct compat_iovec tmp_iov; + + if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov))) return -EFAULT; - if (clen < 0) - return -EINVAL; - sr->len = clen; + sr->len = tmp_iov.iov_len; } return 0; From a223e96f7305fc482b4b41424e53d12ef693793d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 26 Feb 2025 11:41:17 +0000 Subject: [PATCH 051/108] io_uring/net: isolate msghdr copying code The user access section in io_msg_copy_hdr() is overextended by covering selected buffers. It's hard to work with and prone to errors. Limit the section to msghdr import only, selected buffers will do a separate copy_from_user() call, and then move it into its own function. This should be fine, selected buffer single shots are not important, for multishots the overhead should be non-existent, and it's not that expensive overall. 
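Background for why the section is hard to work with: everything between user_access_begin() and user_access_end() has to stay a narrow run of unsafe_*_user() accesses bailing out to a single fault label, with no calls that the objtool/SMAP checking cannot verify, so nesting the buffer-select special cases inside it forces the goto-heavy error handling seen in the old code. After the split, the hardened section only copies the fixed-layout msghdr fields, and the rare selected-buffer iovec peek becomes a plain copy_from_user(). A reduced sketch of the isolated helper's shape, matching the diff below:

    if (!user_access_begin(umsg, sizeof(*umsg)))
            return -EFAULT;
    unsafe_get_user(msg->msg_iov,    &umsg->msg_iov,    ua_end);
    unsafe_get_user(msg->msg_iovlen, &umsg->msg_iovlen, ua_end);
    /* ... remaining fixed-size fields ... */
    user_access_end();
    return 0;
ua_end:
    user_access_end();
    return -EFAULT;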
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d3eb1f81c8cfbea9f1aa57dab90c472d2aa6e371.1740569495.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index dc15698647995..1e176f48c268c 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -241,6 +241,24 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req, } #endif +static int io_copy_msghdr_from_user(struct user_msghdr *msg, + struct user_msghdr __user *umsg) +{ + if (!user_access_begin(umsg, sizeof(*umsg))) + return -EFAULT; + unsafe_get_user(msg->msg_name, &umsg->msg_name, ua_end); + unsafe_get_user(msg->msg_namelen, &umsg->msg_namelen, ua_end); + unsafe_get_user(msg->msg_iov, &umsg->msg_iov, ua_end); + unsafe_get_user(msg->msg_iovlen, &umsg->msg_iovlen, ua_end); + unsafe_get_user(msg->msg_control, &umsg->msg_control, ua_end); + unsafe_get_user(msg->msg_controllen, &umsg->msg_controllen, ua_end); + user_access_end(); + return 0; +ua_end: + user_access_end(); + return -EFAULT; +} + static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, struct user_msghdr *msg, int ddir) { @@ -257,16 +275,10 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, nr_segs = 1; } - if (!user_access_begin(umsg, sizeof(*umsg))) - return -EFAULT; + ret = io_copy_msghdr_from_user(msg, umsg); + if (unlikely(ret)) + return ret; - ret = -EFAULT; - unsafe_get_user(msg->msg_name, &umsg->msg_name, ua_end); - unsafe_get_user(msg->msg_namelen, &umsg->msg_namelen, ua_end); - unsafe_get_user(msg->msg_iov, &umsg->msg_iov, ua_end); - unsafe_get_user(msg->msg_iovlen, &umsg->msg_iovlen, ua_end); - unsafe_get_user(msg->msg_control, &umsg->msg_control, ua_end); - unsafe_get_user(msg->msg_controllen, &umsg->msg_controllen, ua_end); msg->msg_flags = 0; if (req->flags & REQ_F_BUFFER_SELECT) { @@ -274,24 +286,17 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, sr->len = iov->iov_len = 0; iov->iov_base = NULL; } else if (msg->msg_iovlen > 1) { - ret = -EINVAL; - goto ua_end; + return -EINVAL; } else { struct iovec __user *uiov = msg->msg_iov; - /* we only need the length for provided buffers */ - if (!access_ok(&uiov->iov_len, sizeof(uiov->iov_len))) - goto ua_end; - unsafe_get_user(iov->iov_len, &uiov->iov_len, ua_end); + if (copy_from_user(iov, uiov, sizeof(*iov))) + return -EFAULT; sr->len = iov->iov_len; } - ret = 0; -ua_end: - user_access_end(); - return ret; + return 0; } - user_access_end(); ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, nr_segs, &iov, &iomsg->msg.msg_iter, false); if (unlikely(ret < 0)) From 00a9143d9872d9913a14ef1116b140ff346acd3e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 26 Feb 2025 11:41:18 +0000 Subject: [PATCH 052/108] io_uring/net: verify msghdr before copying iovec Normally, net/ would verify msghdr before importing iovec, for example see copy_msghdr_from_user(), which further assumed by __copy_msghdr() validating msg->msg_iovlen. io_uring does it in reverse order, which is fine, but it'll be more convenient for flip it so that the iovec business is done at the end and eventually can be nicely pulled out of msghdr parsing section and thought as a sepaarate step. That also makes structure accesses more localised, which should be better for caches. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/cd35dc1b48d4e6e31f59ae7304c037fbe8a3fd3d.1740569495.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 43 ++++++++++++++++++------------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 1e176f48c268c..c362dd6fc8901 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -195,7 +195,8 @@ static inline void io_mshot_prep_retry(struct io_kiocb *req, #ifdef CONFIG_COMPAT static int io_compat_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, - struct compat_msghdr *msg, int ddir) + struct compat_msghdr *msg, int ddir, + struct sockaddr __user **save_addr) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct compat_iovec __user *uiov; @@ -213,6 +214,10 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req, if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg))) return -EFAULT; + ret = __get_compat_msghdr(&iomsg->msg, msg, save_addr); + if (ret) + return ret; + uiov = compat_ptr(msg->msg_iov); if (req->flags & REQ_F_BUFFER_SELECT) { if (msg->msg_iovlen == 0) { @@ -260,7 +265,8 @@ static int io_copy_msghdr_from_user(struct user_msghdr *msg, } static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, - struct user_msghdr *msg, int ddir) + struct user_msghdr *msg, int ddir, + struct sockaddr __user **save_addr) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct user_msghdr __user *umsg = sr->umsg; @@ -281,6 +287,10 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, msg->msg_flags = 0; + ret = __copy_msghdr(&iomsg->msg, msg, save_addr); + if (ret) + return ret; + if (req->flags & REQ_F_BUFFER_SELECT) { if (msg->msg_iovlen == 0) { sr->len = iov->iov_len = 0; @@ -320,22 +330,14 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req, if (io_is_compat(req->ctx)) { struct compat_msghdr cmsg; - ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE); - if (unlikely(ret)) - return ret; - - ret = __get_compat_msghdr(&iomsg->msg, &cmsg, NULL); + ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE, + NULL); sr->msg_control = iomsg->msg.msg_control_user; return ret; } #endif - ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE); - if (unlikely(ret)) - return ret; - - ret = __copy_msghdr(&iomsg->msg, &msg, NULL); - + ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE, NULL); /* save msg_control as sys_sendmsg() overwrites it */ sr->msg_control = iomsg->msg.msg_control_user; return ret; @@ -717,27 +719,18 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, if (io_is_compat(req->ctx)) { struct compat_msghdr cmsg; - ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST); + ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST, + &iomsg->uaddr); if (unlikely(ret)) return ret; - - ret = __get_compat_msghdr(&iomsg->msg, &cmsg, &iomsg->uaddr); - if (unlikely(ret)) - return ret; - return io_recvmsg_mshot_prep(req, iomsg, cmsg.msg_namelen, cmsg.msg_controllen); } #endif - ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST); + ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr); if (unlikely(ret)) return ret; - - ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr); - if (unlikely(ret)) - return ret; - return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen, msg.msg_controllen); } From 0c623f489987c76726b8ec0e089a89df53caea3b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 26 Feb 2025 11:41:19 +0000 Subject: [PATCH 
053/108] io_uring/net: derive iovec storage later Don't read free_iov until right before we need it to import the iovec. The only place that uses it before that is provided buffer selection, but it only serves as temporary storage and iovec content is not reused afterwards, so use a local variable for that. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8bfa7d74c33e37860a724f4e0e96660c25cd4c02.1740569495.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index c362dd6fc8901..36e218b62319c 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -203,14 +203,6 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req, struct iovec *iov; int ret, nr_segs; - if (iomsg->free_iov) { - nr_segs = iomsg->free_iov_nr; - iov = iomsg->free_iov; - } else { - iov = &iomsg->fast_iov; - nr_segs = 1; - } - if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg))) return -EFAULT; @@ -221,8 +213,7 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req, uiov = compat_ptr(msg->msg_iov); if (req->flags & REQ_F_BUFFER_SELECT) { if (msg->msg_iovlen == 0) { - sr->len = iov->iov_len = 0; - iov->iov_base = NULL; + sr->len = 0; } else if (msg->msg_iovlen > 1) { return -EINVAL; } else { @@ -236,6 +227,14 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req, return 0; } + if (iomsg->free_iov) { + nr_segs = iomsg->free_iov_nr; + iov = iomsg->free_iov; + } else { + iov = &iomsg->fast_iov; + nr_segs = 1; + } + ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg->msg_iovlen, nr_segs, &iov, &iomsg->msg.msg_iter, true); if (unlikely(ret < 0)) @@ -273,14 +272,6 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, struct iovec *iov; int ret, nr_segs; - if (iomsg->free_iov) { - nr_segs = iomsg->free_iov_nr; - iov = iomsg->free_iov; - } else { - iov = &iomsg->fast_iov; - nr_segs = 1; - } - ret = io_copy_msghdr_from_user(msg, umsg); if (unlikely(ret)) return ret; @@ -293,20 +284,28 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, if (req->flags & REQ_F_BUFFER_SELECT) { if (msg->msg_iovlen == 0) { - sr->len = iov->iov_len = 0; - iov->iov_base = NULL; + sr->len = 0; } else if (msg->msg_iovlen > 1) { return -EINVAL; } else { struct iovec __user *uiov = msg->msg_iov; + struct iovec tmp_iov; - if (copy_from_user(iov, uiov, sizeof(*iov))) + if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov))) return -EFAULT; - sr->len = iov->iov_len; + sr->len = tmp_iov.iov_len; } return 0; } + if (iomsg->free_iov) { + nr_segs = iomsg->free_iov_nr; + iov = iomsg->free_iov; + } else { + iov = &iomsg->fast_iov; + nr_segs = 1; + } + ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, nr_segs, &iov, &iomsg->msg.msg_iter, false); if (unlikely(ret < 0)) From 51e158d40589f3576c598e67a741d398d1ca2189 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 26 Feb 2025 11:41:20 +0000 Subject: [PATCH 054/108] io_uring/net: unify *mshot_prep calls with compat Instead of duplicating a io_recvmsg_mshot_prep() call in the compat path, let the common code handle it. For that, copy necessary compat fields into struct user_msghdr. Note, it zeroes user_msghdr to be on the safe side as compat is not that interesting and overhead shouldn't be high. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/94e62386dec570f83b4a4270a46ac60bc415fb71.1740569495.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 36e218b62319c..efa2d901762f5 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -714,20 +714,20 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, iomsg->msg.msg_name = &iomsg->addr; iomsg->msg.msg_iter.nr_segs = 0; -#ifdef CONFIG_COMPAT if (io_is_compat(req->ctx)) { +#ifdef CONFIG_COMPAT struct compat_msghdr cmsg; ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST, &iomsg->uaddr); - if (unlikely(ret)) - return ret; - return io_recvmsg_mshot_prep(req, iomsg, cmsg.msg_namelen, - cmsg.msg_controllen); - } + memset(&msg, 0, sizeof(msg)); + msg.msg_namelen = cmsg.msg_namelen; + msg.msg_controllen = cmsg.msg_controllen; #endif + } else { + ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr); + } - ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr); if (unlikely(ret)) return ret; return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen, From 5ee6e3ea31fcaa8e27d6e44c2739d98c229cabbc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 26 Feb 2025 11:41:21 +0000 Subject: [PATCH 055/108] io_uring/net: extract iovec import into a helper Deduplicate iovec imports between compat and !compat by introducing a helper function. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6a5f8c526f6732c4249a7fa0213b49e1a3ecccf0.1740569495.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 62 +++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 34 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index efa2d901762f5..4ef94bf88cb4d 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -192,6 +192,29 @@ static inline void io_mshot_prep_retry(struct io_kiocb *req, req->buf_index = sr->buf_group; } +static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg, + const struct iovec __user *uiov, unsigned uvec_seg, + int ddir) +{ + struct iovec *iov; + int ret, nr_segs; + + if (iomsg->free_iov) { + nr_segs = iomsg->free_iov_nr; + iov = iomsg->free_iov; + } else { + nr_segs = 1; + iov = &iomsg->fast_iov; + } + + ret = __import_iovec(ddir, uiov, uvec_seg, nr_segs, &iov, + &iomsg->msg.msg_iter, io_is_compat(req->ctx)); + if (unlikely(ret < 0)) + return ret; + io_net_vec_assign(req, iomsg, iov); + return 0; +} + #ifdef CONFIG_COMPAT static int io_compat_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, @@ -200,8 +223,7 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req, { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct compat_iovec __user *uiov; - struct iovec *iov; - int ret, nr_segs; + int ret; if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg))) return -EFAULT; @@ -227,21 +249,8 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req, return 0; } - if (iomsg->free_iov) { - nr_segs = iomsg->free_iov_nr; - iov = iomsg->free_iov; - } else { - iov = &iomsg->fast_iov; - nr_segs = 1; - } - - ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg->msg_iovlen, - nr_segs, &iov, &iomsg->msg.msg_iter, true); - if (unlikely(ret < 0)) - return ret; - - io_net_vec_assign(req, iomsg, iov); - return 0; + return io_net_import_vec(req, iomsg, (struct iovec __user *)uiov, + msg->msg_iovlen, ddir); } #endif @@ -269,8 +278,7 @@ static int 
io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct user_msghdr __user *umsg = sr->umsg; - struct iovec *iov; - int ret, nr_segs; + int ret; ret = io_copy_msghdr_from_user(msg, umsg); if (unlikely(ret)) @@ -298,21 +306,7 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, return 0; } - if (iomsg->free_iov) { - nr_segs = iomsg->free_iov_nr; - iov = iomsg->free_iov; - } else { - iov = &iomsg->fast_iov; - nr_segs = 1; - } - - ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, nr_segs, - &iov, &iomsg->msg.msg_iter, false); - if (unlikely(ret < 0)) - return ret; - - io_net_vec_assign(req, iomsg, iov); - return 0; + return io_net_import_vec(req, iomsg, msg->msg_iov, msg->msg_iovlen, ddir); } static int io_sendmsg_copy_hdr(struct io_kiocb *req, From 0fea2c4509a7aacb32445d12516abf01c012c806 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 26 Feb 2025 20:46:34 +0000 Subject: [PATCH 056/108] io_uring: rearrange opdef flags by use pattern Keep all flags that we use in the generic req init path close together. That saves a load for x86 because apparently some compilers prefer reading single bytes. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ef03b6ce4a0c2a5234cd4037fa07e9e4902dcc9e.1740602793.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/opdef.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/io_uring/opdef.h b/io_uring/opdef.h index 14456436ff74a..719a52104abed 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -7,6 +7,12 @@ struct io_issue_def { unsigned needs_file : 1; /* should block plug */ unsigned plug : 1; + /* supports ioprio */ + unsigned ioprio : 1; + /* supports iopoll */ + unsigned iopoll : 1; + /* op supports buffer selection */ + unsigned buffer_select : 1; /* hash wq insertion if file is a regular file */ unsigned hash_reg_file : 1; /* unbound wq insertion if file is a non-regular file */ @@ -15,14 +21,8 @@ struct io_issue_def { unsigned pollin : 1; unsigned pollout : 1; unsigned poll_exclusive : 1; - /* op supports buffer selection */ - unsigned buffer_select : 1; /* skip auditing */ unsigned audit_skip : 1; - /* supports ioprio */ - unsigned ioprio : 1; - /* supports iopoll */ - unsigned iopoll : 1; /* have to be put into the iopoll list */ unsigned iopoll_queue : 1; /* vectored opcode, set if 1) vectored, and 2) handler needs to know */ From 4afc332bc86c34b74f1211650f748feb6942a9cc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 27 Feb 2025 14:20:09 +0100 Subject: [PATCH 057/108] io_uring/net: fix build warning for !CONFIG_COMPAT A code rework resulted in an uninitialized return code when COMPAT mode is disabled: io_uring/net.c:722:6: error: variable 'ret' is used uninitialized whenever 'if' condition is true [-Werror,-Wsometimes-uninitialized] 722 | if (io_is_compat(req->ctx)) { | ^~~~~~~~~~~~~~~~~~~~~~ io_uring/net.c:736:15: note: uninitialized use occurs here 736 | if (unlikely(ret)) | ^~~ Since io_is_compat() turns into a compile-time 'false', the #ifdef here is completely unnecessary, and removing it avoids the warning. 
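To spell the fix out: io_is_compat() already reduces to a constant false without CONFIG_COMPAT, but the front-end -Wsometimes-uninitialized check evidently does not see through the inline helper, and the empty #ifdef'd-out branch body is what left ret unassigned. Once the #ifdef is gone, every branch assigns ret, and the optimizer discards the unreachable compat branch on !COMPAT builds anyway. A minimal illustration of the safe pattern (hypothetical helpers, not the kernel code):

    int ret;

    if (io_is_compat(ctx))
            ret = parse_compat_hdr(ctx);    /* hypothetical */
    else
            ret = parse_native_hdr(ctx);    /* hypothetical */
    if (unlikely(ret))
            return ret;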
Fixes: 51e158d40589 ("io_uring/net: unify *mshot_prep calls with compat") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20250227132018.1111094-1-arnd@kernel.org Signed-off-by: Jens Axboe --- io_uring/net.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 4ef94bf88cb4d..6d13d378358b7 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -215,7 +215,6 @@ static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg return 0; } -#ifdef CONFIG_COMPAT static int io_compat_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, struct compat_msghdr *msg, int ddir, @@ -252,7 +251,6 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req, return io_net_import_vec(req, iomsg, (struct iovec __user *)uiov, msg->msg_iovlen, ddir); } -#endif static int io_copy_msghdr_from_user(struct user_msghdr *msg, struct user_msghdr __user *umsg) @@ -319,7 +317,6 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req, iomsg->msg.msg_name = &iomsg->addr; iomsg->msg.msg_iter.nr_segs = 0; -#ifdef CONFIG_COMPAT if (io_is_compat(req->ctx)) { struct compat_msghdr cmsg; @@ -328,7 +325,6 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req, sr->msg_control = iomsg->msg.msg_control_user; return ret; } -#endif ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE, NULL); /* save msg_control as sys_sendmsg() overwrites it */ @@ -709,7 +705,6 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, iomsg->msg.msg_iter.nr_segs = 0; if (io_is_compat(req->ctx)) { -#ifdef CONFIG_COMPAT struct compat_msghdr cmsg; ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST, @@ -717,7 +712,6 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, memset(&msg, 0, sizeof(msg)); msg.msg_namelen = cmsg.msg_namelen; msg.msg_controllen = cmsg.msg_controllen; -#endif } else { ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr); } From 2a61e63891add7817e35a2323347ed8d354acf84 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 27 Feb 2025 14:39:11 -0800 Subject: [PATCH 058/108] io_uring/rw: move buffer_select outside generic prep Cleans up the generic rw prep to not require the do_import flag. Use a different prep function for callers that might need buffer select. 
Based-on-a-patch-by: Jens Axboe Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20250227223916.143006-2-kbusch@meta.com Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- io_uring/rw.c | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 788f06fbd7db1..b21b423b3cf8f 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -248,8 +248,8 @@ static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir, return ret; } -static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, - int ddir, bool do_import) +static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, + int ddir) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); unsigned ioprio; @@ -285,14 +285,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, rw->len = READ_ONCE(sqe->len); rw->flags = READ_ONCE(sqe->rw_flags); - if (do_import && !io_do_buffer_select(req)) { - struct io_async_rw *io = req->async_data; - - ret = io_import_rw_buffer(ddir, req, io, 0); - if (unlikely(ret)) - return ret; - } - attr_type_mask = READ_ONCE(sqe->attr_type_mask); if (attr_type_mask) { u64 attr_ptr; @@ -307,26 +299,45 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } +static int io_rw_do_import(struct io_kiocb *req, int ddir) +{ + if (io_do_buffer_select(req)) + return 0; + + return io_import_rw_buffer(ddir, req, req->async_data, 0); +} + +static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, + int ddir) +{ + int ret; + + ret = __io_prep_rw(req, sqe, ddir); + if (unlikely(ret)) + return ret; + + return io_rw_do_import(req, ddir); +} + int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - return io_prep_rw(req, sqe, ITER_DEST, true); + return io_prep_rw(req, sqe, ITER_DEST); } int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - return io_prep_rw(req, sqe, ITER_SOURCE, true); + return io_prep_rw(req, sqe, ITER_SOURCE); } static int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe, int ddir) { - const bool do_import = !(req->flags & REQ_F_BUFFER_SELECT); int ret; - ret = io_prep_rw(req, sqe, ddir, do_import); + ret = io_prep_rw(req, sqe, ddir); if (unlikely(ret)) return ret; - if (do_import) + if (!(req->flags & REQ_F_BUFFER_SELECT)) return 0; /* @@ -353,7 +364,7 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe struct io_async_rw *io; int ret; - ret = io_prep_rw(req, sqe, ddir, false); + ret = __io_prep_rw(req, sqe, ddir); if (unlikely(ret)) return ret; @@ -386,7 +397,7 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; - ret = io_prep_rw(req, sqe, ITER_DEST, false); + ret = __io_prep_rw(req, sqe, ITER_DEST); if (unlikely(ret)) return ret; From ff92d824d0b55e35ed2ee77021cbd2ed3e7ae7a2 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 27 Feb 2025 14:39:12 -0800 Subject: [PATCH 059/108] io_uring/rw: move fixed buffer import to issue path Registered buffers may depend on a linked command, which makes the prep path too early to import. Move to the issue path when the node is actually needed like all the other users of fixed buffers. 
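The motivating case, made concrete a few patches later with kernel-registered bvecs and ublk zero-copy, is a linked pair in which the first SQE is what makes the buffer-table slot valid (for example a driver uring_cmd that registers a kernel bvec) and the second SQE is a fixed-buffer read or write against that slot. Prep runs for every SQE at submission time, so importing at prep would look the slot up before it exists; importing at issue time runs only after the link ordering has been honoured. A hedged, userspace-flavoured sketch, where the registration command is a stand-in for whatever driver-specific uring_cmd performs it (hypothetical name):

    struct io_uring_sqe *sqe;

    sqe = io_uring_get_sqe(&ring);              /* 1: make buf_index 0 valid */
    prep_driver_register_buf_cmd(sqe, dev_fd, /*buf_index=*/0);  /* hypothetical */
    io_uring_sqe_set_flags(sqe, IOSQE_IO_LINK);

    sqe = io_uring_get_sqe(&ring);              /* 2: issued only after 1 completes */
    io_uring_prep_read_fixed(sqe, fd, NULL, len, 0, /*buf_index=*/0);

    io_uring_submit(&ring);

The bytes_done check in the new io_init_rw_fixed() also keeps a partial-transfer retry from re-importing a buffer the request already holds.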
Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20250227223916.143006-3-kbusch@meta.com Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- io_uring/opdef.c | 4 ++-- io_uring/rw.c | 39 ++++++++++++++++++++++++++++++--------- io_uring/rw.h | 2 ++ 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/io_uring/opdef.c b/io_uring/opdef.c index e8baef4e51465..306fd9c48b441 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -104,7 +104,7 @@ const struct io_issue_def io_issue_defs[] = { .iopoll_queue = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_read_fixed, - .issue = io_read, + .issue = io_read_fixed, }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, @@ -118,7 +118,7 @@ const struct io_issue_def io_issue_defs[] = { .iopoll_queue = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_write_fixed, - .issue = io_write, + .issue = io_write_fixed, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, diff --git a/io_uring/rw.c b/io_uring/rw.c index b21b423b3cf8f..7bc23802a388e 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -357,31 +357,30 @@ int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_prep_rwv(req, sqe, ITER_SOURCE); } -static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe, +static int io_init_rw_fixed(struct io_kiocb *req, unsigned int issue_flags, int ddir) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - struct io_async_rw *io; + struct io_async_rw *io = req->async_data; int ret; - ret = __io_prep_rw(req, sqe, ddir); - if (unlikely(ret)) - return ret; + if (io->bytes_done) + return 0; - io = req->async_data; - ret = io_import_reg_buf(req, &io->iter, rw->addr, rw->len, ddir, 0); + ret = io_import_reg_buf(req, &io->iter, rw->addr, rw->len, ddir, + issue_flags); iov_iter_save_state(&io->iter, &io->iter_state); return ret; } int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - return io_prep_rw_fixed(req, sqe, ITER_DEST); + return __io_prep_rw(req, sqe, ITER_DEST); } int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - return io_prep_rw_fixed(req, sqe, ITER_SOURCE); + return __io_prep_rw(req, sqe, ITER_SOURCE); } /* @@ -1147,6 +1146,28 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) } } +int io_read_fixed(struct io_kiocb *req, unsigned int issue_flags) +{ + int ret; + + ret = io_init_rw_fixed(req, issue_flags, ITER_DEST); + if (unlikely(ret)) + return ret; + + return io_read(req, issue_flags); +} + +int io_write_fixed(struct io_kiocb *req, unsigned int issue_flags) +{ + int ret; + + ret = io_init_rw_fixed(req, issue_flags, ITER_SOURCE); + if (unlikely(ret)) + return ret; + + return io_write(req, issue_flags); +} + void io_rw_fail(struct io_kiocb *req) { int res; diff --git a/io_uring/rw.h b/io_uring/rw.h index a45e0c71b59d6..bf121b81ebe84 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -38,6 +38,8 @@ int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read(struct io_kiocb *req, unsigned int issue_flags); int io_write(struct io_kiocb *req, unsigned int issue_flags); +int io_read_fixed(struct io_kiocb *req, unsigned int issue_flags); +int io_write_fixed(struct io_kiocb *req, unsigned int issue_flags); void io_readv_writev_cleanup(struct io_kiocb *req); void io_rw_fail(struct io_kiocb *req); void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw); From 99fde895ff56ac2241e7b7b4566731d72f2fdaa7 Mon Sep 17 
00:00:00 2001 From: Xinyu Zhang Date: Thu, 27 Feb 2025 14:39:13 -0800 Subject: [PATCH 060/108] nvme: map uring_cmd data even if address is 0 When using kernel registered bvec fixed buffers, the "address" is actually the offset into the bvec rather than userspace address. Therefore it can be 0. We can skip checking whether the address is NULL before mapping uring_cmd data. Bad userspace address will be handled properly later when the user buffer is imported. With this patch, we will be able to use the kernel registered bvec fixed buffers in io_uring NVMe passthru with ublk zero-copy support. Reviewed-by: Caleb Sander Mateos Reviewed-by: Jens Axboe Reviewed-by: Ming Lei Signed-off-by: Xinyu Zhang Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20250227223916.143006-4-kbusch@meta.com Signed-off-by: Jens Axboe --- drivers/nvme/host/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index e0876bc9aacde..fe9fb80c6a144 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -513,7 +513,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return PTR_ERR(req); req->timeout = d.timeout_ms ? msecs_to_jiffies(d.timeout_ms) : 0; - if (d.addr && d.data_len) { + if (d.data_len) { ret = nvme_map_user_request(req, d.addr, d.data_len, nvme_to_user_ptr(d.metadata), d.metadata_len, ioucmd, vec, issue_flags); From 27cb27b6d5ea401143ca3648983342bb820c4be9 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 27 Feb 2025 14:39:14 -0800 Subject: [PATCH 061/108] io_uring: add support for kernel registered bvecs Provide an interface for the kernel to leverage the existing pre-registered buffers that io_uring provides. User space can reference these later to achieve zero-copy IO. User space must register an empty fixed buffer table with io_uring in order for the kernel to make use of it. Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20250227223916.143006-5-kbusch@meta.com Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 7 ++ io_uring/io_uring.c | 3 + io_uring/rsrc.c | 123 +++++++++++++++++++++++++++++++++-- io_uring/rsrc.h | 9 +++ io_uring/rw.c | 3 + 5 files changed, 138 insertions(+), 7 deletions(-) diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 87150dc0a07cf..cf8d80d847344 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -4,6 +4,7 @@ #include #include +#include /* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */ #define IORING_URING_CMD_CANCELABLE (1U << 30) @@ -125,4 +126,10 @@ static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_ur return cmd_to_io_kiocb(cmd)->async_data; } +int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, + void (*release)(void *), unsigned int index, + unsigned int issue_flags); +void io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, + unsigned int issue_flags); + #endif /* _LINUX_IO_URING_CMD_H */ diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9f50f7e0b57e5..f2bdb1eab5770 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3940,6 +3940,9 @@ static int __init io_uring_init(void) io_uring_optable_init(); + /* imu->dir is u8 */ + BUILD_BUG_ON((IO_IMU_DEST | IO_IMU_SOURCE) > U8_MAX); + /* * Allow user copy in the per-command field, which starts after the * file in io_kiocb and until the opcode field. 
The openat2 handling diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 6cf7dba44d5b1..bf4e2c7897e5f 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -9,6 +9,7 @@ #include #include #include +#include #include @@ -101,17 +102,23 @@ static int io_buffer_validate(struct iovec *iov) return 0; } -static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node) +static void io_release_ubuf(void *priv) { - struct io_mapped_ubuf *imu = node->buf; + struct io_mapped_ubuf *imu = priv; unsigned int i; - if (!refcount_dec_and_test(&imu->refs)) - return; for (i = 0; i < imu->nr_bvecs; i++) unpin_user_page(imu->bvec[i].bv_page); +} + +static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) +{ + if (!refcount_dec_and_test(&imu->refs)) + return; + if (imu->acct_pages) io_unaccount_mem(ctx, imu->acct_pages); + imu->release(imu->priv); kvfree(imu); } @@ -451,7 +458,7 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) break; case IORING_RSRC_BUFFER: if (node->buf) - io_buffer_unmap(ctx, node); + io_buffer_unmap(ctx, node->buf); break; default: WARN_ON_ONCE(1); @@ -761,6 +768,10 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, imu->len = iov->iov_len; imu->nr_bvecs = nr_pages; imu->folio_shift = PAGE_SHIFT; + imu->release = io_release_ubuf; + imu->priv = imu; + imu->is_kbuf = false; + imu->dir = IO_IMU_DEST | IO_IMU_SOURCE; if (coalesced) imu->folio_shift = data.folio_shift; refcount_set(&imu->refs, 1); @@ -857,6 +868,95 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } +int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, + void (*release)(void *), unsigned int index, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; + struct io_rsrc_data *data = &ctx->buf_table; + struct req_iterator rq_iter; + struct io_mapped_ubuf *imu; + struct io_rsrc_node *node; + struct bio_vec bv, *bvec; + u16 nr_bvecs; + int ret = 0; + + io_ring_submit_lock(ctx, issue_flags); + if (index >= data->nr) { + ret = -EINVAL; + goto unlock; + } + index = array_index_nospec(index, data->nr); + + if (data->nodes[index]) { + ret = -EBUSY; + goto unlock; + } + + node = io_rsrc_node_alloc(IORING_RSRC_BUFFER); + if (!node) { + ret = -ENOMEM; + goto unlock; + } + + nr_bvecs = blk_rq_nr_phys_segments(rq); + imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL); + if (!imu) { + kfree(node); + ret = -ENOMEM; + goto unlock; + } + + imu->ubuf = 0; + imu->len = blk_rq_bytes(rq); + imu->acct_pages = 0; + imu->folio_shift = PAGE_SHIFT; + imu->nr_bvecs = nr_bvecs; + refcount_set(&imu->refs, 1); + imu->release = release; + imu->priv = rq; + imu->is_kbuf = true; + + if (op_is_write(req_op(rq))) + imu->dir = IO_IMU_SOURCE; + else + imu->dir = IO_IMU_DEST; + + bvec = imu->bvec; + rq_for_each_bvec(bv, rq, rq_iter) + *bvec++ = bv; + + node->buf = imu; + data->nodes[index] = node; +unlock: + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} +EXPORT_SYMBOL_GPL(io_buffer_register_bvec); + +void io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; + struct io_rsrc_data *data = &ctx->buf_table; + struct io_rsrc_node *node; + + io_ring_submit_lock(ctx, issue_flags); + if (index >= data->nr) + goto unlock; + index = array_index_nospec(index, data->nr); + + node = data->nodes[index]; + if (!node || !node->buf->is_kbuf) + goto unlock; + + 
io_put_rsrc_node(ctx, node); + data->nodes[index] = NULL; +unlock: + io_ring_submit_unlock(ctx, issue_flags); +} +EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec); + static int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, u64 buf_addr, size_t len) @@ -871,6 +971,8 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, /* not inside the mapped region */ if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) return -EFAULT; + if (!(imu->dir & (1 << ddir))) + return -EFAULT; /* * Might not be a start of buffer, set size appropriately @@ -883,8 +985,8 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, /* * Don't use iov_iter_advance() here, as it's really slow for * using the latter parts of a big fixed buffer - it iterates - * over each segment manually. We can cheat a bit here, because - * we know that: + * over each segment manually. We can cheat a bit here for user + * registered nodes, because we know that: * * 1) it's a BVEC iter, we set it up * 2) all bvecs are the same in size, except potentially the @@ -898,8 +1000,15 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, */ const struct bio_vec *bvec = imu->bvec; + /* + * Kernel buffer bvecs, on the other hand, don't necessarily + * have the size property of user registered ones, so we have + * to use the slow iter advance. + */ if (offset < bvec->bv_len) { iter->iov_offset = offset; + } else if (imu->is_kbuf) { + iov_iter_advance(iter, offset); } else { unsigned long seg_skip; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 0fc07897070c2..6b61f3f8cce77 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -20,6 +20,11 @@ struct io_rsrc_node { }; }; +enum { + IO_IMU_DEST = 1 << ITER_DEST, + IO_IMU_SOURCE = 1 << ITER_SOURCE, +}; + struct io_mapped_ubuf { u64 ubuf; unsigned int len; @@ -27,6 +32,10 @@ struct io_mapped_ubuf { unsigned int folio_shift; refcount_t refs; unsigned long acct_pages; + void (*release)(void *); + void *priv; + bool is_kbuf; + u8 dir; struct bio_vec bvec[] __counted_by(nr_bvecs); }; diff --git a/io_uring/rw.c b/io_uring/rw.c index 7bc23802a388e..5ee9f8949e8ba 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -629,6 +629,7 @@ static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) */ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) { + struct io_kiocb *req = cmd_to_io_kiocb(rw); struct kiocb *kiocb = &rw->kiocb; struct file *file = kiocb->ki_filp; ssize_t ret = 0; @@ -644,6 +645,8 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) if ((kiocb->ki_flags & IOCB_NOWAIT) && !(kiocb->ki_filp->f_flags & O_NONBLOCK)) return -EAGAIN; + if ((req->flags & REQ_F_BUF_NODE) && req->buf_node->buf->is_kbuf) + return -EFAULT; ppos = io_kiocb_ppos(kiocb); From 1f6540e2aabb7372e68223a3669019589c3e30ad Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 27 Feb 2025 14:39:15 -0800 Subject: [PATCH 062/108] ublk: zc register/unregister bvec Provide new operations for the user to request mapping an active request to an io uring instance's buf_table. The user has to provide the index it wants to install the buffer. A reference count is taken on the request to ensure it can't be completed while it is active in a ring's buf_table. 
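For the other half of the lifecycle, a rough user-side sketch (illustrative only; q_id, tag and index are placeholders and the command layout follows the uapi additions below): once the backing I/O completes, the server unregisters the buffer, which drops the reference taken at registration time and lets the ublk request complete, and then commits the result back to the driver.

#include <errno.h>
#include <string.h>
#include <liburing.h>
#include <linux/ublk_cmd.h>

/* Shared helper for the two uring_cmds below. */
static struct io_uring_sqe *ublk_io_cmd_sqe(struct io_uring *ring,
					    int ublk_ch_fd, __u32 cmd_op,
					    __u16 q_id, __u16 tag,
					    __u64 addr, __s32 result)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct ublksrv_io_cmd *cmd;

	if (!sqe)
		return NULL;
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = ublk_ch_fd;
	sqe->cmd_op = cmd_op;
	cmd = (struct ublksrv_io_cmd *)sqe->cmd;
	cmd->q_id = q_id;
	cmd->tag = tag;
	cmd->addr = addr;
	cmd->result = result;
	return sqe;
}

static int ublk_zc_finish_io(struct io_uring *ring, int ublk_ch_fd,
			     __u16 q_id, __u16 tag, unsigned buf_index,
			     int io_result)
{
	/* release the buf_table slot; the release callback drops the
	 * request reference taken by the register command */
	if (!ublk_io_cmd_sqe(ring, ublk_ch_fd, UBLK_U_IO_UNREGISTER_IO_BUF,
			     q_id, tag, buf_index, 0))
		return -ENOMEM;

	/* then report the result and fetch the next request */
	if (!ublk_io_cmd_sqe(ring, ublk_ch_fd, UBLK_U_IO_COMMIT_AND_FETCH_REQ,
			     q_id, tag, 0, io_result))
		return -ENOMEM;

	return io_uring_submit(ring);
}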
Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20250227223916.143006-6-kbusch@meta.com Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 59 ++++++++++++++++++++++++++++++----- include/uapi/linux/ublk_cmd.h | 4 +++ 2 files changed, 56 insertions(+), 7 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 529085181f355..b5cf92baaf0f0 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -51,6 +51,9 @@ /* private ioctl command mirror */ #define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC) +#define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF) +#define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF) + /* All UBLK_F_* have to be included into UBLK_F_ALL */ #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \ | UBLK_F_URING_CMD_COMP_IN_TASK \ @@ -196,12 +199,14 @@ struct ublk_params_header { static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq); +static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, + struct ublk_queue *ubq, int tag, size_t offset); static inline unsigned int ublk_req_build_flags(struct request *req); static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, int tag); static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub) { - return ub->dev_info.flags & UBLK_F_USER_COPY; + return ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY); } static inline bool ublk_dev_is_zoned(const struct ublk_device *ub) @@ -581,7 +586,7 @@ static void ublk_apply_params(struct ublk_device *ub) static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) { - return ubq->flags & UBLK_F_USER_COPY; + return ubq->flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY); } static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) @@ -1747,6 +1752,45 @@ static inline void ublk_prep_cancel(struct io_uring_cmd *cmd, io_uring_cmd_mark_cancelable(cmd, issue_flags); } +static void ublk_io_release(void *priv) +{ + struct request *rq = priv; + struct ublk_queue *ubq = rq->mq_hctx->driver_data; + + ublk_put_req_ref(ubq, rq); +} + +static int ublk_register_io_buf(struct io_uring_cmd *cmd, + struct ublk_queue *ubq, unsigned int tag, + const struct ublksrv_io_cmd *ub_cmd, + unsigned int issue_flags) +{ + struct ublk_device *ub = cmd->file->private_data; + int index = (int)ub_cmd->addr, ret; + struct request *req; + + req = __ublk_check_and_get_req(ub, ubq, tag, 0); + if (!req) + return -EINVAL; + + ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index, + issue_flags); + if (ret) { + ublk_put_req_ref(ubq, req); + return ret; + } + + return 0; +} + +static int ublk_unregister_io_buf(struct io_uring_cmd *cmd, + const struct ublksrv_io_cmd *ub_cmd, + unsigned int issue_flags) +{ + io_buffer_unregister_bvec(cmd, ub_cmd->addr, issue_flags); + return 0; +} + static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags, const struct ublksrv_io_cmd *ub_cmd) @@ -1798,6 +1842,10 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, ret = -EINVAL; switch (_IOC_NR(cmd_op)) { + case UBLK_IO_REGISTER_IO_BUF: + return ublk_register_io_buf(cmd, ubq, tag, ub_cmd, issue_flags); + case UBLK_IO_UNREGISTER_IO_BUF: + return ublk_unregister_io_buf(cmd, ub_cmd, issue_flags); case UBLK_IO_FETCH_REQ: /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */ if (ublk_queue_ready(ubq)) { @@ -2459,7 +2507,7 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) * buffer 
by pwrite() to ublk char device, which can't be * used for unprivileged device */ - if (info.flags & UBLK_F_USER_COPY) + if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)) return -EINVAL; } @@ -2527,9 +2575,6 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) goto out_free_dev_number; } - /* We are not ready to support zero copy */ - ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; - ub->dev_info.nr_hw_queues = min_t(unsigned int, ub->dev_info.nr_hw_queues, nr_cpu_ids); ublk_align_max_io_size(ub); @@ -2860,7 +2905,7 @@ static int ublk_ctrl_get_features(struct io_uring_cmd *cmd) { const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); void __user *argp = (void __user *)(unsigned long)header->addr; - u64 features = UBLK_F_ALL & ~UBLK_F_SUPPORT_ZERO_COPY; + u64 features = UBLK_F_ALL; if (header->len != UBLK_FEATURES_LEN || !header->addr) return -EINVAL; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index a8bc98bb69fce..74246c926b55f 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -94,6 +94,10 @@ _IOWR('u', UBLK_IO_COMMIT_AND_FETCH_REQ, struct ublksrv_io_cmd) #define UBLK_U_IO_NEED_GET_DATA \ _IOWR('u', UBLK_IO_NEED_GET_DATA, struct ublksrv_io_cmd) +#define UBLK_U_IO_REGISTER_IO_BUF \ + _IOWR('u', 0x23, struct ublksrv_io_cmd) +#define UBLK_U_IO_UNREGISTER_IO_BUF \ + _IOWR('u', 0x24, struct ublksrv_io_cmd) /* only ABORT means that no re-fetch */ #define UBLK_IO_RES_OK 0 From ed9f3112a8a8f6e6919d3b9da2651fa302df7be3 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 27 Feb 2025 14:39:16 -0800 Subject: [PATCH 063/108] io_uring: cache nodes and mapped buffers Frequent alloc/free cycles on these is pretty costly. Use an io cache to more efficiently reuse these buffers. 
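The shape of the change, reduced to plain user-space C (names and sizes are illustrative, not the kernel API): cache entries are sized for a fixed worst case, 32 bvec segments in the patch, so any imu that fits is recycled through a small free list, while oversized ones keep going through the regular allocator.

#include <stdlib.h>

#define CACHED_SEGS	32	/* mirrors IO_CACHED_BVECS_SEGS below */
#define CACHE_MAX	128	/* upper bound on idle cached entries */

struct obj_cache {
	void	*entries[CACHE_MAX];
	int	nr;
	size_t	elem_size;	/* fixed size, room for CACHED_SEGS segments */
};

static void *cache_alloc(struct obj_cache *c, size_t size)
{
	/* oversized allocations bypass the cache entirely */
	if (size > c->elem_size)
		return malloc(size);
	if (c->nr)
		return c->entries[--c->nr];
	return malloc(c->elem_size);
}

static void cache_free(struct obj_cache *c, void *p, size_t size)
{
	/* only fixed-size entries are recycled, and only up to a bound */
	if (size > c->elem_size || c->nr == CACHE_MAX) {
		free(p);
		return;
	}
	c->entries[c->nr++] = p;
}

The kernel side keys the same decision on nr_bvecs rather than a byte count, but the trade-off is identical: a bounded amount of idle memory in exchange for skipping kvmalloc/kvfree on the hot path.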
Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20250227223916.143006-7-kbusch@meta.com [axboe: fix imu leak] Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 + io_uring/filetable.c | 2 +- io_uring/io_uring.c | 2 + io_uring/rsrc.c | 71 +++++++++++++++++++++++++++------- io_uring/rsrc.h | 4 +- 5 files changed, 65 insertions(+), 16 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 123e693687305..432c98ff52ee9 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -292,6 +292,8 @@ struct io_ring_ctx { struct io_file_table file_table; struct io_rsrc_data buf_table; + struct io_alloc_cache node_cache; + struct io_alloc_cache imu_cache; struct io_submit_state submit_state; diff --git a/io_uring/filetable.c b/io_uring/filetable.c index dd8eeec97acf6..a21660e3145ab 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -68,7 +68,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, if (slot_index >= ctx->file_table.data.nr) return -EINVAL; - node = io_rsrc_node_alloc(IORING_RSRC_FILE); + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); if (!node) return -ENOMEM; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f2bdb1eab5770..ccc343f61a573 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -291,6 +291,7 @@ static void io_free_alloc_caches(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->uring_cache, kfree); io_alloc_cache_free(&ctx->msg_cache, kfree); io_futex_cache_free(ctx); + io_rsrc_cache_free(ctx); } static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) @@ -338,6 +339,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_kiocb), 0); ret |= io_futex_cache_init(ctx); + ret |= io_rsrc_cache_init(ctx); if (ret) goto free_ref; init_completion(&ctx->ref_comp); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index bf4e2c7897e5f..45bfb37bca1e6 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -33,6 +33,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, #define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_REG_BUFFERS (1U << 14) +#define IO_CACHED_BVECS_SEGS 32 + int __io_account_mem(struct user_struct *user, unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; @@ -111,6 +113,22 @@ static void io_release_ubuf(void *priv) unpin_user_page(imu->bvec[i].bv_page); } +static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx, + int nr_bvecs) +{ + if (nr_bvecs <= IO_CACHED_BVECS_SEGS) + return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL); + return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs), + GFP_KERNEL); +} + +static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) +{ + if (imu->nr_bvecs > IO_CACHED_BVECS_SEGS || + !io_alloc_cache_put(&ctx->imu_cache, imu)) + kvfree(imu); +} + static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) { if (!refcount_dec_and_test(&imu->refs)) @@ -119,22 +137,45 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) if (imu->acct_pages) io_unaccount_mem(ctx, imu->acct_pages); imu->release(imu->priv); - kvfree(imu); + io_free_imu(ctx, imu); } -struct io_rsrc_node *io_rsrc_node_alloc(int type) +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type) { struct io_rsrc_node *node; - node = 
kzalloc(sizeof(*node), GFP_KERNEL); + node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL); if (node) { node->type = type; node->refs = 1; + node->tag = 0; + node->file_ptr = 0; } return node; } -__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data) +bool io_rsrc_cache_init(struct io_ring_ctx *ctx) +{ + const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec, + IO_CACHED_BVECS_SEGS); + const int node_size = sizeof(struct io_rsrc_node); + bool ret; + + ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX, + node_size, 0); + ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX, + imu_cache_size, 0); + return ret; +} + +void io_rsrc_cache_free(struct io_ring_ctx *ctx) +{ + io_alloc_cache_free(&ctx->node_cache, kfree); + io_alloc_cache_free(&ctx->imu_cache, kfree); +} + +__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, + struct io_rsrc_data *data) { if (!data->nr) return; @@ -207,7 +248,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, err = -EBADF; break; } - node = io_rsrc_node_alloc(IORING_RSRC_FILE); + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); if (!node) { err = -ENOMEM; fput(file); @@ -465,7 +506,8 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) break; } - kfree(node); + if (!io_alloc_cache_put(&ctx->node_cache, node)) + kvfree(node); } int io_sqe_files_unregister(struct io_ring_ctx *ctx) @@ -527,7 +569,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, goto fail; } ret = -ENOMEM; - node = io_rsrc_node_alloc(IORING_RSRC_FILE); + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); if (!node) { fput(file); goto fail; @@ -732,7 +774,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, if (!iov->iov_base) return NULL; - node = io_rsrc_node_alloc(IORING_RSRC_BUFFER); + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); if (!node) return ERR_PTR(-ENOMEM); node->buf = NULL; @@ -752,10 +794,11 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, coalesced = io_coalesce_buffer(&pages, &nr_pages, &data); } - imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); + imu = io_alloc_imu(ctx, nr_pages); if (!imu) goto done; + imu->nr_bvecs = nr_pages; ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); if (ret) { unpin_user_pages(pages, nr_pages); @@ -766,7 +809,6 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, /* store original address for later verification */ imu->ubuf = (unsigned long) iov->iov_base; imu->len = iov->iov_len; - imu->nr_bvecs = nr_pages; imu->folio_shift = PAGE_SHIFT; imu->release = io_release_ubuf; imu->priv = imu; @@ -789,7 +831,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, } done: if (ret) { - kvfree(imu); + if (imu) + io_free_imu(ctx, imu); if (node) io_put_rsrc_node(ctx, node); node = ERR_PTR(ret); @@ -893,14 +936,14 @@ int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, goto unlock; } - node = io_rsrc_node_alloc(IORING_RSRC_BUFFER); + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); if (!node) { ret = -ENOMEM; goto unlock; } nr_bvecs = blk_rq_nr_phys_segments(rq); - imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL); + imu = io_alloc_imu(ctx, nr_bvecs); if (!imu) { kfree(node); ret = -ENOMEM; @@ -1137,7 +1180,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx if (!src_node) { dst_node = NULL; } else { - dst_node = 
io_rsrc_node_alloc(IORING_RSRC_BUFFER); + dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); if (!dst_node) { ret = -ENOMEM; goto out_free; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 6b61f3f8cce77..6fe7b9e615bf5 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -48,7 +48,9 @@ struct io_imu_folio_data { unsigned int nr_folios; }; -struct io_rsrc_node *io_rsrc_node_alloc(int type); +bool io_rsrc_cache_init(struct io_ring_ctx *ctx); +void io_rsrc_cache_free(struct io_ring_ctx *ctx); +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type); void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node); void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data); int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr); From 6aecda00b7d1e187c31e702d607d2b51bbcddbcc Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 1 Mar 2025 00:19:14 +0800 Subject: [PATCH 064/108] selftests: ublk: add kernel selftests for ublk Both ublk driver and userspace heavily depends on io_uring subsystem, and tools/testing/selftests/ should be the best place for holding this cross-subsystem tests. Add basic read/write IO test over this ublk null disk, and make sure ublk working. More tests will be added. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250228161919.2869102-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- MAINTAINERS | 1 + tools/testing/selftests/Makefile | 1 + tools/testing/selftests/ublk/.gitignore | 3 + tools/testing/selftests/ublk/Makefile | 12 + tools/testing/selftests/ublk/config | 1 + tools/testing/selftests/ublk/kublk.c | 1081 ++++++++++++++++++ tools/testing/selftests/ublk/kublk.h | 252 ++++ tools/testing/selftests/ublk/null.c | 38 + tools/testing/selftests/ublk/test_common.sh | 58 + tools/testing/selftests/ublk/test_null_01.sh | 19 + 10 files changed, 1466 insertions(+) create mode 100644 tools/testing/selftests/ublk/.gitignore create mode 100644 tools/testing/selftests/ublk/Makefile create mode 100644 tools/testing/selftests/ublk/config create mode 100644 tools/testing/selftests/ublk/kublk.c create mode 100644 tools/testing/selftests/ublk/kublk.h create mode 100644 tools/testing/selftests/ublk/null.c create mode 100755 tools/testing/selftests/ublk/test_common.sh create mode 100755 tools/testing/selftests/ublk/test_null_01.sh diff --git a/MAINTAINERS b/MAINTAINERS index efee40ea589f7..7dd2fbfc8604d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24237,6 +24237,7 @@ S: Maintained F: Documentation/block/ublk.rst F: drivers/block/ublk_drv.c F: include/uapi/linux/ublk_cmd.h +F: tools/testing/selftests/ublk/ UBSAN M: Kees Cook diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 8daac70c2f9d2..52ba0eb5eaa7c 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -113,6 +113,7 @@ endif TARGETS += tmpfs TARGETS += tpm2 TARGETS += tty +TARGETS += ublk TARGETS += uevent TARGETS += user_events TARGETS += vDSO diff --git a/tools/testing/selftests/ublk/.gitignore b/tools/testing/selftests/ublk/.gitignore new file mode 100644 index 0000000000000..8b2871ea77511 --- /dev/null +++ b/tools/testing/selftests/ublk/.gitignore @@ -0,0 +1,3 @@ +kublk +/tools +*-verify.state diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile new file mode 100644 index 0000000000000..b6ac306210914 --- /dev/null +++ b/tools/testing/selftests/ublk/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 + +CFLAGS += -O3 -Wl,-no-as-needed -Wall -I 
$(top_srcdir) +LDLIBS += -lpthread -lm -luring + +TEST_PROGS := test_null_01.sh + +TEST_GEN_PROGS_EXTENDED = kublk + +include ../lib.mk + +$(TEST_GEN_PROGS_EXTENDED): kublk.c null.c diff --git a/tools/testing/selftests/ublk/config b/tools/testing/selftests/ublk/config new file mode 100644 index 0000000000000..592b0ba4d6618 --- /dev/null +++ b/tools/testing/selftests/ublk/config @@ -0,0 +1 @@ +CONFIG_BLK_DEV_UBLK=m diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c new file mode 100644 index 0000000000000..b2dfd35bc1572 --- /dev/null +++ b/tools/testing/selftests/ublk/kublk.c @@ -0,0 +1,1081 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Description: uring_cmd based ublk + */ + +#include "kublk.h" + +unsigned int ublk_dbg_mask = UBLK_LOG; +static const struct ublk_tgt_ops *tgt_ops_list[] = { + &null_tgt_ops, +}; + +static const struct ublk_tgt_ops *ublk_find_tgt(const char *name) +{ + const struct ublk_tgt_ops *ops; + int i; + + if (name == NULL) + return NULL; + + for (i = 0; sizeof(tgt_ops_list) / sizeof(ops); i++) + if (strcmp(tgt_ops_list[i]->name, name) == 0) + return tgt_ops_list[i]; + return NULL; +} + +static inline int ublk_setup_ring(struct io_uring *r, int depth, + int cq_depth, unsigned flags) +{ + struct io_uring_params p; + + memset(&p, 0, sizeof(p)); + p.flags = flags | IORING_SETUP_CQSIZE; + p.cq_entries = cq_depth; + + return io_uring_queue_init_params(depth, r, &p); +} + +static void ublk_ctrl_init_cmd(struct ublk_dev *dev, + struct io_uring_sqe *sqe, + struct ublk_ctrl_cmd_data *data) +{ + struct ublksrv_ctrl_dev_info *info = &dev->dev_info; + struct ublksrv_ctrl_cmd *cmd = (struct ublksrv_ctrl_cmd *)ublk_get_sqe_cmd(sqe); + + sqe->fd = dev->ctrl_fd; + sqe->opcode = IORING_OP_URING_CMD; + sqe->ioprio = 0; + + if (data->flags & CTRL_CMD_HAS_BUF) { + cmd->addr = data->addr; + cmd->len = data->len; + } + + if (data->flags & CTRL_CMD_HAS_DATA) + cmd->data[0] = data->data[0]; + + cmd->dev_id = info->dev_id; + cmd->queue_id = -1; + + ublk_set_sqe_cmd_op(sqe, data->cmd_op); + + io_uring_sqe_set_data(sqe, cmd); +} + +static int __ublk_ctrl_cmd(struct ublk_dev *dev, + struct ublk_ctrl_cmd_data *data) +{ + struct io_uring_sqe *sqe; + struct io_uring_cqe *cqe; + int ret = -EINVAL; + + sqe = io_uring_get_sqe(&dev->ring); + if (!sqe) { + ublk_err("%s: can't get sqe ret %d\n", __func__, ret); + return ret; + } + + ublk_ctrl_init_cmd(dev, sqe, data); + + ret = io_uring_submit(&dev->ring); + if (ret < 0) { + ublk_err("uring submit ret %d\n", ret); + return ret; + } + + ret = io_uring_wait_cqe(&dev->ring, &cqe); + if (ret < 0) { + ublk_err("wait cqe: %s\n", strerror(-ret)); + return ret; + } + io_uring_cqe_seen(&dev->ring, cqe); + + return cqe->res; +} + +static int ublk_ctrl_stop_dev(struct ublk_dev *dev) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_CMD_STOP_DEV, + }; + + return __ublk_ctrl_cmd(dev, &data); +} + +static int ublk_ctrl_start_dev(struct ublk_dev *dev, + int daemon_pid) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_START_DEV, + .flags = CTRL_CMD_HAS_DATA, + }; + + dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid; + + return __ublk_ctrl_cmd(dev, &data); +} + +static int ublk_ctrl_add_dev(struct ublk_dev *dev) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_ADD_DEV, + .flags = CTRL_CMD_HAS_BUF, + .addr = (__u64) (uintptr_t) &dev->dev_info, + .len = sizeof(struct ublksrv_ctrl_dev_info), + }; + + return __ublk_ctrl_cmd(dev, &data); +} + +static int ublk_ctrl_del_dev(struct ublk_dev *dev) +{ + 
struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_DEL_DEV, + .flags = 0, + }; + + return __ublk_ctrl_cmd(dev, &data); +} + +static int ublk_ctrl_get_info(struct ublk_dev *dev) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_GET_DEV_INFO, + .flags = CTRL_CMD_HAS_BUF, + .addr = (__u64) (uintptr_t) &dev->dev_info, + .len = sizeof(struct ublksrv_ctrl_dev_info), + }; + + return __ublk_ctrl_cmd(dev, &data); +} + +static int ublk_ctrl_set_params(struct ublk_dev *dev, + struct ublk_params *params) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_SET_PARAMS, + .flags = CTRL_CMD_HAS_BUF, + .addr = (__u64) (uintptr_t) params, + .len = sizeof(*params), + }; + params->len = sizeof(*params); + return __ublk_ctrl_cmd(dev, &data); +} + +static int ublk_ctrl_get_params(struct ublk_dev *dev, + struct ublk_params *params) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_CMD_GET_PARAMS, + .flags = CTRL_CMD_HAS_BUF, + .addr = (__u64)params, + .len = sizeof(*params), + }; + + params->len = sizeof(*params); + + return __ublk_ctrl_cmd(dev, &data); +} + +static int ublk_ctrl_get_features(struct ublk_dev *dev, + __u64 *features) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_GET_FEATURES, + .flags = CTRL_CMD_HAS_BUF, + .addr = (__u64) (uintptr_t) features, + .len = sizeof(*features), + }; + + return __ublk_ctrl_cmd(dev, &data); +} + +static const char *ublk_dev_state_desc(struct ublk_dev *dev) +{ + switch (dev->dev_info.state) { + case UBLK_S_DEV_DEAD: + return "DEAD"; + case UBLK_S_DEV_LIVE: + return "LIVE"; + case UBLK_S_DEV_QUIESCED: + return "QUIESCED"; + default: + return "UNKNOWN"; + }; +} + +static void ublk_ctrl_dump(struct ublk_dev *dev) +{ + struct ublksrv_ctrl_dev_info *info = &dev->dev_info; + struct ublk_params p; + int ret; + + ret = ublk_ctrl_get_params(dev, &p); + if (ret < 0) { + ublk_err("failed to get params %m\n"); + return; + } + + ublk_log("dev id %d: nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n", + info->dev_id, info->nr_hw_queues, info->queue_depth, + 1 << p.basic.logical_bs_shift, p.basic.dev_sectors); + ublk_log("\tmax rq size %d daemon pid %d flags 0x%llx state %s\n", + info->max_io_buf_bytes, info->ublksrv_pid, info->flags, + ublk_dev_state_desc(dev)); + fflush(stdout); +} + +static void ublk_ctrl_deinit(struct ublk_dev *dev) +{ + close(dev->ctrl_fd); + free(dev); +} + +static struct ublk_dev *ublk_ctrl_init(void) +{ + struct ublk_dev *dev = (struct ublk_dev *)calloc(1, sizeof(*dev)); + struct ublksrv_ctrl_dev_info *info = &dev->dev_info; + int ret; + + dev->ctrl_fd = open(CTRL_DEV, O_RDWR); + if (dev->ctrl_fd < 0) { + free(dev); + return NULL; + } + + info->max_io_buf_bytes = UBLK_IO_MAX_BYTES; + + ret = ublk_setup_ring(&dev->ring, UBLK_CTRL_RING_DEPTH, + UBLK_CTRL_RING_DEPTH, IORING_SETUP_SQE128); + if (ret < 0) { + ublk_err("queue_init: %s\n", strerror(-ret)); + free(dev); + return NULL; + } + dev->nr_fds = 1; + + return dev; +} + +static int __ublk_queue_cmd_buf_sz(unsigned depth) +{ + int size = depth * sizeof(struct ublksrv_io_desc); + unsigned int page_sz = getpagesize(); + + return round_up(size, page_sz); +} + +static int ublk_queue_max_cmd_buf_sz(void) +{ + return __ublk_queue_cmd_buf_sz(UBLK_MAX_QUEUE_DEPTH); +} + +static int ublk_queue_cmd_buf_sz(struct ublk_queue *q) +{ + return __ublk_queue_cmd_buf_sz(q->q_depth); +} + +static void ublk_queue_deinit(struct ublk_queue *q) +{ + int i; + int nr_ios = q->q_depth; + + io_uring_unregister_ring_fd(&q->ring); + + if (q->ring.ring_fd > 0) { + 
io_uring_unregister_files(&q->ring); + close(q->ring.ring_fd); + q->ring.ring_fd = -1; + } + + if (q->io_cmd_buf) + munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q)); + + for (i = 0; i < nr_ios; i++) + free(q->ios[i].buf_addr); +} + +static int ublk_queue_init(struct ublk_queue *q) +{ + struct ublk_dev *dev = q->dev; + int depth = dev->dev_info.queue_depth; + int i, ret = -1; + int cmd_buf_size, io_buf_size; + unsigned long off; + int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth; + + q->tgt_ops = dev->tgt.ops; + q->state = 0; + q->q_depth = depth; + q->cmd_inflight = 0; + q->tid = gettid(); + + cmd_buf_size = ublk_queue_cmd_buf_sz(q); + off = UBLKSRV_CMD_BUF_OFFSET + q->q_id * ublk_queue_max_cmd_buf_sz(); + q->io_cmd_buf = (char *)mmap(0, cmd_buf_size, PROT_READ, + MAP_SHARED | MAP_POPULATE, dev->fds[0], off); + if (q->io_cmd_buf == MAP_FAILED) { + ublk_err("ublk dev %d queue %d map io_cmd_buf failed %m\n", + q->dev->dev_info.dev_id, q->q_id); + goto fail; + } + + io_buf_size = dev->dev_info.max_io_buf_bytes; + for (i = 0; i < q->q_depth; i++) { + q->ios[i].buf_addr = NULL; + q->ios[i].flags = UBLKSRV_NEED_FETCH_RQ | UBLKSRV_IO_FREE; + + if (q->state & UBLKSRV_NO_BUF) + continue; + + if (posix_memalign((void **)&q->ios[i].buf_addr, + getpagesize(), io_buf_size)) { + ublk_err("ublk dev %d queue %d io %d posix_memalign failed %m\n", + dev->dev_info.dev_id, q->q_id, i); + goto fail; + } + } + + ret = ublk_setup_ring(&q->ring, ring_depth, cq_depth, + IORING_SETUP_COOP_TASKRUN); + if (ret < 0) { + ublk_err("ublk dev %d queue %d setup io_uring failed %d\n", + q->dev->dev_info.dev_id, q->q_id, ret); + goto fail; + } + + io_uring_register_ring_fd(&q->ring); + + ret = io_uring_register_files(&q->ring, dev->fds, dev->nr_fds); + if (ret) { + ublk_err("ublk dev %d queue %d register files failed %d\n", + q->dev->dev_info.dev_id, q->q_id, ret); + goto fail; + } + + return 0; + fail: + ublk_queue_deinit(q); + ublk_err("ublk dev %d queue %d failed\n", + dev->dev_info.dev_id, q->q_id); + return -ENOMEM; +} + +static int ublk_dev_prep(struct ublk_dev *dev) +{ + int dev_id = dev->dev_info.dev_id; + char buf[64]; + int ret = 0; + + snprintf(buf, 64, "%s%d", UBLKC_DEV, dev_id); + dev->fds[0] = open(buf, O_RDWR); + if (dev->fds[0] < 0) { + ret = -EBADF; + ublk_err("can't open %s, ret %d\n", buf, dev->fds[0]); + goto fail; + } + + if (dev->tgt.ops->init_tgt) + ret = dev->tgt.ops->init_tgt(dev); + + return ret; +fail: + close(dev->fds[0]); + return ret; +} + +static void ublk_dev_unprep(struct ublk_dev *dev) +{ + if (dev->tgt.ops->deinit_tgt) + dev->tgt.ops->deinit_tgt(dev); + close(dev->fds[0]); +} + +int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag) +{ + struct ublksrv_io_cmd *cmd; + struct io_uring_sqe *sqe; + unsigned int cmd_op = 0; + __u64 user_data; + + /* only freed io can be issued */ + if (!(io->flags & UBLKSRV_IO_FREE)) + return 0; + + /* we issue because we need either fetching or committing */ + if (!(io->flags & + (UBLKSRV_NEED_FETCH_RQ | UBLKSRV_NEED_COMMIT_RQ_COMP))) + return 0; + + if (io->flags & UBLKSRV_NEED_COMMIT_RQ_COMP) + cmd_op = UBLK_U_IO_COMMIT_AND_FETCH_REQ; + else if (io->flags & UBLKSRV_NEED_FETCH_RQ) + cmd_op = UBLK_U_IO_FETCH_REQ; + + if (io_uring_sq_space_left(&q->ring) < 1) + io_uring_submit(&q->ring); + + sqe = ublk_queue_alloc_sqe(q); + if (!sqe) { + ublk_err("%s: run out of sqe %d, tag %d\n", + __func__, q->q_id, tag); + return -1; + } + + cmd = (struct ublksrv_io_cmd *)ublk_get_sqe_cmd(sqe); + + if (cmd_op == 
UBLK_U_IO_COMMIT_AND_FETCH_REQ) + cmd->result = io->result; + + /* These fields should be written once, never change */ + ublk_set_sqe_cmd_op(sqe, cmd_op); + sqe->fd = 0; /* dev->fds[0] */ + sqe->opcode = IORING_OP_URING_CMD; + sqe->flags = IOSQE_FIXED_FILE; + sqe->rw_flags = 0; + cmd->tag = tag; + cmd->q_id = q->q_id; + if (!(q->state & UBLKSRV_NO_BUF)) + cmd->addr = (__u64) (uintptr_t) io->buf_addr; + else + cmd->addr = 0; + + user_data = build_user_data(tag, _IOC_NR(cmd_op), 0, 0); + io_uring_sqe_set_data64(sqe, user_data); + + io->flags = 0; + + q->cmd_inflight += 1; + + ublk_dbg(UBLK_DBG_IO_CMD, "%s: (qid %d tag %u cmd_op %u) iof %x stopping %d\n", + __func__, q->q_id, tag, cmd_op, + io->flags, !!(q->state & UBLKSRV_QUEUE_STOPPING)); + return 1; +} + +static void ublk_submit_fetch_commands(struct ublk_queue *q) +{ + int i = 0; + + for (i = 0; i < q->q_depth; i++) + ublk_queue_io_cmd(q, &q->ios[i], i); +} + +static int ublk_queue_is_idle(struct ublk_queue *q) +{ + return !io_uring_sq_ready(&q->ring) && !q->io_inflight; +} + +static int ublk_queue_is_done(struct ublk_queue *q) +{ + return (q->state & UBLKSRV_QUEUE_STOPPING) && ublk_queue_is_idle(q); +} + +static inline void ublksrv_handle_tgt_cqe(struct ublk_queue *q, + struct io_uring_cqe *cqe) +{ + unsigned tag = user_data_to_tag(cqe->user_data); + + if (cqe->res < 0 && cqe->res != -EAGAIN) + ublk_err("%s: failed tgt io: res %d qid %u tag %u, cmd_op %u\n", + __func__, cqe->res, q->q_id, + user_data_to_tag(cqe->user_data), + user_data_to_op(cqe->user_data)); + + if (q->tgt_ops->tgt_io_done) + q->tgt_ops->tgt_io_done(q, tag, cqe); +} + +static void ublk_handle_cqe(struct io_uring *r, + struct io_uring_cqe *cqe, void *data) +{ + struct ublk_queue *q = container_of(r, struct ublk_queue, ring); + unsigned tag = user_data_to_tag(cqe->user_data); + unsigned cmd_op = user_data_to_op(cqe->user_data); + int fetch = (cqe->res != UBLK_IO_RES_ABORT) && + !(q->state & UBLKSRV_QUEUE_STOPPING); + struct ublk_io *io; + + if (cqe->res < 0 && cqe->res != -ENODEV) + ublk_err("%s: res %d userdata %llx queue state %x\n", __func__, + cqe->res, cqe->user_data, q->state); + + ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (qid %d tag %u cmd_op %u target %d) stopping %d\n", + __func__, cqe->res, q->q_id, tag, cmd_op, + is_target_io(cqe->user_data), + (q->state & UBLKSRV_QUEUE_STOPPING)); + + /* Don't retrieve io in case of target io */ + if (is_target_io(cqe->user_data)) { + ublksrv_handle_tgt_cqe(q, cqe); + return; + } + + io = &q->ios[tag]; + q->cmd_inflight--; + + if (!fetch) { + q->state |= UBLKSRV_QUEUE_STOPPING; + io->flags &= ~UBLKSRV_NEED_FETCH_RQ; + } + + if (cqe->res == UBLK_IO_RES_OK) { + assert(tag < q->q_depth); + if (q->tgt_ops->queue_io) + q->tgt_ops->queue_io(q, tag); + } else { + /* + * COMMIT_REQ will be completed immediately since no fetching + * piggyback is required. 
+ * + * Marking IO_FREE only, then this io won't be issued since + * we only issue io with (UBLKSRV_IO_FREE | UBLKSRV_NEED_*) + * + * */ + io->flags = UBLKSRV_IO_FREE; + } +} + +static int ublk_reap_events_uring(struct io_uring *r) +{ + struct io_uring_cqe *cqe; + unsigned head; + int count = 0; + + io_uring_for_each_cqe(r, head, cqe) { + ublk_handle_cqe(r, cqe, NULL); + count += 1; + } + io_uring_cq_advance(r, count); + + return count; +} + +static int ublk_process_io(struct ublk_queue *q) +{ + int ret, reapped; + + ublk_dbg(UBLK_DBG_QUEUE, "dev%d-q%d: to_submit %d inflight cmd %u stopping %d\n", + q->dev->dev_info.dev_id, + q->q_id, io_uring_sq_ready(&q->ring), + q->cmd_inflight, + (q->state & UBLKSRV_QUEUE_STOPPING)); + + if (ublk_queue_is_done(q)) + return -ENODEV; + + ret = io_uring_submit_and_wait(&q->ring, 1); + reapped = ublk_reap_events_uring(&q->ring); + + ublk_dbg(UBLK_DBG_QUEUE, "submit result %d, reapped %d stop %d idle %d\n", + ret, reapped, (q->state & UBLKSRV_QUEUE_STOPPING), + (q->state & UBLKSRV_QUEUE_IDLE)); + + return reapped; +} + +static void *ublk_io_handler_fn(void *data) +{ + struct ublk_queue *q = data; + int dev_id = q->dev->dev_info.dev_id; + int ret; + + ret = ublk_queue_init(q); + if (ret) { + ublk_err("ublk dev %d queue %d init queue failed\n", + dev_id, q->q_id); + return NULL; + } + ublk_dbg(UBLK_DBG_QUEUE, "tid %d: ublk dev %d queue %d started\n", + q->tid, dev_id, q->q_id); + + /* submit all io commands to ublk driver */ + ublk_submit_fetch_commands(q); + do { + if (ublk_process_io(q) < 0) + break; + } while (1); + + ublk_dbg(UBLK_DBG_QUEUE, "ublk dev %d queue %d exited\n", dev_id, q->q_id); + ublk_queue_deinit(q); + return NULL; +} + +static void ublk_set_parameters(struct ublk_dev *dev) +{ + int ret; + + ret = ublk_ctrl_set_params(dev, &dev->tgt.params); + if (ret) + ublk_err("dev %d set basic parameter failed %d\n", + dev->dev_info.dev_id, ret); +} + +static int ublk_send_dev_event(const struct dev_ctx *ctx, int dev_id) +{ + uint64_t id; + int evtfd = ctx->_evtfd; + + if (evtfd < 0) + return -EBADF; + + if (dev_id >= 0) + id = dev_id + 1; + else + id = ERROR_EVTFD_DEVID; + + if (write(evtfd, &id, sizeof(id)) != sizeof(id)) + return -EINVAL; + + return 0; +} + + +static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) +{ + int ret, i; + void *thread_ret; + const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info; + + ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__); + + ret = ublk_dev_prep(dev); + if (ret) + return ret; + + for (i = 0; i < dinfo->nr_hw_queues; i++) { + dev->q[i].dev = dev; + dev->q[i].q_id = i; + pthread_create(&dev->q[i].thread, NULL, + ublk_io_handler_fn, + &dev->q[i]); + } + + /* everything is fine now, start us */ + ublk_set_parameters(dev); + ret = ublk_ctrl_start_dev(dev, getpid()); + if (ret < 0) { + ublk_err("%s: ublk_ctrl_start_dev failed: %d\n", __func__, ret); + goto fail; + } + + ublk_ctrl_get_info(dev); + ublk_send_dev_event(ctx, dev->dev_info.dev_id); + + /* wait until we are terminated */ + for (i = 0; i < dinfo->nr_hw_queues; i++) + pthread_join(dev->q[i].thread, &thread_ret); + fail: + ublk_dev_unprep(dev); + ublk_dbg(UBLK_DBG_DEV, "%s exit\n", __func__); + + return ret; +} + +static int wait_ublk_dev(char *dev_name, int evt_mask, unsigned timeout) +{ +#define EV_SIZE (sizeof(struct inotify_event)) +#define EV_BUF_LEN (128 * (EV_SIZE + 16)) + struct pollfd pfd; + int fd, wd; + int ret = -EINVAL; + + fd = inotify_init(); + if (fd < 0) { + ublk_dbg(UBLK_DBG_DEV, "%s: inotify init failed\n", 
__func__); + return fd; + } + + wd = inotify_add_watch(fd, "/dev", evt_mask); + if (wd == -1) { + ublk_dbg(UBLK_DBG_DEV, "%s: add watch for /dev failed\n", __func__); + goto fail; + } + + pfd.fd = fd; + pfd.events = POLL_IN; + while (1) { + int i = 0; + char buffer[EV_BUF_LEN]; + ret = poll(&pfd, 1, 1000 * timeout); + + if (ret == -1) { + ublk_err("%s: poll inotify failed: %d\n", __func__, ret); + goto rm_watch; + } else if (ret == 0) { + ublk_err("%s: poll inotify timeout\n", __func__); + ret = -ETIMEDOUT; + goto rm_watch; + } + + ret = read(fd, buffer, EV_BUF_LEN); + if (ret < 0) { + ublk_err("%s: read inotify fd failed\n", __func__); + goto rm_watch; + } + + while (i < ret) { + struct inotify_event *event = (struct inotify_event *)&buffer[i]; + + ublk_dbg(UBLK_DBG_DEV, "%s: inotify event %x %s\n", + __func__, event->mask, event->name); + if (event->mask & evt_mask) { + if (!strcmp(event->name, dev_name)) { + ret = 0; + goto rm_watch; + } + } + i += EV_SIZE + event->len; + } + } +rm_watch: + inotify_rm_watch(fd, wd); +fail: + close(fd); + return ret; +} + +static int ublk_stop_io_daemon(const struct ublk_dev *dev) +{ + int daemon_pid = dev->dev_info.ublksrv_pid; + int dev_id = dev->dev_info.dev_id; + char ublkc[64]; + int ret = 0; + + /* daemon may be dead already */ + if (kill(daemon_pid, 0) < 0) + goto wait; + + /* + * Wait until ublk char device is closed, when our daemon is shutdown + */ + snprintf(ublkc, sizeof(ublkc), "%s%d", "ublkc", dev_id); + ret = wait_ublk_dev(ublkc, IN_CLOSE_WRITE, 10); + /* double check and inotify may not be 100% reliable */ + if (ret == -ETIMEDOUT) + /* the daemon doesn't exist now if kill(0) fails */ + ret = kill(daemon_pid, 0) < 0; +wait: + waitpid(daemon_pid, NULL, 0); + ublk_dbg(UBLK_DBG_DEV, "%s: pid %d dev_id %d ret %d\n", + __func__, daemon_pid, dev_id, ret); + + return ret; +} + +static int __cmd_dev_add(const struct dev_ctx *ctx) +{ + unsigned nr_queues = ctx->nr_hw_queues; + const char *tgt_type = ctx->tgt_type; + unsigned depth = ctx->queue_depth; + __u64 features; + const struct ublk_tgt_ops *ops; + struct ublksrv_ctrl_dev_info *info; + struct ublk_dev *dev; + int dev_id = ctx->dev_id; + int ret; + + ops = ublk_find_tgt(tgt_type); + if (!ops) { + ublk_err("%s: no such tgt type, type %s\n", + __func__, tgt_type); + return -ENODEV; + } + + if (nr_queues > UBLK_MAX_QUEUES || depth > UBLK_QUEUE_DEPTH) { + ublk_err("%s: invalid nr_queues or depth queues %u depth %u\n", + __func__, nr_queues, depth); + return -EINVAL; + } + + dev = ublk_ctrl_init(); + if (!dev) { + ublk_err("%s: can't alloc dev id %d, type %s\n", + __func__, dev_id, tgt_type); + return -ENOMEM; + } + + /* kernel doesn't support get_features */ + ret = ublk_ctrl_get_features(dev, &features); + if (ret < 0) + return -EINVAL; + + if (!(features & UBLK_F_CMD_IOCTL_ENCODE)) + return -ENOTSUP; + + info = &dev->dev_info; + info->dev_id = ctx->dev_id; + info->nr_hw_queues = nr_queues; + info->queue_depth = depth; + info->flags = ctx->flags; + dev->tgt.ops = ops; + dev->tgt.sq_depth = depth; + dev->tgt.cq_depth = depth; + + ret = ublk_ctrl_add_dev(dev); + if (ret < 0) { + ublk_err("%s: can't add dev id %d, type %s ret %d\n", + __func__, dev_id, tgt_type, ret); + goto fail; + } + + ret = ublk_start_daemon(ctx, dev); + ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\b", ret); + +fail: + if (ret < 0) + ublk_send_dev_event(ctx, -1); + ublk_ctrl_deinit(dev); + return ret; +} + +static int __cmd_dev_list(struct dev_ctx *ctx); + +static int cmd_dev_add(struct dev_ctx *ctx) +{ + int res; + + ctx->_evtfd = 
eventfd(0, 0); + if (ctx->_evtfd < 0) { + ublk_err("%s: failed to create eventfd %s\n", __func__, strerror(errno)); + exit(-1); + } + + setsid(); + res = fork(); + if (res == 0) { + __cmd_dev_add(ctx); + exit(EXIT_SUCCESS); + } else if (res > 0) { + uint64_t id; + + res = read(ctx->_evtfd, &id, sizeof(id)); + close(ctx->_evtfd); + if (res == sizeof(id) && id != ERROR_EVTFD_DEVID) { + ctx->dev_id = id - 1; + return __cmd_dev_list(ctx); + } + exit(EXIT_FAILURE); + } else { + return res; + } +} + +static int __cmd_dev_del(struct dev_ctx *ctx) +{ + int number = ctx->dev_id; + struct ublk_dev *dev; + int ret; + + dev = ublk_ctrl_init(); + dev->dev_info.dev_id = number; + + ret = ublk_ctrl_get_info(dev); + if (ret < 0) + goto fail; + + ret = ublk_ctrl_stop_dev(dev); + if (ret < 0) + ublk_err("%s: stop dev %d failed ret %d\n", __func__, number, ret); + + ret = ublk_stop_io_daemon(dev); + if (ret < 0) + ublk_err("%s: stop daemon id %d dev %d, ret %d\n", + __func__, dev->dev_info.ublksrv_pid, number, ret); + ublk_ctrl_del_dev(dev); +fail: + if (ret >= 0) + ret = ublk_ctrl_get_info(dev); + ublk_ctrl_deinit(dev); + + return (ret >= 0) ? 0 : ret; +} + +static int cmd_dev_del(struct dev_ctx *ctx) +{ + int i; + + if (ctx->dev_id >= 0 || !ctx->all) + return __cmd_dev_del(ctx); + + for (i = 0; i < 255; i++) { + ctx->dev_id = i; + __cmd_dev_del(ctx); + } + return 0; +} + +static int __cmd_dev_list(struct dev_ctx *ctx) +{ + struct ublk_dev *dev = ublk_ctrl_init(); + int ret; + + if (!dev) + return -ENODEV; + + dev->dev_info.dev_id = ctx->dev_id; + + ret = ublk_ctrl_get_info(dev); + if (ret < 0) { + if (ctx->logging) + ublk_err("%s: can't get dev info from %d: %d\n", + __func__, ctx->dev_id, ret); + } else { + ublk_ctrl_dump(dev); + } + + ublk_ctrl_deinit(dev); + + return ret; +} + +static int cmd_dev_list(struct dev_ctx *ctx) +{ + int i; + + if (ctx->dev_id >= 0 || !ctx->all) + return __cmd_dev_list(ctx); + + ctx->logging = false; + for (i = 0; i < 255; i++) { + ctx->dev_id = i; + __cmd_dev_list(ctx); + } + return 0; +} + +static int cmd_dev_get_features(void) +{ +#define const_ilog2(x) (63 - __builtin_clzll(x)) + static const char *feat_map[] = { + [const_ilog2(UBLK_F_SUPPORT_ZERO_COPY)] = "ZERO_COPY", + [const_ilog2(UBLK_F_URING_CMD_COMP_IN_TASK)] = "COMP_IN_TASK", + [const_ilog2(UBLK_F_NEED_GET_DATA)] = "GET_DATA", + [const_ilog2(UBLK_F_USER_RECOVERY)] = "USER_RECOVERY", + [const_ilog2(UBLK_F_USER_RECOVERY_REISSUE)] = "RECOVERY_REISSUE", + [const_ilog2(UBLK_F_UNPRIVILEGED_DEV)] = "UNPRIVILEGED_DEV", + [const_ilog2(UBLK_F_CMD_IOCTL_ENCODE)] = "CMD_IOCTL_ENCODE", + [const_ilog2(UBLK_F_USER_COPY)] = "USER_COPY", + [const_ilog2(UBLK_F_ZONED)] = "ZONED", + [const_ilog2(UBLK_F_USER_RECOVERY_FAIL_IO)] = "RECOVERY_FAIL_IO", + }; + struct ublk_dev *dev; + __u64 features = 0; + int ret; + + dev = ublk_ctrl_init(); + if (!dev) { + fprintf(stderr, "ublksrv_ctrl_init failed id\n"); + return -EOPNOTSUPP; + } + + ret = ublk_ctrl_get_features(dev, &features); + if (!ret) { + int i; + + printf("ublk_drv features: 0x%llx\n", features); + + for (i = 0; i < sizeof(features) * 8; i++) { + const char *feat; + + if (!((1ULL << i) & features)) + continue; + if (i < sizeof(feat_map) / sizeof(feat_map[0])) + feat = feat_map[i]; + else + feat = "unknown"; + printf("\t%-20s: 0x%llx\n", feat, 1ULL << i); + } + } + + return ret; +} + +static int cmd_dev_help(char *exe) +{ + printf("%s add -t [null] [-q nr_queues] [-d depth] [-n dev_id]\n", exe); + printf("\t default: nr_queues=2(max 4), depth=128(max 128), dev_id=-1(auto 
allocation)\n"); + printf("%s del [-n dev_id] -a \n", exe); + printf("\t -a delete all devices -n delete specified device\n"); + printf("%s list [-n dev_id] -a \n", exe); + printf("\t -a list all devices, -n list specified device, default -a \n"); + printf("%s features\n", exe); + return 0; +} + +int main(int argc, char *argv[]) +{ + static const struct option longopts[] = { + { "all", 0, NULL, 'a' }, + { "type", 1, NULL, 't' }, + { "number", 1, NULL, 'n' }, + { "queues", 1, NULL, 'q' }, + { "depth", 1, NULL, 'd' }, + { "debug_mask", 1, NULL, 0 }, + { "quiet", 0, NULL, 0 }, + { 0, 0, 0, 0 } + }; + int option_idx, opt; + const char *cmd = argv[1]; + struct dev_ctx ctx = { + .queue_depth = 128, + .nr_hw_queues = 2, + .dev_id = -1, + .tgt_type = "unknown", + }; + int ret = -EINVAL, i; + + if (argc == 1) + return ret; + + optind = 2; + while ((opt = getopt_long(argc, argv, "t:n:d:q:a", + longopts, &option_idx)) != -1) { + switch (opt) { + case 'a': + ctx.all = 1; + break; + case 'n': + ctx.dev_id = strtol(optarg, NULL, 10); + break; + case 't': + if (strlen(optarg) < sizeof(ctx.tgt_type)) + strcpy(ctx.tgt_type, optarg); + break; + case 'q': + ctx.nr_hw_queues = strtol(optarg, NULL, 10); + break; + case 'd': + ctx.queue_depth = strtol(optarg, NULL, 10); + break; + case 0: + if (!strcmp(longopts[option_idx].name, "debug_mask")) + ublk_dbg_mask = strtol(optarg, NULL, 16); + if (!strcmp(longopts[option_idx].name, "quiet")) + ublk_dbg_mask = 0; + break; + } + } + + i = optind; + while (i < argc && ctx.nr_files < MAX_BACK_FILES) { + ctx.files[ctx.nr_files++] = argv[i++]; + } + + if (!strcmp(cmd, "add")) + ret = cmd_dev_add(&ctx); + else if (!strcmp(cmd, "del")) + ret = cmd_dev_del(&ctx); + else if (!strcmp(cmd, "list")) { + ctx.all = 1; + ret = cmd_dev_list(&ctx); + } else if (!strcmp(cmd, "help")) + ret = cmd_dev_help(argv[0]); + else if (!strcmp(cmd, "features")) + ret = cmd_dev_get_features(); + else + cmd_dev_help(argv[0]); + + return ret; +} diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h new file mode 100644 index 0000000000000..cb2540caa3581 --- /dev/null +++ b/tools/testing/selftests/ublk/kublk.h @@ -0,0 +1,252 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef KUBLK_INTERNAL_H +#define KUBLK_INTERNAL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define __maybe_unused __attribute__((unused)) +#define MAX_BACK_FILES 4 +#ifndef min +#define min(a, b) ((a) < (b) ? 
(a) : (b)) +#endif + +/****************** part 1: libublk ********************/ + +#define CTRL_DEV "/dev/ublk-control" +#define UBLKC_DEV "/dev/ublkc" +#define UBLKB_DEV "/dev/ublkb" +#define UBLK_CTRL_RING_DEPTH 32 +#define ERROR_EVTFD_DEVID -2 + +/* queue idle timeout */ +#define UBLKSRV_IO_IDLE_SECS 20 + +#define UBLK_IO_MAX_BYTES 65536 +#define UBLK_MAX_QUEUES 4 +#define UBLK_QUEUE_DEPTH 128 + +#define UBLK_DBG_DEV (1U << 0) +#define UBLK_DBG_QUEUE (1U << 1) +#define UBLK_DBG_IO_CMD (1U << 2) +#define UBLK_DBG_IO (1U << 3) +#define UBLK_DBG_CTRL_CMD (1U << 4) +#define UBLK_LOG (1U << 5) + +struct ublk_dev; +struct ublk_queue; + +struct dev_ctx { + char tgt_type[16]; + unsigned long flags; + unsigned nr_hw_queues; + unsigned queue_depth; + int dev_id; + int nr_files; + char *files[MAX_BACK_FILES]; + unsigned int logging:1; + unsigned int all:1; + + int _evtfd; +}; + +struct ublk_ctrl_cmd_data { + __u32 cmd_op; +#define CTRL_CMD_HAS_DATA 1 +#define CTRL_CMD_HAS_BUF 2 + __u32 flags; + + __u64 data[2]; + __u64 addr; + __u32 len; +}; + +struct ublk_io { + char *buf_addr; + +#define UBLKSRV_NEED_FETCH_RQ (1UL << 0) +#define UBLKSRV_NEED_COMMIT_RQ_COMP (1UL << 1) +#define UBLKSRV_IO_FREE (1UL << 2) + unsigned short flags; + unsigned short refs; /* used by target code only */ + + int result; +}; + +struct ublk_tgt_ops { + const char *name; + int (*init_tgt)(struct ublk_dev *); + void (*deinit_tgt)(struct ublk_dev *); + + int (*queue_io)(struct ublk_queue *, int tag); + void (*tgt_io_done)(struct ublk_queue *, + int tag, const struct io_uring_cqe *); +}; + +struct ublk_tgt { + unsigned long dev_size; + unsigned int sq_depth; + unsigned int cq_depth; + const struct ublk_tgt_ops *ops; + struct ublk_params params; + char backing_file[1024 - 8 - sizeof(struct ublk_params)]; +}; + +struct ublk_queue { + int q_id; + int q_depth; + unsigned int cmd_inflight; + unsigned int io_inflight; + struct ublk_dev *dev; + const struct ublk_tgt_ops *tgt_ops; + char *io_cmd_buf; + struct io_uring ring; + struct ublk_io ios[UBLK_QUEUE_DEPTH]; +#define UBLKSRV_QUEUE_STOPPING (1U << 0) +#define UBLKSRV_QUEUE_IDLE (1U << 1) +#define UBLKSRV_NO_BUF (1U << 2) + unsigned state; + pid_t tid; + pthread_t thread; +}; + +struct ublk_dev { + struct ublk_tgt tgt; + struct ublksrv_ctrl_dev_info dev_info; + struct ublk_queue q[UBLK_MAX_QUEUES]; + + int fds[2]; /* fds[0] points to /dev/ublkcN */ + int nr_fds; + int ctrl_fd; + struct io_uring ring; +}; + +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER) +#endif + +#ifndef container_of +#define container_of(ptr, type, member) ({ \ + unsigned long __mptr = (unsigned long)(ptr); \ + ((type *)(__mptr - offsetof(type, member))); }) +#endif + +#define round_up(val, rnd) \ + (((val) + ((rnd) - 1)) & ~((rnd) - 1)) + + +extern unsigned int ublk_dbg_mask; +extern int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag); + +static inline int is_target_io(__u64 user_data) +{ + return (user_data & (1ULL << 63)) != 0; +} + +static inline __u64 build_user_data(unsigned tag, unsigned op, + unsigned tgt_data, unsigned is_target_io) +{ + assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16)); + + return tag | (op << 16) | (tgt_data << 24) | (__u64)is_target_io << 63; +} + +static inline unsigned int user_data_to_tag(__u64 user_data) +{ + return user_data & 0xffff; +} + +static inline unsigned int user_data_to_op(__u64 user_data) +{ + return (user_data >> 16) & 0xff; +} + +static inline void ublk_err(const char *fmt, ...) 
+{ + va_list ap; + + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); +} + +static inline void ublk_log(const char *fmt, ...) +{ + if (ublk_dbg_mask & UBLK_LOG) { + va_list ap; + + va_start(ap, fmt); + vfprintf(stdout, fmt, ap); + } +} + +static inline void ublk_dbg(int level, const char *fmt, ...) +{ + if (level & ublk_dbg_mask) { + va_list ap; + + va_start(ap, fmt); + vfprintf(stdout, fmt, ap); + } +} + +static inline struct io_uring_sqe *ublk_queue_alloc_sqe(struct ublk_queue *q) +{ + unsigned left = io_uring_sq_space_left(&q->ring); + + if (left < 1) + io_uring_submit(&q->ring); + return io_uring_get_sqe(&q->ring); +} + +static inline void *ublk_get_sqe_cmd(const struct io_uring_sqe *sqe) +{ + return (void *)&sqe->cmd; +} + +static inline void ublk_mark_io_done(struct ublk_io *io, int res) +{ + io->flags |= (UBLKSRV_NEED_COMMIT_RQ_COMP | UBLKSRV_IO_FREE); + io->result = res; +} + +static inline const struct ublksrv_io_desc *ublk_get_iod(const struct ublk_queue *q, int tag) +{ + return (struct ublksrv_io_desc *)&(q->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]); +} + +static inline void ublk_set_sqe_cmd_op(struct io_uring_sqe *sqe, __u32 cmd_op) +{ + __u32 *addr = (__u32 *)&sqe->off; + + addr[0] = cmd_op; + addr[1] = 0; +} + +static inline int ublk_complete_io(struct ublk_queue *q, unsigned tag, int res) +{ + struct ublk_io *io = &q->ios[tag]; + + ublk_mark_io_done(io, res); + + return ublk_queue_io_cmd(q, io, tag); +} + +extern const struct ublk_tgt_ops null_tgt_ops; + +#endif diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c new file mode 100644 index 0000000000000..b6ef16a8f5145 --- /dev/null +++ b/tools/testing/selftests/ublk/null.c @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include "kublk.h" + +static int ublk_null_tgt_init(struct ublk_dev *dev) +{ + const struct ublksrv_ctrl_dev_info *info = &dev->dev_info; + unsigned long dev_size = 250UL << 30; + + dev->tgt.dev_size = dev_size; + dev->tgt.params = (struct ublk_params) { + .types = UBLK_PARAM_TYPE_BASIC, + .basic = { + .logical_bs_shift = 9, + .physical_bs_shift = 12, + .io_opt_shift = 12, + .io_min_shift = 9, + .max_sectors = info->max_io_buf_bytes >> 9, + .dev_sectors = dev_size >> 9, + }, + }; + + return 0; +} + +static int ublk_null_queue_io(struct ublk_queue *q, int tag) +{ + const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); + + ublk_complete_io(q, tag, iod->nr_sectors << 9); + return 0; +} + +const struct ublk_tgt_ops null_tgt_ops = { + .name = "null", + .init_tgt = ublk_null_tgt_init, + .queue_io = ublk_null_queue_io, +}; diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh new file mode 100755 index 0000000000000..ffcdfdc2a17f3 --- /dev/null +++ b/tools/testing/selftests/ublk/test_common.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +_check_root() { + local ksft_skip=4 + + if [ $UID != 0 ]; then + echo please run this as root >&2 + exit $ksft_skip + fi +} + +_remove_ublk_devices() { + ${UBLK_PROG} del -a +} + +_get_ublk_dev_state() { + ${UBLK_PROG} list -n "$1" | grep "state" | awk '{print $11}' +} + +_get_ublk_daemon_pid() { + ${UBLK_PROG} list -n "$1" | grep "pid" | awk '{print $7}' +} + +_prep_test() { + _check_root + local type=$1 + shift 1 + echo "ublk $type: $@" +} + +_show_result() +{ + if [ $2 -ne 0 ]; then + echo "$1 : [FAIL]" + else + echo "$1 : [PASS]" + fi +} + +_cleanup_test() { + ${UBLK_PROG} del -n $1 +} + +_add_ublk_dev() { + local kublk_temp=`mktemp 
/tmp/kublk-XXXXXX` + ${UBLK_PROG} add $@ > ${kublk_temp} 2>&1 + if [ $? -ne 0 ]; then + echo "fail to add ublk dev $@" + exit -1 + fi + local dev_id=`grep "dev id" ${kublk_temp} | awk -F '[ :]' '{print $3}'` + udevadm settle + rm -f ${kublk_temp} + echo ${dev_id} +} + +export UBLK_PROG=$(pwd)/kublk diff --git a/tools/testing/selftests/ublk/test_null_01.sh b/tools/testing/selftests/ublk/test_null_01.sh new file mode 100755 index 0000000000000..04fc3ac7c716f --- /dev/null +++ b/tools/testing/selftests/ublk/test_null_01.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. test_common.sh + +TID="null_01" +ERR_CODE=0 + +_prep_test "null" "basic IO test" + +dev_id=`_add_ublk_dev -t null` + +# run fio over the two disks +fio --name=job1 --filename=/dev/ublkb${dev_id} --ioengine=libaio --rw=readwrite --iodepth=32 --size=256M > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test ${dev_id} "null" + +_show_result $TID $ERR_CODE From 5d95bfb5357111028b7a37464d1a18702722efe9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 1 Mar 2025 00:19:15 +0800 Subject: [PATCH 065/108] selftests: ublk: add file backed ublk Add file backed ublk target code, meantime add one fio test for covering write verify, another test for mkfs/mount/umount. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250228161919.2869102-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 4 +- tools/testing/selftests/ublk/file_backed.c | 158 +++++++++++++++++++ tools/testing/selftests/ublk/kublk.c | 12 +- tools/testing/selftests/ublk/kublk.h | 8 +- tools/testing/selftests/ublk/test_common.sh | 47 ++++++ tools/testing/selftests/ublk/test_loop_01.sh | 31 ++++ tools/testing/selftests/ublk/test_loop_02.sh | 22 +++ 7 files changed, 277 insertions(+), 5 deletions(-) create mode 100644 tools/testing/selftests/ublk/file_backed.c create mode 100755 tools/testing/selftests/ublk/test_loop_01.sh create mode 100755 tools/testing/selftests/ublk/test_loop_02.sh diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index b6ac306210914..7815aaaefc250 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -4,9 +4,11 @@ CFLAGS += -O3 -Wl,-no-as-needed -Wall -I $(top_srcdir) LDLIBS += -lpthread -lm -luring TEST_PROGS := test_null_01.sh +TEST_PROGS += test_loop_01.sh +TEST_PROGS += test_loop_02.sh TEST_GEN_PROGS_EXTENDED = kublk include ../lib.mk -$(TEST_GEN_PROGS_EXTENDED): kublk.c null.c +$(TEST_GEN_PROGS_EXTENDED): kublk.c null.c file_backed.c diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c new file mode 100644 index 0000000000000..64e0a37746eb0 --- /dev/null +++ b/tools/testing/selftests/ublk/file_backed.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "kublk.h" + +static void backing_file_tgt_deinit(struct ublk_dev *dev) +{ + int i; + + for (i = 1; i < dev->nr_fds; i++) { + fsync(dev->fds[i]); + close(dev->fds[i]); + } +} + +static int backing_file_tgt_init(struct ublk_dev *dev) +{ + int fd, i; + + assert(dev->nr_fds == 1); + + for (i = 0; i < dev->tgt.nr_backing_files; i++) { + char *file = dev->tgt.backing_file[i]; + unsigned long bytes; + struct stat st; + + ublk_dbg(UBLK_DBG_DEV, "%s: file %d: %s\n", __func__, i, file); + + fd = open(file, O_RDWR | O_DIRECT); + if (fd < 0) { + ublk_err("%s: backing file %s can't be opened: %s\n", + __func__, file, strerror(errno)); + return -EBADF; + } + + if (fstat(fd, &st) < 0) { + close(fd); + return 
-EBADF; + } + + if (S_ISREG(st.st_mode)) + bytes = st.st_size; + else if (S_ISBLK(st.st_mode)) { + if (ioctl(fd, BLKGETSIZE64, &bytes) != 0) + return -1; + } else { + return -EINVAL; + } + + dev->tgt.backing_file_size[i] = bytes; + dev->fds[dev->nr_fds] = fd; + dev->nr_fds += 1; + } + + return 0; +} + +static int loop_queue_tgt_io(struct ublk_queue *q, int tag) +{ + const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); + struct io_uring_sqe *sqe = ublk_queue_alloc_sqe(q); + unsigned ublk_op = ublksrv_get_op(iod); + + if (!sqe) + return -ENOMEM; + + switch (ublk_op) { + case UBLK_IO_OP_FLUSH: + io_uring_prep_sync_file_range(sqe, 1 /*fds[1]*/, + iod->nr_sectors << 9, + iod->start_sector << 9, + IORING_FSYNC_DATASYNC); + io_uring_sqe_set_flags(sqe, IOSQE_FIXED_FILE); + break; + case UBLK_IO_OP_WRITE_ZEROES: + case UBLK_IO_OP_DISCARD: + return -ENOTSUP; + case UBLK_IO_OP_READ: + io_uring_prep_read(sqe, 1 /*fds[1]*/, + (void *)iod->addr, + iod->nr_sectors << 9, + iod->start_sector << 9); + io_uring_sqe_set_flags(sqe, IOSQE_FIXED_FILE); + break; + case UBLK_IO_OP_WRITE: + io_uring_prep_write(sqe, 1 /*fds[1]*/, + (void *)iod->addr, + iod->nr_sectors << 9, + iod->start_sector << 9); + io_uring_sqe_set_flags(sqe, IOSQE_FIXED_FILE); + break; + default: + return -EINVAL; + } + + q->io_inflight++; + /* bit63 marks us as tgt io */ + sqe->user_data = build_user_data(tag, ublk_op, 0, 1); + + ublk_dbg(UBLK_DBG_IO, "%s: tag %d ublk io %x %llx %u\n", __func__, tag, + iod->op_flags, iod->start_sector, iod->nr_sectors << 9); + return 1; +} + +static int ublk_loop_queue_io(struct ublk_queue *q, int tag) +{ + int queued = loop_queue_tgt_io(q, tag); + + if (queued < 0) + ublk_complete_io(q, tag, queued); + + return 0; +} + +static void ublk_loop_io_done(struct ublk_queue *q, int tag, + const struct io_uring_cqe *cqe) +{ + int cqe_tag = user_data_to_tag(cqe->user_data); + + assert(tag == cqe_tag); + ublk_complete_io(q, tag, cqe->res); + q->io_inflight--; +} + +static int ublk_loop_tgt_init(struct ublk_dev *dev) +{ + unsigned long long bytes; + int ret; + struct ublk_params p = { + .types = UBLK_PARAM_TYPE_BASIC, + .basic = { + .logical_bs_shift = 9, + .physical_bs_shift = 12, + .io_opt_shift = 12, + .io_min_shift = 9, + .max_sectors = dev->dev_info.max_io_buf_bytes >> 9, + }, + }; + + assert(dev->tgt.nr_backing_files == 1); + ret = backing_file_tgt_init(dev); + if (ret) + return ret; + + bytes = dev->tgt.backing_file_size[0]; + dev->tgt.dev_size = bytes; + p.basic.dev_sectors = bytes >> 9; + dev->tgt.params = p; + + return 0; +} + +const struct ublk_tgt_ops loop_tgt_ops = { + .name = "loop", + .init_tgt = ublk_loop_tgt_init, + .deinit_tgt = backing_file_tgt_deinit, + .queue_io = ublk_loop_queue_io, + .tgt_io_done = ublk_loop_io_done, +}; diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index b2dfd35bc1572..f7b9ede17bb3c 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -8,6 +8,7 @@ unsigned int ublk_dbg_mask = UBLK_LOG; static const struct ublk_tgt_ops *tgt_ops_list[] = { &null_tgt_ops, + &loop_tgt_ops, }; static const struct ublk_tgt_ops *ublk_find_tgt(const char *name) @@ -774,7 +775,7 @@ static int __cmd_dev_add(const struct dev_ctx *ctx) struct ublksrv_ctrl_dev_info *info; struct ublk_dev *dev; int dev_id = ctx->dev_id; - int ret; + int ret, i; ops = ublk_find_tgt(tgt_type); if (!ops) { @@ -813,6 +814,13 @@ static int __cmd_dev_add(const struct dev_ctx *ctx) dev->tgt.sq_depth = depth; dev->tgt.cq_depth = depth; + for (i 
= 0; i < MAX_BACK_FILES; i++) { + if (ctx->files[i]) { + strcpy(dev->tgt.backing_file[i], ctx->files[i]); + dev->tgt.nr_backing_files++; + } + } + ret = ublk_ctrl_add_dev(dev); if (ret < 0) { ublk_err("%s: can't add dev id %d, type %s ret %d\n", @@ -994,7 +1002,7 @@ static int cmd_dev_get_features(void) static int cmd_dev_help(char *exe) { - printf("%s add -t [null] [-q nr_queues] [-d depth] [-n dev_id]\n", exe); + printf("%s add -t [null|loop] [-q nr_queues] [-d depth] [-n dev_id] [backfile1] [backfile2] ...\n", exe); printf("\t default: nr_queues=2(max 4), depth=128(max 128), dev_id=-1(auto allocation)\n"); printf("%s del [-n dev_id] -a \n", exe); printf("\t -a delete all devices -n delete specified device\n"); diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index cb2540caa3581..d71c6c3767a9f 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -105,7 +105,10 @@ struct ublk_tgt { unsigned int cq_depth; const struct ublk_tgt_ops *ops; struct ublk_params params; - char backing_file[1024 - 8 - sizeof(struct ublk_params)]; + + int nr_backing_files; + unsigned long backing_file_size[MAX_BACK_FILES]; + char backing_file[MAX_BACK_FILES][PATH_MAX]; }; struct ublk_queue { @@ -131,7 +134,7 @@ struct ublk_dev { struct ublksrv_ctrl_dev_info dev_info; struct ublk_queue q[UBLK_MAX_QUEUES]; - int fds[2]; /* fds[0] points to /dev/ublkcN */ + int fds[MAX_BACK_FILES + 1]; /* fds[0] points to /dev/ublkcN */ int nr_fds; int ctrl_fd; struct io_uring ring; @@ -248,5 +251,6 @@ static inline int ublk_complete_io(struct ublk_queue *q, unsigned tag, int res) } extern const struct ublk_tgt_ops null_tgt_ops; +extern const struct ublk_tgt_ops loop_tgt_ops; #endif diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index ffcdfdc2a17f3..ad40c1511f414 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -1,6 +1,53 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +_create_backfile() { + local my_size=$1 + local my_file=`mktemp ublk_bpf_${my_size}_XXXXX` + + truncate -s ${my_size} ${my_file} + echo $my_file +} + +_remove_backfile() { + local file=$1 + + [ -f "$file" ] && rm -f $file +} + +_create_tmp_dir() { + local my_file=`mktemp -d ublk_bpf_dir_XXXXX` + + echo $my_file +} + +_remove_tmp_dir() { + local dir=$1 + + [ -d "$dir" ] && rmdir $dir +} + +_mkfs_mount_test() +{ + local dev=$1 + local err_code=0 + local mnt_dir=`_create_tmp_dir` + + mkfs.ext4 -F $dev > /dev/null 2>&1 + err_code=$? + if [ $err_code -ne 0 ]; then + return $err_code + fi + + mount -t ext4 $dev $mnt_dir > /dev/null 2>&1 + umount $dev + err_code=$? + _remove_tmp_dir $mnt_dir + if [ $err_code -ne 0 ]; then + return $err_code + fi +} + _check_root() { local ksft_skip=4 diff --git a/tools/testing/selftests/ublk/test_loop_01.sh b/tools/testing/selftests/ublk/test_loop_01.sh new file mode 100755 index 0000000000000..829e8df059423 --- /dev/null +++ b/tools/testing/selftests/ublk/test_loop_01.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. 
test_common.sh + +TID="loop_01" +ERR_CODE=0 + +_prep_test "loop" "write and verify test" + +backfile_0=`_create_backfile 256M` + +dev_id=`_add_ublk_dev -t loop $backfile_0` + +# run fio over the ublk disk +fio --name=write_and_verify \ + --filename=/dev/ublkb${dev_id} \ + --ioengine=libaio --iodepth=16 \ + --rw=write \ + --size=256M \ + --direct=1 \ + --verify=crc32c \ + --do_verify=1 \ + --bs=4k > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test ${dev_id} "loop" + +_remove_backfile $backfile_0 + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_loop_02.sh b/tools/testing/selftests/ublk/test_loop_02.sh new file mode 100755 index 0000000000000..c71ae63059b80 --- /dev/null +++ b/tools/testing/selftests/ublk/test_loop_02.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. test_common.sh + +TID="loop_02" +ERR_CODE=0 + +_prep_test "loop" "mkfs & mount & umount" + +backfile_0=`_create_backfile 256M` + +dev_id=`_add_ublk_dev -t loop $backfile_0` + +_mkfs_mount_test /dev/ublkb${dev_id} +ERR_CODE=$? + +_cleanup_test ${dev_id} "loop" + +_remove_backfile $backfile_0 + +_show_result $TID $ERR_CODE From bedc9cbc5f9709b97646fe3423dbf530b74b09d5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 1 Mar 2025 00:19:16 +0800 Subject: [PATCH 066/108] selftests: ublk: add ublk zero copy test Enable zero copy on file backed target, meantime add one fio test for covering write verify, another test for mkfs/mount/umount. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250228161919.2869102-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 2 + tools/testing/selftests/ublk/file_backed.c | 104 +++++++++++++++---- tools/testing/selftests/ublk/kublk.c | 25 ++++- tools/testing/selftests/ublk/kublk.h | 70 +++++++++++++ tools/testing/selftests/ublk/test_common.sh | 8 ++ tools/testing/selftests/ublk/test_loop_03.sh | 33 ++++++ tools/testing/selftests/ublk/test_loop_04.sh | 22 ++++ 7 files changed, 241 insertions(+), 23 deletions(-) create mode 100755 tools/testing/selftests/ublk/test_loop_03.sh create mode 100755 tools/testing/selftests/ublk/test_loop_04.sh diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 7815aaaefc250..555a3ba5b4812 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -6,6 +6,8 @@ LDLIBS += -lpthread -lm -luring TEST_PROGS := test_null_01.sh TEST_PROGS += test_loop_01.sh TEST_PROGS += test_loop_02.sh +TEST_PROGS += test_loop_03.sh +TEST_PROGS += test_loop_04.sh TEST_GEN_PROGS_EXTENDED = kublk diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index 64e0a37746eb0..38e68b414962d 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -54,48 +54,94 @@ static int backing_file_tgt_init(struct ublk_dev *dev) return 0; } +static enum io_uring_op ublk_to_uring_op(const struct ublksrv_io_desc *iod, int zc) +{ + unsigned ublk_op = ublksrv_get_op(iod); + + if (ublk_op == UBLK_IO_OP_READ) + return zc ? IORING_OP_READ_FIXED : IORING_OP_READ; + else if (ublk_op == UBLK_IO_OP_WRITE) + return zc ? 
IORING_OP_WRITE_FIXED : IORING_OP_WRITE; + assert(0); +} + +static int loop_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) +{ + int zc = ublk_queue_use_zc(q); + enum io_uring_op op = ublk_to_uring_op(iod, zc); + struct io_uring_sqe *reg; + struct io_uring_sqe *rw; + struct io_uring_sqe *ureg; + + if (!zc) { + rw = ublk_queue_alloc_sqe(q); + if (!rw) + return -ENOMEM; + + io_uring_prep_rw(op, rw, 1 /*fds[1]*/, + (void *)iod->addr, + iod->nr_sectors << 9, + iod->start_sector << 9); + io_uring_sqe_set_flags(rw, IOSQE_FIXED_FILE); + q->io_inflight++; + /* bit63 marks us as tgt io */ + rw->user_data = build_user_data(tag, op, UBLK_IO_TGT_NORMAL, 1); + return 0; + } + + ublk_queue_alloc_sqe3(q, ®, &rw, &ureg); + + io_uring_prep_buf_register(reg, 0, tag, q->q_id, tag); + reg->user_data = build_user_data(tag, 0xfe, 1, 1); + reg->flags |= IOSQE_CQE_SKIP_SUCCESS; + reg->flags |= IOSQE_IO_LINK; + + io_uring_prep_rw(op, rw, 1 /*fds[1]*/, 0, + iod->nr_sectors << 9, + iod->start_sector << 9); + rw->buf_index = tag; + rw->flags |= IOSQE_FIXED_FILE; + rw->flags |= IOSQE_IO_LINK; + rw->user_data = build_user_data(tag, op, UBLK_IO_TGT_ZC_OP, 1); + q->io_inflight++; + + io_uring_prep_buf_unregister(ureg, 0, tag, q->q_id, tag); + ureg->user_data = build_user_data(tag, 0xff, UBLK_IO_TGT_ZC_BUF, 1); + q->io_inflight++; + + return 0; +} + static int loop_queue_tgt_io(struct ublk_queue *q, int tag) { const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); - struct io_uring_sqe *sqe = ublk_queue_alloc_sqe(q); unsigned ublk_op = ublksrv_get_op(iod); - - if (!sqe) - return -ENOMEM; + struct io_uring_sqe *sqe; switch (ublk_op) { case UBLK_IO_OP_FLUSH: + sqe = ublk_queue_alloc_sqe(q); + if (!sqe) + return -ENOMEM; io_uring_prep_sync_file_range(sqe, 1 /*fds[1]*/, iod->nr_sectors << 9, iod->start_sector << 9, IORING_FSYNC_DATASYNC); io_uring_sqe_set_flags(sqe, IOSQE_FIXED_FILE); + q->io_inflight++; + sqe->user_data = build_user_data(tag, ublk_op, UBLK_IO_TGT_NORMAL, 1); break; case UBLK_IO_OP_WRITE_ZEROES: case UBLK_IO_OP_DISCARD: return -ENOTSUP; case UBLK_IO_OP_READ: - io_uring_prep_read(sqe, 1 /*fds[1]*/, - (void *)iod->addr, - iod->nr_sectors << 9, - iod->start_sector << 9); - io_uring_sqe_set_flags(sqe, IOSQE_FIXED_FILE); - break; case UBLK_IO_OP_WRITE: - io_uring_prep_write(sqe, 1 /*fds[1]*/, - (void *)iod->addr, - iod->nr_sectors << 9, - iod->start_sector << 9); - io_uring_sqe_set_flags(sqe, IOSQE_FIXED_FILE); + loop_queue_tgt_rw_io(q, iod, tag); break; default: return -EINVAL; } - q->io_inflight++; - /* bit63 marks us as tgt io */ - sqe->user_data = build_user_data(tag, ublk_op, 0, 1); - ublk_dbg(UBLK_DBG_IO, "%s: tag %d ublk io %x %llx %u\n", __func__, tag, iod->op_flags, iod->start_sector, iod->nr_sectors << 9); return 1; @@ -115,9 +161,22 @@ static void ublk_loop_io_done(struct ublk_queue *q, int tag, const struct io_uring_cqe *cqe) { int cqe_tag = user_data_to_tag(cqe->user_data); + unsigned tgt_data = user_data_to_tgt_data(cqe->user_data); + int res = cqe->res; + + if (res < 0 || tgt_data == UBLK_IO_TGT_NORMAL) + goto complete; + if (tgt_data == UBLK_IO_TGT_ZC_OP) { + ublk_set_io_res(q, tag, cqe->res); + goto exit; + } + assert(tgt_data == UBLK_IO_TGT_ZC_BUF); + res = ublk_get_io_res(q, tag); +complete: assert(tag == cqe_tag); - ublk_complete_io(q, tag, cqe->res); + ublk_complete_io(q, tag, res); +exit: q->io_inflight--; } @@ -126,7 +185,7 @@ static int ublk_loop_tgt_init(struct ublk_dev *dev) unsigned long long bytes; int ret; struct ublk_params p = { - .types = 
UBLK_PARAM_TYPE_BASIC, + .types = UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DMA_ALIGN, .basic = { .logical_bs_shift = 9, .physical_bs_shift = 12, @@ -134,6 +193,9 @@ static int ublk_loop_tgt_init(struct ublk_dev *dev) .io_min_shift = 9, .max_sectors = dev->dev_info.max_io_buf_bytes >> 9, }, + .dma = { + .alignment = 511, + }, }; assert(dev->tgt.nr_backing_files == 1); diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index f7b9ede17bb3c..b65bdaf7e2815 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -282,6 +282,8 @@ static void ublk_queue_deinit(struct ublk_queue *q) int i; int nr_ios = q->q_depth; + io_uring_unregister_buffers(&q->ring); + io_uring_unregister_ring_fd(&q->ring); if (q->ring.ring_fd > 0) { @@ -312,6 +314,11 @@ static int ublk_queue_init(struct ublk_queue *q) q->cmd_inflight = 0; q->tid = gettid(); + if (dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY) { + q->state |= UBLKSRV_NO_BUF; + q->state |= UBLKSRV_ZC; + } + cmd_buf_size = ublk_queue_cmd_buf_sz(q); off = UBLKSRV_CMD_BUF_OFFSET + q->q_id * ublk_queue_max_cmd_buf_sz(); q->io_cmd_buf = (char *)mmap(0, cmd_buf_size, PROT_READ, @@ -346,6 +353,15 @@ static int ublk_queue_init(struct ublk_queue *q) goto fail; } + if (dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY) { + ret = io_uring_register_buffers_sparse(&q->ring, q->q_depth); + if (ret) { + ublk_err("ublk dev %d queue %d register spare buffers failed %d", + dev->dev_info.dev_id, q->q_id, ret); + goto fail; + } + } + io_uring_register_ring_fd(&q->ring); ret = io_uring_register_files(&q->ring, dev->fds, dev->nr_fds); @@ -502,9 +518,10 @@ static void ublk_handle_cqe(struct io_uring *r, ublk_err("%s: res %d userdata %llx queue state %x\n", __func__, cqe->res, cqe->user_data, q->state); - ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (qid %d tag %u cmd_op %u target %d) stopping %d\n", + ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (qid %d tag %u cmd_op %u target %d/%d) stopping %d\n", __func__, cqe->res, q->q_id, tag, cmd_op, is_target_io(cqe->user_data), + user_data_to_tgt_data(cqe->user_data), (q->state & UBLKSRV_QUEUE_STOPPING)); /* Don't retrieve io in case of target io */ @@ -1022,6 +1039,7 @@ int main(int argc, char *argv[]) { "depth", 1, NULL, 'd' }, { "debug_mask", 1, NULL, 0 }, { "quiet", 0, NULL, 0 }, + { "zero_copy", 1, NULL, 'z' }, { 0, 0, 0, 0 } }; int option_idx, opt; @@ -1038,7 +1056,7 @@ int main(int argc, char *argv[]) return ret; optind = 2; - while ((opt = getopt_long(argc, argv, "t:n:d:q:a", + while ((opt = getopt_long(argc, argv, "t:n:d:q:a:z", longopts, &option_idx)) != -1) { switch (opt) { case 'a': @@ -1057,6 +1075,9 @@ int main(int argc, char *argv[]) case 'd': ctx.queue_depth = strtol(optarg, NULL, 10); break; + case 'z': + ctx.flags |= UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_USER_COPY; + break; case 0: if (!strcmp(longopts[option_idx].name, "debug_mask")) ublk_dbg_mask = strtol(optarg, NULL, 16); diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index d71c6c3767a9f..8e43aebf7dfc3 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -42,6 +42,10 @@ #define UBLK_MAX_QUEUES 4 #define UBLK_QUEUE_DEPTH 128 +#define UBLK_IO_TGT_NORMAL 0 +#define UBLK_IO_TGT_ZC_BUF 1 +#define UBLK_IO_TGT_ZC_OP 2 + #define UBLK_DBG_DEV (1U << 0) #define UBLK_DBG_QUEUE (1U << 1) #define UBLK_DBG_IO_CMD (1U << 2) @@ -124,6 +128,7 @@ struct ublk_queue { #define UBLKSRV_QUEUE_STOPPING (1U << 0) #define UBLKSRV_QUEUE_IDLE (1U << 1) 
#define UBLKSRV_NO_BUF (1U << 2) +#define UBLKSRV_ZC (1U << 3) unsigned state; pid_t tid; pthread_t thread; @@ -180,6 +185,11 @@ static inline unsigned int user_data_to_op(__u64 user_data) return (user_data >> 16) & 0xff; } +static inline unsigned int user_data_to_tgt_data(__u64 user_data) +{ + return (user_data >> 24) & 0xffff; +} + static inline void ublk_err(const char *fmt, ...) { va_list ap; @@ -217,11 +227,66 @@ static inline struct io_uring_sqe *ublk_queue_alloc_sqe(struct ublk_queue *q) return io_uring_get_sqe(&q->ring); } +static inline void ublk_queue_alloc_sqe3(struct ublk_queue *q, + struct io_uring_sqe **sqe1, struct io_uring_sqe **sqe2, + struct io_uring_sqe **sqe3) +{ + struct io_uring *r = &q->ring; + unsigned left = io_uring_sq_space_left(r); + + if (left < 3) + io_uring_submit(r); + + *sqe1 = io_uring_get_sqe(r); + *sqe2 = io_uring_get_sqe(r); + *sqe3 = io_uring_get_sqe(r); +} + +static inline void io_uring_prep_buf_register(struct io_uring_sqe *sqe, + int dev_fd, int tag, int q_id, __u64 index) +{ + struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd; + + io_uring_prep_read(sqe, dev_fd, 0, 0, 0); + sqe->opcode = IORING_OP_URING_CMD; + sqe->flags |= IOSQE_FIXED_FILE; + sqe->cmd_op = UBLK_U_IO_REGISTER_IO_BUF; + + cmd->tag = tag; + cmd->addr = index; + cmd->q_id = q_id; +} + +static inline void io_uring_prep_buf_unregister(struct io_uring_sqe *sqe, + int dev_fd, int tag, int q_id, __u64 index) +{ + struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd; + + io_uring_prep_read(sqe, dev_fd, 0, 0, 0); + sqe->opcode = IORING_OP_URING_CMD; + sqe->flags |= IOSQE_FIXED_FILE; + sqe->cmd_op = UBLK_U_IO_UNREGISTER_IO_BUF; + + cmd->tag = tag; + cmd->addr = index; + cmd->q_id = q_id; +} + static inline void *ublk_get_sqe_cmd(const struct io_uring_sqe *sqe) { return (void *)&sqe->cmd; } +static inline void ublk_set_io_res(struct ublk_queue *q, int tag, int res) +{ + q->ios[tag].result = res; +} + +static inline int ublk_get_io_res(const struct ublk_queue *q, unsigned tag) +{ + return q->ios[tag].result; +} + static inline void ublk_mark_io_done(struct ublk_io *io, int res) { io->flags |= (UBLKSRV_NEED_COMMIT_RQ_COMP | UBLKSRV_IO_FREE); @@ -250,6 +315,11 @@ static inline int ublk_complete_io(struct ublk_queue *q, unsigned tag, int res) return ublk_queue_io_cmd(q, io, tag); } +static inline int ublk_queue_use_zc(const struct ublk_queue *q) +{ + return q->state & UBLKSRV_ZC; +} + extern const struct ublk_tgt_ops null_tgt_ops; extern const struct ublk_tgt_ops loop_tgt_ops; diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index ad40c1511f414..304f22ffda581 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -102,4 +102,12 @@ _add_ublk_dev() { echo ${dev_id} } +_have_feature() +{ + if $UBLK_PROG "features" | grep $1 > /dev/null 2>&1; then + return 0 + fi + return 1 +} + export UBLK_PROG=$(pwd)/kublk diff --git a/tools/testing/selftests/ublk/test_loop_03.sh b/tools/testing/selftests/ublk/test_loop_03.sh new file mode 100755 index 0000000000000..e781ac6db6b4c --- /dev/null +++ b/tools/testing/selftests/ublk/test_loop_03.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. 
test_common.sh + +TID="loop_03" +ERR_CODE=0 + +_have_feature "ZERO_COPY" || exit 4 + +_prep_test "loop" "write and verify over zero copy" + +backfile_0=`_create_backfile 256M` + +dev_id=`_add_ublk_dev -t loop $backfile_0 -z` + +# run fio over the ublk disk +fio --name=write_and_verify \ + --filename=/dev/ublkb${dev_id} \ + --ioengine=libaio --iodepth=64 \ + --rw=write \ + --size=256M \ + --direct=1 \ + --verify=crc32c \ + --do_verify=1 \ + --bs=4k > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test ${dev_id} "loop" + +_remove_backfile $backfile_0 + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_loop_04.sh b/tools/testing/selftests/ublk/test_loop_04.sh new file mode 100755 index 0000000000000..6ab67247c809a --- /dev/null +++ b/tools/testing/selftests/ublk/test_loop_04.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. test_common.sh + +TID="loop_04" +ERR_CODE=0 + +_prep_test "loop" "mkfs & mount & umount with zero copy" + +backfile_0=`_create_backfile 256M` + +dev_id=`_add_ublk_dev -t loop -z $backfile_0` + +_mkfs_mount_test /dev/ublkb${dev_id} +ERR_CODE=$? + +_cleanup_test ${dev_id} "loop" + +_remove_backfile $backfile_0 + +_show_result $TID $ERR_CODE From 2fced37638a897be4e0ac724d93a23a4e38633a6 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 28 Feb 2025 15:30:56 -0700 Subject: [PATCH 067/108] io_uring/rsrc: use rq_data_dir() to compute bvec dir The macro rq_data_dir() already computes a request's data direction. Use it in place of the if-else to set imu->dir. Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250228223057.615284-1-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 45bfb37bca1e6..3107a03d56b83 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -959,11 +959,7 @@ int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, imu->release = release; imu->priv = rq; imu->is_kbuf = true; - - if (op_is_write(req_op(rq))) - imu->dir = IO_IMU_SOURCE; - else - imu->dir = IO_IMU_DEST; + imu->dir = 1 << rq_data_dir(rq); bvec = imu->bvec; rq_for_each_bvec(bv, rq, rq_iter) From 0c542a69cbcd1fefad32c59cea7a80413fe60922 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 28 Feb 2025 15:15:13 -0700 Subject: [PATCH 068/108] io_uring/uring_cmd: specify io_uring_cmd_import_fixed() pointer type io_uring_cmd_import_fixed() takes a struct io_uring_cmd *, but the type of the ioucmd parameter is void *. Make the pointer type explicit so the compiler can type check it. 
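As a rough illustration of the benefit (a hypothetical user-space sketch, not io_uring code; the struct and function names are invented), a void * parameter accepts any object pointer silently, while a typed parameter lets the compiler flag a mismatched argument:

	struct cmd { int opcode; };
	struct req { int flags; };

	void import_untyped(void *cmd) { (void)cmd; }
	void import_typed(struct cmd *cmd) { (void)cmd; }

	int main(void)
	{
		struct req r = { 0 };

		import_untyped(&r);              /* wrong object, accepted silently */
		/* import_typed(&r); */          /* rejected: incompatible pointer type */
		import_typed((struct cmd *)&r);  /* a mistake now needs an explicit cast */
		return 0;
	}
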
Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250228221514.604350-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 10 ++++++---- io_uring/uring_cmd.c | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index cf8d80d847344..5bc4f0d58506d 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -40,7 +40,8 @@ static inline void io_uring_cmd_private_sz_check(size_t cmd_sz) #if defined(CONFIG_IO_URING) int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd, + struct iov_iter *iter, + struct io_uring_cmd *ioucmd, unsigned int issue_flags); /* @@ -68,9 +69,10 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd); #else -static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd, - unsigned int issue_flags) +static inline int +io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, + struct iov_iter *iter, struct io_uring_cmd *ioucmd, + unsigned int issue_flags) { return -EOPNOTSUPP; } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 31d5e0948af14..de39b602aa829 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -245,7 +245,8 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) } int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd, + struct iov_iter *iter, + struct io_uring_cmd *ioucmd, unsigned int issue_flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); From 09fdd35162c289f354326a55d552a8858f6e8072 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 28 Feb 2025 16:03:04 -0700 Subject: [PATCH 069/108] io_uring: convert cmd_to_io_kiocb() macro to function The cmd_to_io_kiocb() macro applies a pointer cast to its input without parenthesizing it. Currently all inputs are variable names, so this has the intended effect. But since casts have relatively high precedence, the macro would apply the cast to the wrong value if the input was a pointer addition, for example. Turn the macro into a static inline function to ensure the pointer cast is applied to the full input value. Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250228230305.630885-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 432c98ff52ee9..72aac84dca93d 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -607,7 +607,11 @@ static inline void io_kiocb_cmd_sz_check(size_t cmd_sz) io_kiocb_cmd_sz_check(sizeof(cmd_type)) , \ ((cmd_type *)&(req)->cmd) \ ) -#define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr) + +static inline struct io_kiocb *cmd_to_io_kiocb(void *ptr) +{ + return ptr; +} struct io_kiocb { union { From e6ea7ec494881bcf61b8f0f77f7cb3542f717ff2 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 28 Feb 2025 16:14:31 -0700 Subject: [PATCH 070/108] io_uring/ublk: report error when unregister operation fails Indicate to userspace applications if a UBLK_IO_UNREGISTER_IO_BUF command specifies an invalid buffer index by returning an error code. 
Return -EINVAL if no buffer is registered with the given index, and -EBUSY if the registered buffer is not a kernel bvec. Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250228231432.642417-1-csander@purestorage.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 3 +-- include/linux/io_uring/cmd.h | 4 ++-- io_uring/rsrc.c | 18 ++++++++++++++---- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index b5cf92baaf0f0..512cbd456817b 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1787,8 +1787,7 @@ static int ublk_unregister_io_buf(struct io_uring_cmd *cmd, const struct ublksrv_io_cmd *ub_cmd, unsigned int issue_flags) { - io_buffer_unregister_bvec(cmd, ub_cmd->addr, issue_flags); - return 0; + return io_buffer_unregister_bvec(cmd, ub_cmd->addr, issue_flags); } static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 5bc4f0d58506d..598cacda4aa36 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -131,7 +131,7 @@ static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_ur int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, void (*release)(void *), unsigned int index, unsigned int issue_flags); -void io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, - unsigned int issue_flags); +int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, + unsigned int issue_flags); #endif /* _LINUX_IO_URING_CMD_H */ diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 3107a03d56b83..c9105030f0e37 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -973,26 +973,36 @@ int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, } EXPORT_SYMBOL_GPL(io_buffer_register_bvec); -void io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, - unsigned int issue_flags) +int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, + unsigned int issue_flags) { struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; struct io_rsrc_data *data = &ctx->buf_table; struct io_rsrc_node *node; + int ret = 0; io_ring_submit_lock(ctx, issue_flags); - if (index >= data->nr) + if (index >= data->nr) { + ret = -EINVAL; goto unlock; + } index = array_index_nospec(index, data->nr); node = data->nodes[index]; - if (!node || !node->buf->is_kbuf) + if (!node) { + ret = -EINVAL; goto unlock; + } + if (!node->buf->is_kbuf) { + ret = -EBUSY; + goto unlock; + } io_put_rsrc_node(ctx, node); data->nodes[index] = NULL; unlock: io_ring_submit_unlock(ctx, issue_flags); + return ret; } EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec); From bf931be52e5dad336a7576b028567e9179d6278c Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 28 Feb 2025 17:16:07 -0700 Subject: [PATCH 071/108] io_uring/rsrc: declare io_find_buf_node() in header file Declare io_find_buf_node() in io_uring/rsrc.h so it can be called from other files. 
Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250301001610.678223-1-csander@purestorage.com [axboe: keep the inline for local hot path usage] Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 4 ++-- io_uring/rsrc.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index c9105030f0e37..ee4bfdd76f6bc 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1074,8 +1074,8 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, return 0; } -static inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, - unsigned issue_flags) +inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, + unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_rsrc_node *node; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 6fe7b9e615bf5..8f912aa6bcc91 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -55,6 +55,8 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node); void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data); int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr); +struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, + unsigned issue_flags); int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, u64 buf_addr, size_t len, int ddir, unsigned issue_flags); From 6e83a442fbbb5f136c50feb7d137017610bc0738 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 28 Feb 2025 17:16:08 -0700 Subject: [PATCH 072/108] io_uring/nop: use io_find_buf_node() Call io_find_buf_node() to avoid duplicating it in io_nop(). Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250301001610.678223-2-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/nop.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/io_uring/nop.c b/io_uring/nop.c index ea539531cb5f6..28f06285fdc25 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -61,17 +61,8 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags) } } if (nop->flags & IORING_NOP_FIXED_BUFFER) { - struct io_ring_ctx *ctx = req->ctx; - struct io_rsrc_node *node; - - ret = -EFAULT; - io_ring_submit_lock(ctx, issue_flags); - node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); - if (node) { - io_req_assign_buf_node(req, node); - ret = 0; - } - io_ring_submit_unlock(ctx, issue_flags); + if (!io_find_buf_node(req, issue_flags)) + ret = -EFAULT; } done: if (ret < 0) From 9e12d09cfdaf89db894abdad392bb8dcd6c0f464 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Sat, 1 Mar 2025 12:03:16 -0700 Subject: [PATCH 073/108] ublk: don't cast registered buffer index to int io_buffer_register_bvec() takes index as an unsigned int argument, but ublk_register_io_buf() casts ub_cmd->addr (a u64) to int. Remove the misleading cast and instead pass index as an unsigned value to ublk_register_io_buf() and ublk_unregister_io_buf(). 
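A small user-space sketch of why the int cast reads wrong (the value is made up; this is not driver code): both conversions keep the same low 32 bits, but the signed view makes a large index print as a negative number, while the registration helpers take an unsigned int:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t addr = 0xffffffffULL;   /* hypothetical ub_cmd->addr */

		int as_int = (int)addr;          /* typically -1 on common ABIs */
		unsigned int as_uint = addr;     /* 4294967295, matching the API type */

		printf("int=%d unsigned=%u\n", as_int, as_uint);
		return 0;
	}
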
Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250301190317.950208-1-csander@purestorage.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 512cbd456817b..af5a4ff4bd3d9 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1762,12 +1762,11 @@ static void ublk_io_release(void *priv) static int ublk_register_io_buf(struct io_uring_cmd *cmd, struct ublk_queue *ubq, unsigned int tag, - const struct ublksrv_io_cmd *ub_cmd, - unsigned int issue_flags) + unsigned int index, unsigned int issue_flags) { struct ublk_device *ub = cmd->file->private_data; - int index = (int)ub_cmd->addr, ret; struct request *req; + int ret; req = __ublk_check_and_get_req(ub, ubq, tag, 0); if (!req) @@ -1784,10 +1783,9 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd, } static int ublk_unregister_io_buf(struct io_uring_cmd *cmd, - const struct ublksrv_io_cmd *ub_cmd, - unsigned int issue_flags) + unsigned int index, unsigned int issue_flags) { - return io_buffer_unregister_bvec(cmd, ub_cmd->addr, issue_flags); + return io_buffer_unregister_bvec(cmd, index, issue_flags); } static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, @@ -1842,9 +1840,9 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, ret = -EINVAL; switch (_IOC_NR(cmd_op)) { case UBLK_IO_REGISTER_IO_BUF: - return ublk_register_io_buf(cmd, ubq, tag, ub_cmd, issue_flags); + return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags); case UBLK_IO_UNREGISTER_IO_BUF: - return ublk_unregister_io_buf(cmd, ub_cmd, issue_flags); + return ublk_unregister_io_buf(cmd, ub_cmd->addr, issue_flags); case UBLK_IO_FETCH_REQ: /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */ if (ublk_queue_ready(ubq)) { From a1967280a1e5fb2c331f23d162b3672d64ba2549 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Sat, 1 Mar 2025 11:36:11 -0700 Subject: [PATCH 074/108] io_uring/rsrc: include io_uring_types.h in rsrc.h io_uring/rsrc.h uses several types from include/linux/io_uring_types.h. Include io_uring_types.h explicitly in rsrc.h to avoid depending on users of rsrc.h including io_uring_types.h first. Signed-off-by: Caleb Sander Mateos Reviewed-by: Li Zetao Link: https://lore.kernel.org/r/20250301183612.937529-1-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/rsrc.h | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 8f912aa6bcc91..f10a1252b3e95 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -2,6 +2,7 @@ #ifndef IOU_RSRC_H #define IOU_RSRC_H +#include #include enum { From 6a53541829662c8f1357f522a1d6315179442bf7 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 28 Feb 2025 16:59:10 -0700 Subject: [PATCH 075/108] io_uring/rsrc: split out io_free_node() helper Split the freeing of the io_rsrc_node from io_free_rsrc_node(), for use with nodes that haven't been fully initialized. 
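The general shape of the split, as a hedged stand-alone sketch with toy types (not the io_uring structures): the full destructor assumes an attached buffer, so a raw free-only helper is what an error path wants when construction stopped part-way:

	#include <stdlib.h>

	struct buf  { char *data; };
	struct node { struct buf *buf; };

	void free_node(struct node *node)
	{
		free(node);                 /* just return the memory */
	}

	void free_full_node(struct node *node)
	{
		free(node->buf->data);      /* assumes a buffer was attached */
		free(node->buf);
		free_node(node);
	}

	int main(void)
	{
		struct node *node = calloc(1, sizeof(*node));

		if (!node)
			return 1;
		/* pretend attaching node->buf failed here */
		free_node(node);            /* full teardown would dereference NULL here */
		return 0;
	}
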
Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250228235916.670437-1-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index ee4bfdd76f6bc..c8d7284f20b8c 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -487,6 +487,12 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } +static void io_free_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) +{ + if (!io_alloc_cache_put(&ctx->node_cache, node)) + kvfree(node); +} + void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { if (node->tag) @@ -506,8 +512,7 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) break; } - if (!io_alloc_cache_put(&ctx->node_cache, node)) - kvfree(node); + io_free_node(ctx, node); } int io_sqe_files_unregister(struct io_ring_ctx *ctx) From a387b96d2a9687201318826d23c770eb794c778e Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 28 Feb 2025 16:59:11 -0700 Subject: [PATCH 076/108] io_uring/rsrc: free io_rsrc_node using kfree() io_rsrc_node_alloc() calls io_cache_alloc(), which uses kmalloc() to allocate the node. So it can be freed with kfree() instead of kvfree(). Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250228235916.670437-2-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index c8d7284f20b8c..19e9de040a200 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -490,7 +490,7 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags) static void io_free_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { if (!io_alloc_cache_put(&ctx->node_cache, node)) - kvfree(node); + kfree(node); } void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) From 13f7f9686e928dae352972a1a95b50b2d5e80d42 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 28 Feb 2025 16:59:12 -0700 Subject: [PATCH 077/108] io_uring/rsrc: call io_free_node() on io_sqe_buffer_register() failure io_sqe_buffer_register() currently calls io_put_rsrc_node() if it fails to fully set up the io_rsrc_node. io_put_rsrc_node() is more involved than necessary, since we already know the reference count will reach 0 and no io_mapped_ubuf has been attached to the node yet. So just call io_free_node() to release the node's memory. This also avoids the need to temporarily set the node's buf pointer to NULL. 
Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250228235916.670437-3-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 19e9de040a200..8c1717c53d672 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -782,7 +782,6 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); if (!node) return ERR_PTR(-ENOMEM); - node->buf = NULL; ret = -ENOMEM; pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, @@ -839,7 +838,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, if (imu) io_free_imu(ctx, imu); if (node) - io_put_rsrc_node(ctx, node); + io_free_node(ctx, node); node = ERR_PTR(ret); } kvfree(pages); From 6e5d321a08e30f746d63fc56e7ea5c46b06fbe99 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 28 Feb 2025 16:59:13 -0700 Subject: [PATCH 078/108] io_uring/rsrc: avoid NULL node check on io_sqe_buffer_register() failure The done: label is only reachable if node is non-NULL. So don't bother checking, just call io_free_node(). Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250228235916.670437-4-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 8c1717c53d672..806f2c1f5d2e3 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -837,8 +837,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, if (ret) { if (imu) io_free_imu(ctx, imu); - if (node) - io_free_node(ctx, node); + io_free_node(ctx, node); node = ERR_PTR(ret); } kvfree(pages); From fe21a4532ef2a6852c89b352cb8ded0d37b4745c Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 28 Feb 2025 16:59:14 -0700 Subject: [PATCH 079/108] io_uring/rsrc: skip NULL file/buffer checks in io_free_rsrc_node() io_rsrc_node's of type IORING_RSRC_FILE always have a file attached immediately after they are allocated. IORING_RSRC_BUFFER nodes won't be returned from io_sqe_buffer_register()/io_buffer_register_bvec() until they have a io_mapped_ubuf attached. So remove the checks for a NULL file/buffer in io_free_rsrc_node(). Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250228235916.670437-5-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 806f2c1f5d2e3..3fb1bd616eef7 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -500,12 +500,10 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) switch (node->type) { case IORING_RSRC_FILE: - if (io_slot_file(node)) - fput(io_slot_file(node)); + fput(io_slot_file(node)); break; case IORING_RSRC_BUFFER: - if (node->buf) - io_buffer_unmap(ctx, node->buf); + io_buffer_unmap(ctx, node->buf); break; default: WARN_ON_ONCE(1); From 0d83b8a9f180436a84fbdeb575696b0c3ae0ac0c Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Tue, 4 Mar 2025 12:48:12 -0700 Subject: [PATCH 080/108] io_uring: introduce io_cache_free() helper Add a helper function io_cache_free() that returns an allocation to a io_alloc_cache, falling back on kfree() if the io_alloc_cache is full. 
This is the inverse of io_cache_alloc(), which takes an allocation from an io_alloc_cache and falls back on kmalloc() if the cache is empty. Convert 4 callers to use the helper. Signed-off-by: Caleb Sander Mateos Suggested-by: Li Zetao Link: https://lore.kernel.org/r/20250304194814.2346705-1-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/alloc_cache.h | 6 ++++++ io_uring/futex.c | 4 +--- io_uring/io_uring.c | 3 +-- io_uring/rsrc.c | 15 +++++---------- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index 0dd17d8ba93a8..7f68eff2e7f32 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -68,4 +68,10 @@ static inline void *io_cache_alloc(struct io_alloc_cache *cache, gfp_t gfp) return io_cache_alloc_new(cache, gfp); } +static inline void io_cache_free(struct io_alloc_cache *cache, void *obj) +{ + if (!io_alloc_cache_put(cache, obj)) + kfree(obj); +} + #endif diff --git a/io_uring/futex.c b/io_uring/futex.c index b7581766406ce..0ea4820cd8ff8 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -53,12 +53,10 @@ static void __io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) static void io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) { - struct io_futex_data *ifd = req->async_data; struct io_ring_ctx *ctx = req->ctx; io_tw_lock(ctx, tw); - if (!io_alloc_cache_put(&ctx->futex_cache, ifd)) - kfree(ifd); + io_cache_free(&ctx->futex_cache, req->async_data); __io_futex_complete(req, tw); } diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ccc343f61a573..58003fa6b327b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1422,8 +1422,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, if (apoll->double_poll) kfree(apoll->double_poll); - if (!io_alloc_cache_put(&ctx->apoll_cache, apoll)) - kfree(apoll); + io_cache_free(&ctx->apoll_cache, apoll); req->flags &= ~REQ_F_POLLED; } if (req->flags & IO_REQ_LINK_FLAGS) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 3fb1bd616eef7..5dd1e08275594 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -124,8 +124,9 @@ static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx, static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) { - if (imu->nr_bvecs > IO_CACHED_BVECS_SEGS || - !io_alloc_cache_put(&ctx->imu_cache, imu)) + if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS) + io_cache_free(&ctx->imu_cache, imu); + else kvfree(imu); } @@ -487,12 +488,6 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -static void io_free_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) -{ - if (!io_alloc_cache_put(&ctx->node_cache, node)) - kfree(node); -} - void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { if (node->tag) @@ -510,7 +505,7 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) break; } - io_free_node(ctx, node); + io_cache_free(&ctx->node_cache, node); } int io_sqe_files_unregister(struct io_ring_ctx *ctx) @@ -835,7 +830,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, if (ret) { if (imu) io_free_imu(ctx, imu); - io_free_node(ctx, node); + io_cache_free(&ctx->node_cache, node); node = ERR_PTR(ret); } kvfree(pages); From 30c970354ce2a4c6ad3a4c70040accd34082f477 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Wed, 5 Mar 2025 09:34:54 +0800 Subject: [PATCH 081/108] io_uring: Remove unused declaration io_alloc_async_data() Commit ef623a647f42 ("io_uring: Move old async data 
allocation helper to header") leave behind this unused declaration. Signed-off-by: Yue Haibing Link: https://lore.kernel.org/r/20250305013454.3635021-1-yuehaibing@huawei.com Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 1 - 1 file changed, 1 deletion(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index d5c9b7a6911d0..372129e24372d 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -88,7 +88,6 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd, void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, unsigned flags); -bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw); void io_req_task_queue_fail(struct io_kiocb *req, int ret); From 9894e0eaae980df1ed3f2e86a487fe4c8ef1ab46 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 3 Mar 2025 20:43:11 +0800 Subject: [PATCH 082/108] selftests: ublk: make ublk_stop_io_daemon() more reliable Improve ublk_stop_io_daemon() in the following ways: - don't wait if ->ublksrv_pid becomes -1, which means that the disk has been stopped - don't wait if ublk char device doesn't exist any more, so we can avoid to rely on inoitfy for wait until the char device is closed And this way may reduce time of delete command a lot. Signed-off-by: Ming Lei Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20250303124324.3563605-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index b65bdaf7e2815..2072d880fdc4e 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -691,13 +691,14 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) return ret; } -static int wait_ublk_dev(char *dev_name, int evt_mask, unsigned timeout) +static int wait_ublk_dev(const char *path, int evt_mask, unsigned timeout) { #define EV_SIZE (sizeof(struct inotify_event)) #define EV_BUF_LEN (128 * (EV_SIZE + 16)) struct pollfd pfd; int fd, wd; int ret = -EINVAL; + const char *dev_name = basename(path); fd = inotify_init(); if (fd < 0) { @@ -761,18 +762,23 @@ static int ublk_stop_io_daemon(const struct ublk_dev *dev) char ublkc[64]; int ret = 0; + if (daemon_pid < 0) + return 0; + /* daemon may be dead already */ if (kill(daemon_pid, 0) < 0) goto wait; - /* - * Wait until ublk char device is closed, when our daemon is shutdown - */ - snprintf(ublkc, sizeof(ublkc), "%s%d", "ublkc", dev_id); - ret = wait_ublk_dev(ublkc, IN_CLOSE_WRITE, 10); - /* double check and inotify may not be 100% reliable */ + snprintf(ublkc, sizeof(ublkc), "/dev/%s%d", "ublkc", dev_id); + + /* ublk char device may be gone already */ + if (access(ublkc, F_OK) != 0) + goto wait; + + /* Wait until ublk char device is closed, when the daemon is shutdown */ + ret = wait_ublk_dev(ublkc, IN_CLOSE, 10); + /* double check and since it may be closed before starting inotify */ if (ret == -ETIMEDOUT) - /* the daemon doesn't exist now if kill(0) fails */ ret = kill(daemon_pid, 0) < 0; wait: waitpid(daemon_pid, NULL, 0); @@ -910,8 +916,6 @@ static int __cmd_dev_del(struct dev_ctx *ctx) __func__, dev->dev_info.ublksrv_pid, number, ret); ublk_ctrl_del_dev(dev); fail: - if (ret >= 0) - ret = ublk_ctrl_get_info(dev); ublk_ctrl_deinit(dev); 
return (ret >= 0) ? 0 : ret; From 9d80f48c5e08b2e003e506c6e5326a35a652ea2f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 3 Mar 2025 20:43:12 +0800 Subject: [PATCH 083/108] selftests: ublk: fix build failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the following build failure: ublk//file_backed.c: In function ‘backing_file_tgt_init’: ublk//file_backed.c:28:42: error: ‘O_DIRECT’ undeclared (first use in this function); did you mean ‘O_DIRECTORY’? 28 | fd = open(file, O_RDWR | O_DIRECT); | ^~~~~~~~ | O_DIRECTORY when trying to reuse this same utility for liburing test. Signed-off-by: Ming Lei Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20250303124324.3563605-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 8e43aebf7dfc3..8f48eb8568ab4 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include From 2ecdcdfee58c028c15ed00b691104249370db075 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 3 Mar 2025 20:43:13 +0800 Subject: [PATCH 084/108] selftests: ublk: add --foreground command line Add --foreground command for helping to debug. Signed-off-by: Ming Lei Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20250303124324.3563605-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 17 +++++++++++++---- tools/testing/selftests/ublk/kublk.h | 1 + 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 2072d880fdc4e..24557a3e55082 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -679,7 +679,10 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) } ublk_ctrl_get_info(dev); - ublk_send_dev_event(ctx, dev->dev_info.dev_id); + if (ctx->fg) + ublk_ctrl_dump(dev); + else + ublk_send_dev_event(ctx, dev->dev_info.dev_id); /* wait until we are terminated */ for (i = 0; i < dinfo->nr_hw_queues; i++) @@ -867,6 +870,9 @@ static int cmd_dev_add(struct dev_ctx *ctx) { int res; + if (ctx->fg) + goto run; + ctx->_evtfd = eventfd(0, 0); if (ctx->_evtfd < 0) { ublk_err("%s: failed to create eventfd %s\n", __func__, strerror(errno)); @@ -876,8 +882,9 @@ static int cmd_dev_add(struct dev_ctx *ctx) setsid(); res = fork(); if (res == 0) { - __cmd_dev_add(ctx); - exit(EXIT_SUCCESS); +run: + res = __cmd_dev_add(ctx); + return res; } else if (res > 0) { uint64_t id; @@ -1044,6 +1051,7 @@ int main(int argc, char *argv[]) { "debug_mask", 1, NULL, 0 }, { "quiet", 0, NULL, 0 }, { "zero_copy", 1, NULL, 'z' }, + { "foreground", 0, NULL, 0 }, { 0, 0, 0, 0 } }; int option_idx, opt; @@ -1087,7 +1095,8 @@ int main(int argc, char *argv[]) ublk_dbg_mask = strtol(optarg, NULL, 16); if (!strcmp(longopts[option_idx].name, "quiet")) ublk_dbg_mask = 0; - break; + if (!strcmp(longopts[option_idx].name, "foreground")) + ctx.fg = 1; } } diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 8f48eb8568ab4..26d9aa9c5ca29 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -67,6 +67,7 @@ struct dev_ctx { char *files[MAX_BACK_FILES]; unsigned int logging:1; unsigned int all:1; + unsigned int fg:1; int 
_evtfd; }; From 5b2db7a8c7e4cb0715d7eb04182aca8ee1eb30d5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 3 Mar 2025 20:43:14 +0800 Subject: [PATCH 085/108] selftests: ublk: fix parsing '-a' argument The argument of '-a' doesn't follow any value, so fix it by putting it with '-z' together. Fixes: bedc9cbc5f97 ("selftests: ublk: add ublk zero copy test") Signed-off-by: Ming Lei Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20250303124324.3563605-5-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 24557a3e55082..148355717ee72 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1068,7 +1068,7 @@ int main(int argc, char *argv[]) return ret; optind = 2; - while ((opt = getopt_long(argc, argv, "t:n:d:q:a:z", + while ((opt = getopt_long(argc, argv, "t:n:d:q:az", longopts, &option_idx)) != -1) { switch (opt) { case 'a': From 632051ffbd90ba39947230297871c34e58e9fdad Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 3 Mar 2025 20:43:15 +0800 Subject: [PATCH 086/108] selftests: ublk: support shellcheck and fix all warning Add shellcheck, meantime fixes all warnings. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250303124324.3563605-6-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 3 ++ tools/testing/selftests/ublk/test_common.sh | 57 +++++++++++--------- tools/testing/selftests/ublk/test_loop_01.sh | 10 ++-- tools/testing/selftests/ublk/test_loop_02.sh | 10 ++-- tools/testing/selftests/ublk/test_loop_03.sh | 10 ++-- tools/testing/selftests/ublk/test_loop_04.sh | 10 ++-- tools/testing/selftests/ublk/test_null_01.sh | 6 +-- 7 files changed, 58 insertions(+), 48 deletions(-) diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 555a3ba5b4812..9415f6f6df484 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -14,3 +14,6 @@ TEST_GEN_PROGS_EXTENDED = kublk include ../lib.mk $(TEST_GEN_PROGS_EXTENDED): kublk.c null.c file_backed.c + +check: + shellcheck -x -f gcc *.sh diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 304f22ffda581..61044cb581381 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -3,46 +3,49 @@ _create_backfile() { local my_size=$1 - local my_file=`mktemp ublk_bpf_${my_size}_XXXXX` + local my_file - truncate -s ${my_size} ${my_file} - echo $my_file + my_file=$(mktemp ublk_file_"${my_size}"_XXXXX) + truncate -s "${my_size}" "${my_file}" + echo "$my_file" } _remove_backfile() { local file=$1 - [ -f "$file" ] && rm -f $file + [ -f "$file" ] && rm -f "$file" } _create_tmp_dir() { - local my_file=`mktemp -d ublk_bpf_dir_XXXXX` + local my_file; - echo $my_file + my_file=$(mktemp -d ublk_dir_XXXXX) + echo "$my_file" } _remove_tmp_dir() { local dir=$1 - [ -d "$dir" ] && rmdir $dir + [ -d "$dir" ] && rmdir "$dir" } _mkfs_mount_test() { local dev=$1 local err_code=0 - local mnt_dir=`_create_tmp_dir` + local mnt_dir; - mkfs.ext4 -F $dev > /dev/null 2>&1 + mnt_dir=$(_create_tmp_dir) + mkfs.ext4 -F "$dev" > /dev/null 2>&1 err_code=$? 
if [ $err_code -ne 0 ]; then return $err_code fi - mount -t ext4 $dev $mnt_dir > /dev/null 2>&1 - umount $dev + mount -t ext4 "$dev" "$mnt_dir" > /dev/null 2>&1 + umount "$dev" err_code=$? - _remove_tmp_dir $mnt_dir + _remove_tmp_dir "$mnt_dir" if [ $err_code -ne 0 ]; then return $err_code fi @@ -73,12 +76,12 @@ _prep_test() { _check_root local type=$1 shift 1 - echo "ublk $type: $@" + echo "ublk $type: $*" } _show_result() { - if [ $2 -ne 0 ]; then + if [ "$2" -ne 0 ]; then echo "$1 : [FAIL]" else echo "$1 : [PASS]" @@ -86,28 +89,32 @@ _show_result() } _cleanup_test() { - ${UBLK_PROG} del -n $1 + "${UBLK_PROG}" del -n "$1" } _add_ublk_dev() { - local kublk_temp=`mktemp /tmp/kublk-XXXXXX` - ${UBLK_PROG} add $@ > ${kublk_temp} 2>&1 - if [ $? -ne 0 ]; then - echo "fail to add ublk dev $@" - exit -1 + local kublk_temp; + local dev_id; + + kublk_temp=$(mktemp /tmp/kublk-XXXXXX) + if ! "${UBLK_PROG}" add "$@" > "${kublk_temp}" 2>&1; then + echo "fail to add ublk dev $*" + return 255 fi - local dev_id=`grep "dev id" ${kublk_temp} | awk -F '[ :]' '{print $3}'` + + dev_id=$(grep "dev id" "${kublk_temp}" | awk -F '[ :]' '{print $3}') udevadm settle - rm -f ${kublk_temp} - echo ${dev_id} + rm -f "${kublk_temp}" + echo "${dev_id}" } _have_feature() { - if $UBLK_PROG "features" | grep $1 > /dev/null 2>&1; then + if "$UBLK_PROG" "features" | grep "$1" > /dev/null 2>&1; then return 0 fi return 1 } -export UBLK_PROG=$(pwd)/kublk +UBLK_PROG=$(pwd)/kublk +export UBLK_PROG diff --git a/tools/testing/selftests/ublk/test_loop_01.sh b/tools/testing/selftests/ublk/test_loop_01.sh index 829e8df059423..1d3f934dca4c7 100755 --- a/tools/testing/selftests/ublk/test_loop_01.sh +++ b/tools/testing/selftests/ublk/test_loop_01.sh @@ -8,13 +8,13 @@ ERR_CODE=0 _prep_test "loop" "write and verify test" -backfile_0=`_create_backfile 256M` +backfile_0=$(_create_backfile 256M) -dev_id=`_add_ublk_dev -t loop $backfile_0` +dev_id=$(_add_ublk_dev -t loop "$backfile_0") # run fio over the ublk disk fio --name=write_and_verify \ - --filename=/dev/ublkb${dev_id} \ + --filename=/dev/ublkb"${dev_id}" \ --ioengine=libaio --iodepth=16 \ --rw=write \ --size=256M \ @@ -24,8 +24,8 @@ fio --name=write_and_verify \ --bs=4k > /dev/null 2>&1 ERR_CODE=$? -_cleanup_test ${dev_id} "loop" +_cleanup_test "${dev_id}" "loop" -_remove_backfile $backfile_0 +_remove_backfile "$backfile_0" _show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_loop_02.sh b/tools/testing/selftests/ublk/test_loop_02.sh index c71ae63059b80..df06b78048819 100755 --- a/tools/testing/selftests/ublk/test_loop_02.sh +++ b/tools/testing/selftests/ublk/test_loop_02.sh @@ -8,15 +8,15 @@ ERR_CODE=0 _prep_test "loop" "mkfs & mount & umount" -backfile_0=`_create_backfile 256M` +backfile_0=$(_create_backfile 256M) -dev_id=`_add_ublk_dev -t loop $backfile_0` +dev_id=$(_add_ublk_dev -t loop "$backfile_0") -_mkfs_mount_test /dev/ublkb${dev_id} +_mkfs_mount_test /dev/ublkb"${dev_id}" ERR_CODE=$? 
-_cleanup_test ${dev_id} "loop" +_cleanup_test "${dev_id}" "loop" -_remove_backfile $backfile_0 +_remove_backfile "$backfile_0" _show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_loop_03.sh b/tools/testing/selftests/ublk/test_loop_03.sh index e781ac6db6b4c..2255b4296590f 100755 --- a/tools/testing/selftests/ublk/test_loop_03.sh +++ b/tools/testing/selftests/ublk/test_loop_03.sh @@ -10,13 +10,13 @@ _have_feature "ZERO_COPY" || exit 4 _prep_test "loop" "write and verify over zero copy" -backfile_0=`_create_backfile 256M` +backfile_0=$(_create_backfile 256M) -dev_id=`_add_ublk_dev -t loop $backfile_0 -z` +dev_id=$(_add_ublk_dev -t loop -z "$backfile_0") # run fio over the ublk disk fio --name=write_and_verify \ - --filename=/dev/ublkb${dev_id} \ + --filename=/dev/ublkb"${dev_id}" \ --ioengine=libaio --iodepth=64 \ --rw=write \ --size=256M \ @@ -26,8 +26,8 @@ fio --name=write_and_verify \ --bs=4k > /dev/null 2>&1 ERR_CODE=$? -_cleanup_test ${dev_id} "loop" +_cleanup_test "${dev_id}" "loop" -_remove_backfile $backfile_0 +_remove_backfile "$backfile_0" _show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_loop_04.sh b/tools/testing/selftests/ublk/test_loop_04.sh index 6ab67247c809a..a797b25213ece 100755 --- a/tools/testing/selftests/ublk/test_loop_04.sh +++ b/tools/testing/selftests/ublk/test_loop_04.sh @@ -8,15 +8,15 @@ ERR_CODE=0 _prep_test "loop" "mkfs & mount & umount with zero copy" -backfile_0=`_create_backfile 256M` +backfile_0=$(_create_backfile 256M) -dev_id=`_add_ublk_dev -t loop -z $backfile_0` +dev_id=$(_add_ublk_dev -t loop -z "$backfile_0") -_mkfs_mount_test /dev/ublkb${dev_id} +_mkfs_mount_test /dev/ublkb"${dev_id}" ERR_CODE=$? -_cleanup_test ${dev_id} "loop" +_cleanup_test "${dev_id}" "loop" -_remove_backfile $backfile_0 +_remove_backfile "$backfile_0" _show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_null_01.sh b/tools/testing/selftests/ublk/test_null_01.sh index 04fc3ac7c716f..b048ddc4ae6f2 100755 --- a/tools/testing/selftests/ublk/test_null_01.sh +++ b/tools/testing/selftests/ublk/test_null_01.sh @@ -8,12 +8,12 @@ ERR_CODE=0 _prep_test "null" "basic IO test" -dev_id=`_add_ublk_dev -t null` +dev_id=$(_add_ublk_dev -t null) # run fio over the two disks -fio --name=job1 --filename=/dev/ublkb${dev_id} --ioengine=libaio --rw=readwrite --iodepth=32 --size=256M > /dev/null 2>&1 +fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio --rw=readwrite --iodepth=32 --size=256M > /dev/null 2>&1 ERR_CODE=$? -_cleanup_test ${dev_id} "null" +_cleanup_test "${dev_id}" "null" _show_result $TID $ERR_CODE From c83b089a70ec1e81fc7899aacf4de56a1872585a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 3 Mar 2025 20:43:16 +0800 Subject: [PATCH 087/108] selftests: ublk: don't pass ${dev_id} to _cleanup_test() More devices can be created in single tests, so simply remove all ublk devices in _cleanup_test(), meantime remove the ${dev_id} argument of _cleanup_test(). 
Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250303124324.3563605-7-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 2 +- tools/testing/selftests/ublk/test_loop_01.sh | 2 +- tools/testing/selftests/ublk/test_loop_02.sh | 2 +- tools/testing/selftests/ublk/test_loop_03.sh | 2 +- tools/testing/selftests/ublk/test_loop_04.sh | 2 +- tools/testing/selftests/ublk/test_null_01.sh | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 61044cb581381..d70690281d14b 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -89,7 +89,7 @@ _show_result() } _cleanup_test() { - "${UBLK_PROG}" del -n "$1" + "${UBLK_PROG}" del -a } _add_ublk_dev() { diff --git a/tools/testing/selftests/ublk/test_loop_01.sh b/tools/testing/selftests/ublk/test_loop_01.sh index 1d3f934dca4c7..48a85796ca436 100755 --- a/tools/testing/selftests/ublk/test_loop_01.sh +++ b/tools/testing/selftests/ublk/test_loop_01.sh @@ -24,7 +24,7 @@ fio --name=write_and_verify \ --bs=4k > /dev/null 2>&1 ERR_CODE=$? -_cleanup_test "${dev_id}" "loop" +_cleanup_test "loop" _remove_backfile "$backfile_0" diff --git a/tools/testing/selftests/ublk/test_loop_02.sh b/tools/testing/selftests/ublk/test_loop_02.sh index df06b78048819..0a4b5fadbc73a 100755 --- a/tools/testing/selftests/ublk/test_loop_02.sh +++ b/tools/testing/selftests/ublk/test_loop_02.sh @@ -15,7 +15,7 @@ dev_id=$(_add_ublk_dev -t loop "$backfile_0") _mkfs_mount_test /dev/ublkb"${dev_id}" ERR_CODE=$? -_cleanup_test "${dev_id}" "loop" +_cleanup_test "loop" _remove_backfile "$backfile_0" diff --git a/tools/testing/selftests/ublk/test_loop_03.sh b/tools/testing/selftests/ublk/test_loop_03.sh index 2255b4296590f..5a11356e502c3 100755 --- a/tools/testing/selftests/ublk/test_loop_03.sh +++ b/tools/testing/selftests/ublk/test_loop_03.sh @@ -26,7 +26,7 @@ fio --name=write_and_verify \ --bs=4k > /dev/null 2>&1 ERR_CODE=$? -_cleanup_test "${dev_id}" "loop" +_cleanup_test "loop" _remove_backfile "$backfile_0" diff --git a/tools/testing/selftests/ublk/test_loop_04.sh b/tools/testing/selftests/ublk/test_loop_04.sh index a797b25213ece..7e0d4dd8127e1 100755 --- a/tools/testing/selftests/ublk/test_loop_04.sh +++ b/tools/testing/selftests/ublk/test_loop_04.sh @@ -15,7 +15,7 @@ dev_id=$(_add_ublk_dev -t loop -z "$backfile_0") _mkfs_mount_test /dev/ublkb"${dev_id}" ERR_CODE=$? -_cleanup_test "${dev_id}" "loop" +_cleanup_test "loop" _remove_backfile "$backfile_0" diff --git a/tools/testing/selftests/ublk/test_null_01.sh b/tools/testing/selftests/ublk/test_null_01.sh index b048ddc4ae6f2..af11e73b7df6e 100755 --- a/tools/testing/selftests/ublk/test_null_01.sh +++ b/tools/testing/selftests/ublk/test_null_01.sh @@ -14,6 +14,6 @@ dev_id=$(_add_ublk_dev -t null) fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio --rw=readwrite --iodepth=32 --size=256M > /dev/null 2>&1 ERR_CODE=$? -_cleanup_test "${dev_id}" "null" +_cleanup_test "null" _show_result $TID $ERR_CODE From c2cb669a86c153d3449b672191ab253cdda8a295 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 3 Mar 2025 20:43:17 +0800 Subject: [PATCH 088/108] selftests: ublk: move zero copy feature check into _add_ublk_dev() Move zero copy feature check into _add_ublk_dev() since we will have more tests which requires to cover zero copy. 
Then one check function of _check_add_dev() has to be added for dealing with cleanup since '_add_ublk_dev()' is run in sub-shell, and we can't exit from it to terminal shell. Meantime always return error code from _add_ublk_dev(). Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250303124324.3563605-8-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 56 ++++++++++++++++---- tools/testing/selftests/ublk/test_loop_01.sh | 1 + tools/testing/selftests/ublk/test_loop_02.sh | 2 +- tools/testing/selftests/ublk/test_loop_03.sh | 4 +- tools/testing/selftests/ublk/test_loop_04.sh | 2 +- tools/testing/selftests/ublk/test_null_01.sh | 1 + 6 files changed, 50 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index d70690281d14b..40bf42f1bed25 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -1,6 +1,8 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +UBLK_SKIP_CODE=4 + _create_backfile() { local my_size=$1 local my_file @@ -79,12 +81,37 @@ _prep_test() { echo "ublk $type: $*" } +_remove_test_files() +{ + local files=$* + + for file in ${files}; do + [ -f "${file}" ] && rm -f "${file}" + done +} + _show_result() { - if [ "$2" -ne 0 ]; then - echo "$1 : [FAIL]" - else + if [ "$2" -eq 0 ]; then echo "$1 : [PASS]" + elif [ "$2" -eq 4 ]; then + echo "$1 : [SKIP]" + else + echo "$1 : [FAIL]" + fi + [ "$2" -ne 0 ] && exit "$2" + return 0 +} + +# don't call from sub-shell, otherwise can't exit +_check_add_dev() +{ + local tid=$1 + local code=$2 + shift 2 + if [ "${code}" -ne 0 ]; then + _remove_test_files "$@" + _show_result "${tid}" "${code}" fi } @@ -92,13 +119,28 @@ _cleanup_test() { "${UBLK_PROG}" del -a } +_have_feature() +{ + if $UBLK_PROG "features" | grep "$1" > /dev/null 2>&1; then + return 0 + fi + return 1 +} + _add_ublk_dev() { local kublk_temp; local dev_id; + if echo "$@" | grep -q "\-z"; then + if ! _have_feature "ZERO_COPY"; then + return ${UBLK_SKIP_CODE} + fi + fi + kublk_temp=$(mktemp /tmp/kublk-XXXXXX) if ! "${UBLK_PROG}" add "$@" > "${kublk_temp}" 2>&1; then echo "fail to add ublk dev $*" + rm -f "${kublk_temp}" return 255 fi @@ -108,13 +150,5 @@ _add_ublk_dev() { echo "${dev_id}" } -_have_feature() -{ - if "$UBLK_PROG" "features" | grep "$1" > /dev/null 2>&1; then - return 0 - fi - return 1 -} - UBLK_PROG=$(pwd)/kublk export UBLK_PROG diff --git a/tools/testing/selftests/ublk/test_loop_01.sh b/tools/testing/selftests/ublk/test_loop_01.sh index 48a85796ca436..12bba9e5daa59 100755 --- a/tools/testing/selftests/ublk/test_loop_01.sh +++ b/tools/testing/selftests/ublk/test_loop_01.sh @@ -11,6 +11,7 @@ _prep_test "loop" "write and verify test" backfile_0=$(_create_backfile 256M) dev_id=$(_add_ublk_dev -t loop "$backfile_0") +_check_add_dev $TID $? "${backfile_0}" # run fio over the ublk disk fio --name=write_and_verify \ diff --git a/tools/testing/selftests/ublk/test_loop_02.sh b/tools/testing/selftests/ublk/test_loop_02.sh index 0a4b5fadbc73a..9a163296ac83f 100755 --- a/tools/testing/selftests/ublk/test_loop_02.sh +++ b/tools/testing/selftests/ublk/test_loop_02.sh @@ -9,8 +9,8 @@ ERR_CODE=0 _prep_test "loop" "mkfs & mount & umount" backfile_0=$(_create_backfile 256M) - dev_id=$(_add_ublk_dev -t loop "$backfile_0") +_check_add_dev $TID $? "$backfile_0" _mkfs_mount_test /dev/ublkb"${dev_id}" ERR_CODE=$? 
diff --git a/tools/testing/selftests/ublk/test_loop_03.sh b/tools/testing/selftests/ublk/test_loop_03.sh index 5a11356e502c3..72a1d072cfbd9 100755 --- a/tools/testing/selftests/ublk/test_loop_03.sh +++ b/tools/testing/selftests/ublk/test_loop_03.sh @@ -6,13 +6,11 @@ TID="loop_03" ERR_CODE=0 -_have_feature "ZERO_COPY" || exit 4 - _prep_test "loop" "write and verify over zero copy" backfile_0=$(_create_backfile 256M) - dev_id=$(_add_ublk_dev -t loop -z "$backfile_0") +_check_add_dev $TID $? "$backfile_0" # run fio over the ublk disk fio --name=write_and_verify \ diff --git a/tools/testing/selftests/ublk/test_loop_04.sh b/tools/testing/selftests/ublk/test_loop_04.sh index 7e0d4dd8127e1..676c4652d758d 100755 --- a/tools/testing/selftests/ublk/test_loop_04.sh +++ b/tools/testing/selftests/ublk/test_loop_04.sh @@ -9,8 +9,8 @@ ERR_CODE=0 _prep_test "loop" "mkfs & mount & umount with zero copy" backfile_0=$(_create_backfile 256M) - dev_id=$(_add_ublk_dev -t loop -z "$backfile_0") +_check_add_dev $TID $? "$backfile_0" _mkfs_mount_test /dev/ublkb"${dev_id}" ERR_CODE=$? diff --git a/tools/testing/selftests/ublk/test_null_01.sh b/tools/testing/selftests/ublk/test_null_01.sh index af11e73b7df6e..e2847a50823a8 100755 --- a/tools/testing/selftests/ublk/test_null_01.sh +++ b/tools/testing/selftests/ublk/test_null_01.sh @@ -9,6 +9,7 @@ ERR_CODE=0 _prep_test "null" "basic IO test" dev_id=$(_add_ublk_dev -t null) +_check_add_dev $TID $? # run fio over the two disks fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio --rw=readwrite --iodepth=32 --size=256M > /dev/null 2>&1 From 87a9265213c390d9d6fa02f0a1ee0110c2bba866 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 3 Mar 2025 20:43:18 +0800 Subject: [PATCH 089/108] selftests: ublk: load/unload ublk_drv when preparing & cleaning up tests Load ublk_drv module in _prep_test(), and unload it in _cleanup_test(), so that test can always be done in consistent state. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250303124324.3563605-9-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 40bf42f1bed25..bcb0c7aa3956e 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -64,6 +64,7 @@ _check_root() { _remove_ublk_devices() { ${UBLK_PROG} del -a + modprobe -r ublk_drv } _get_ublk_dev_state() { @@ -78,6 +79,7 @@ _prep_test() { _check_root local type=$1 shift 1 + modprobe ublk_drv echo "ublk $type: $*" } @@ -131,6 +133,9 @@ _add_ublk_dev() { local kublk_temp; local dev_id; + if [ ! -c /dev/ublk-control ]; then + return ${UBLK_SKIP_CODE} + fi if echo "$@" | grep -q "\-z"; then if ! _have_feature "ZERO_COPY"; then return ${UBLK_SKIP_CODE} From c60ac48eab6107445ea88f13f64af3d0766f61d8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 3 Mar 2025 20:43:19 +0800 Subject: [PATCH 090/108] selftests: ublk: add one stress test for covering IO vs. removing device Add stress_test_01 for running IO vs. removing device for verifying that ublk device removal can work as expected when heavy IO workloads are in progress. null, loop and loop/zc are covered in this tests. 
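For reference, the new test can also be run on its own from a kernel source tree
(illustrative invocation only; it needs root and the kublk binary built in the
test directory):

	make -C tools/testing/selftests/ublk
	cd tools/testing/selftests/ublk
	sudo ./test_stress_01.sh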
Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250303124324.3563605-10-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 2 + tools/testing/selftests/ublk/test_common.sh | 26 ++++++++++ .../testing/selftests/ublk/test_stress_01.sh | 47 +++++++++++++++++++ 3 files changed, 75 insertions(+) create mode 100755 tools/testing/selftests/ublk/test_stress_01.sh diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 9415f6f6df484..40af938cd277a 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -9,6 +9,8 @@ TEST_PROGS += test_loop_02.sh TEST_PROGS += test_loop_03.sh TEST_PROGS += test_loop_04.sh +TEST_PROGS += test_stress_01.sh + TEST_GEN_PROGS_EXTENDED = kublk include ../lib.mk diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index bcb0c7aa3956e..89244a7e275c2 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -155,5 +155,31 @@ _add_ublk_dev() { echo "${dev_id}" } +__remove_ublk_dev_return() { + local dev_id=$1 + + ${UBLK_PROG} del -n "${dev_id}" + local res=$? + udevadm settle + return ${res} +} + +__run_io_and_remove() +{ + local dev_id=$1 + local size=$2 + + fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio \ + --rw=readwrite --iodepth=64 --size="${size}" --numjobs=4 \ + --runtime=20 --time_based > /dev/null 2>&1 & + sleep 2 + if ! __remove_ublk_dev_return "${dev_id}"; then + echo "delete dev ${dev_id} failed" + return 255 + fi + wait +} + + UBLK_PROG=$(pwd)/kublk export UBLK_PROG diff --git a/tools/testing/selftests/ublk/test_stress_01.sh b/tools/testing/selftests/ublk/test_stress_01.sh new file mode 100755 index 0000000000000..2dfd01cfd2659 --- /dev/null +++ b/tools/testing/selftests/ublk/test_stress_01.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. test_common.sh +TID="stress_01" +ERR_CODE=0 +DEV_ID=-1 + +ublk_io_and_remove() +{ + local size=$1 + shift 1 + local backfile="" + if echo "$@" | grep -q "loop"; then + backfile=${*: -1} + fi + DEV_ID=$(_add_ublk_dev "$@") + _check_add_dev $TID $? "${backfile}" + + echo "run ublk IO vs. remove device(ublk add $*)" + if ! __run_io_and_remove "${DEV_ID}" "${size}"; then + echo "/dev/ublkc${DEV_ID} isn't removed" + _remove_backfile "${backfile}" + exit 255 + fi +} + +_prep_test "stress" "run IO and remove device" + +ublk_io_and_remove 8G -t null +ERR_CODE=$? +if [ ${ERR_CODE} -ne 0 ]; then + _show_result $TID $ERR_CODE +fi + +BACK_FILE=$(_create_backfile 256M) +ublk_io_and_remove 256M -t loop "${BACK_FILE}" +ERR_CODE=$? +if [ ${ERR_CODE} -ne 0 ]; then + _show_result $TID $ERR_CODE +fi + +ublk_io_and_remove 256M -t loop -z "${BACK_FILE}" +ERR_CODE=$? +_cleanup_test "stress" +_remove_backfile "${BACK_FILE}" +_show_result $TID $ERR_CODE From af83ccc7db3a3cb48bc8e9f622500a303e06e3c9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 3 Mar 2025 20:43:20 +0800 Subject: [PATCH 091/108] selftests: ublk: add stress test for covering IO vs. killing ublk server Add stress_test_01 for running IO vs. killing ublk server, so io_uring exit & cancel code path can be covered, same with ublk's cancel code path. Especially IO buffer lifetime is one big thing for ublk zero copy, the added test can verify if this area works as expected. 
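Manually, the covered scenario boils down to the following sketch (hypothetical
commands built from the helpers in test_common.sh; the real logic is added to
__ublk_kill_daemon()/__run_io_and_remove() below):

	dev_id=$(_add_ublk_dev -t loop -z "$backfile_0")
	fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio \
		--rw=readwrite --iodepth=64 --size=256M --runtime=20 --time_based &
	sleep 2
	kill -9 "$(_get_ublk_daemon_pid "${dev_id}")"	# ublk server dies while IO is inflight
	"${UBLK_PROG}" del -n "${dev_id}"		# device must still be removable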
Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250303124324.3563605-11-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 1 + tools/testing/selftests/ublk/test_common.sh | 29 ++++++++++++ .../testing/selftests/ublk/test_stress_01.sh | 2 +- .../testing/selftests/ublk/test_stress_02.sh | 47 +++++++++++++++++++ 4 files changed, 78 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/ublk/test_stress_02.sh diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 40af938cd277a..5d8d5939f051e 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -10,6 +10,7 @@ TEST_PROGS += test_loop_03.sh TEST_PROGS += test_loop_04.sh TEST_PROGS += test_stress_01.sh +TEST_PROGS += test_stress_02.sh TEST_GEN_PROGS_EXTENDED = kublk diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 89244a7e275c2..92596d0d00130 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -155,6 +155,26 @@ _add_ublk_dev() { echo "${dev_id}" } +# kill the ublk daemon and return ublk device state +__ublk_kill_daemon() +{ + local dev_id=$1 + local exp_state=$2 + local daemon_pid + local state + + daemon_pid=$(_get_ublk_daemon_pid "${dev_id}") + state=$(_get_ublk_dev_state "${dev_id}") + + for ((j=0;j<50;j++)); do + [ "$state" == "$exp_state" ] && break + kill -9 "$daemon_pid" > /dev/null 2>&1 + sleep 1 + state=$(_get_ublk_dev_state "${dev_id}") + done + echo "$state" +} + __remove_ublk_dev_return() { local dev_id=$1 @@ -168,11 +188,20 @@ __run_io_and_remove() { local dev_id=$1 local size=$2 + local kill_server=$3 fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio \ --rw=readwrite --iodepth=64 --size="${size}" --numjobs=4 \ --runtime=20 --time_based > /dev/null 2>&1 & sleep 2 + if [ "${kill_server}" = "yes" ]; then + local state + state=$(__ublk_kill_daemon "${dev_id}" "DEAD") + if [ "$state" != "DEAD" ]; then + echo "device isn't dead($state) after killing daemon" + return 255 + fi + fi if ! __remove_ublk_dev_return "${dev_id}"; then echo "delete dev ${dev_id} failed" return 255 diff --git a/tools/testing/selftests/ublk/test_stress_01.sh b/tools/testing/selftests/ublk/test_stress_01.sh index 2dfd01cfd2659..c1cdde3e79f76 100755 --- a/tools/testing/selftests/ublk/test_stress_01.sh +++ b/tools/testing/selftests/ublk/test_stress_01.sh @@ -18,7 +18,7 @@ ublk_io_and_remove() _check_add_dev $TID $? "${backfile}" echo "run ublk IO vs. remove device(ublk add $*)" - if ! __run_io_and_remove "${DEV_ID}" "${size}"; then + if ! __run_io_and_remove "${DEV_ID}" "${size}" "no"; then echo "/dev/ublkc${DEV_ID} isn't removed" _remove_backfile "${backfile}" exit 255 diff --git a/tools/testing/selftests/ublk/test_stress_02.sh b/tools/testing/selftests/ublk/test_stress_02.sh new file mode 100755 index 0000000000000..ec758f649a97c --- /dev/null +++ b/tools/testing/selftests/ublk/test_stress_02.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. test_common.sh +TID="stress_02" +ERR_CODE=0 +DEV_ID=-1 + +ublk_io_and_kill_daemon() +{ + local size=$1 + shift 1 + local backfile="" + if echo "$@" | grep -q "loop"; then + backfile=${*: -1} + fi + DEV_ID=$(_add_ublk_dev "$@") + _check_add_dev $TID $? "${backfile}" + + echo "run ublk IO vs kill ublk server(ublk add $*)" + if ! 
__run_io_and_remove "${DEV_ID}" "${size}" "yes"; then + echo "/dev/ublkc${DEV_ID} isn't removed res ${res}" + _remove_backfile "${backfile}" + exit 255 + fi +} + +_prep_test "stress" "run IO and kill ublk server" + +ublk_io_and_kill_daemon 8G -t null +ERR_CODE=$? +if [ ${ERR_CODE} -ne 0 ]; then + _show_result $TID $ERR_CODE +fi + +BACK_FILE=$(_create_backfile 256M) +ublk_io_and_kill_daemon 256M -t loop "${BACK_FILE}" +ERR_CODE=$? +if [ ${ERR_CODE} -ne 0 ]; then + _show_result $TID $ERR_CODE +fi + +ublk_io_and_kill_daemon 256M -t loop -z "${BACK_FILE}" +ERR_CODE=$? +_cleanup_test "stress" +_remove_backfile "${BACK_FILE}" +_show_result $TID $ERR_CODE From 390174c91d2d67297cfc4faa4d4bf5a8a090ac33 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 3 Mar 2025 20:43:21 +0800 Subject: [PATCH 092/108] selftests: ublk: improve test usability Add UBLK_TEST_QUIET, so we can print test result(PASS/SKIP/FAIL) only. Also always run from test script's current directory, then the same test script can be started from other work directory. This way helps a lot to reuse this test source code and scripts for other projects(liburing, blktests, ...) Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250303124324.3563605-12-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 10 ++++++++-- tools/testing/selftests/ublk/test_loop_01.sh | 2 +- tools/testing/selftests/ublk/test_loop_02.sh | 2 +- tools/testing/selftests/ublk/test_loop_03.sh | 2 +- tools/testing/selftests/ublk/test_loop_04.sh | 2 +- tools/testing/selftests/ublk/test_null_01.sh | 2 +- tools/testing/selftests/ublk/test_stress_01.sh | 4 ++-- tools/testing/selftests/ublk/test_stress_02.sh | 4 ++-- 8 files changed, 17 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 92596d0d00130..350380facd9ff 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -80,7 +80,7 @@ _prep_test() { local type=$1 shift 1 modprobe ublk_drv - echo "ublk $type: $*" + [ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*" } _remove_test_files() @@ -209,6 +209,12 @@ __run_io_and_remove() wait } +_ublk_test_top_dir() +{ + cd "$(dirname "$0")" && pwd +} -UBLK_PROG=$(pwd)/kublk +UBLK_PROG=$(_ublk_test_top_dir)/kublk +UBLK_TEST_QUIET=1 export UBLK_PROG +export UBLK_TEST_QUIET diff --git a/tools/testing/selftests/ublk/test_loop_01.sh b/tools/testing/selftests/ublk/test_loop_01.sh index 12bba9e5daa59..c882d2a08e130 100755 --- a/tools/testing/selftests/ublk/test_loop_01.sh +++ b/tools/testing/selftests/ublk/test_loop_01.sh @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -. test_common.sh +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh TID="loop_01" ERR_CODE=0 diff --git a/tools/testing/selftests/ublk/test_loop_02.sh b/tools/testing/selftests/ublk/test_loop_02.sh index 9a163296ac83f..03863d825e07d 100755 --- a/tools/testing/selftests/ublk/test_loop_02.sh +++ b/tools/testing/selftests/ublk/test_loop_02.sh @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -. test_common.sh +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh TID="loop_02" ERR_CODE=0 diff --git a/tools/testing/selftests/ublk/test_loop_03.sh b/tools/testing/selftests/ublk/test_loop_03.sh index 72a1d072cfbd9..269c96787d7d3 100755 --- a/tools/testing/selftests/ublk/test_loop_03.sh +++ b/tools/testing/selftests/ublk/test_loop_03.sh @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -. 
test_common.sh +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh TID="loop_03" ERR_CODE=0 diff --git a/tools/testing/selftests/ublk/test_loop_04.sh b/tools/testing/selftests/ublk/test_loop_04.sh index 676c4652d758d..1435422c38ec8 100755 --- a/tools/testing/selftests/ublk/test_loop_04.sh +++ b/tools/testing/selftests/ublk/test_loop_04.sh @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -. test_common.sh +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh TID="loop_04" ERR_CODE=0 diff --git a/tools/testing/selftests/ublk/test_null_01.sh b/tools/testing/selftests/ublk/test_null_01.sh index e2847a50823a8..a34203f726685 100755 --- a/tools/testing/selftests/ublk/test_null_01.sh +++ b/tools/testing/selftests/ublk/test_null_01.sh @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -. test_common.sh +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh TID="null_01" ERR_CODE=0 diff --git a/tools/testing/selftests/ublk/test_stress_01.sh b/tools/testing/selftests/ublk/test_stress_01.sh index c1cdde3e79f76..7177f6c57bc5b 100755 --- a/tools/testing/selftests/ublk/test_stress_01.sh +++ b/tools/testing/selftests/ublk/test_stress_01.sh @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -. test_common.sh +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh TID="stress_01" ERR_CODE=0 DEV_ID=-1 @@ -17,7 +17,7 @@ ublk_io_and_remove() DEV_ID=$(_add_ublk_dev "$@") _check_add_dev $TID $? "${backfile}" - echo "run ublk IO vs. remove device(ublk add $*)" + [ "$UBLK_TEST_QUIET" -eq 0 ] && echo "run ublk IO vs. remove device(ublk add $*)" if ! __run_io_and_remove "${DEV_ID}" "${size}" "no"; then echo "/dev/ublkc${DEV_ID} isn't removed" _remove_backfile "${backfile}" diff --git a/tools/testing/selftests/ublk/test_stress_02.sh b/tools/testing/selftests/ublk/test_stress_02.sh index ec758f649a97c..2a8e60579a064 100755 --- a/tools/testing/selftests/ublk/test_stress_02.sh +++ b/tools/testing/selftests/ublk/test_stress_02.sh @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -. test_common.sh +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh TID="stress_02" ERR_CODE=0 DEV_ID=-1 @@ -17,7 +17,7 @@ ublk_io_and_kill_daemon() DEV_ID=$(_add_ublk_dev "$@") _check_add_dev $TID $? "${backfile}" - echo "run ublk IO vs kill ublk server(ublk add $*)" + [ "$UBLK_TEST_QUIET" -eq 0 ] && echo "run ublk IO vs kill ublk server(ublk add $*)" if ! __run_io_and_remove "${DEV_ID}" "${size}" "yes"; then echo "/dev/ublkc${DEV_ID} isn't removed res ${res}" _remove_backfile "${backfile}" From 334f795ff8fc061db448d205a252880a19d7c045 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 10 Mar 2025 11:48:25 -0700 Subject: [PATCH 093/108] Revert "io_uring/rsrc: simplify the bvec iter count calculation" This reverts commit 2a51c327d4a4a2eb62d67f4ea13a17efd0f25c5c. The kernel registered bvecs do use the iov_iter_advance() API, so we can't rely on this simplification anymore. Fixes: 27cb27b6d5ea40 ("io_uring: add support for kernel registered bvecs") Reported-by: Caleb Sander Mateos Signed-off-by: Keith Busch Reviewed-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250310184825.569371-1-kbusch@meta.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 5dd1e08275594..5fff6ba2b7c05 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1024,7 +1024,7 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, * and advance us to the beginning. 
*/ offset = buf_addr - imu->ubuf; - iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, len); + iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); if (offset) { /* @@ -1051,6 +1051,7 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, * to use the slow iter advance. */ if (offset < bvec->bv_len) { + iter->count -= offset; iter->iov_offset = offset; } else if (imu->is_kbuf) { iov_iter_advance(iter, offset); @@ -1063,6 +1064,7 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, iter->bvec += seg_skip; iter->nr_segs -= seg_skip; + iter->count -= bvec->bv_len + offset; iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1); } } From cf9536e550dd243a1681fdbf804221527da20a80 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Mar 2025 14:01:49 -0600 Subject: [PATCH 094/108] io_uring/kbuf: enable bundles for incrementally consumed buffers The original support for incrementally consumed buffers didn't allow it to be used with bundles, with the assumption being that incremental buffers are generally larger, and hence there's less of a nedd to support it. But that assumption may not be correct - it's perfectly viable to use smaller buffers with incremental consumption, and there may be valid reasons for an application or framework to do so. As there's really no need to explicitly disable bundles with incrementally consumed buffers, allow it. This actually makes the peek side cheaper and simpler, with the completion side basically the same, just needing to iterate for the consumed length. Reported-by: Norman Maurer Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 56 +++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 3478be6d02abb..0981092596713 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -32,6 +32,25 @@ struct io_provide_buf { __u16 bid; }; +static bool io_kbuf_inc_commit(struct io_buffer_list *bl, int len) +{ + while (len) { + struct io_uring_buf *buf; + u32 this_len; + + buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask); + this_len = min_t(int, len, buf->len); + buf->len -= this_len; + if (buf->len) { + buf->addr += this_len; + return false; + } + bl->head++; + len -= this_len; + } + return true; +} + bool io_kbuf_commit(struct io_kiocb *req, struct io_buffer_list *bl, int len, int nr) { @@ -42,20 +61,8 @@ bool io_kbuf_commit(struct io_kiocb *req, if (unlikely(len < 0)) return true; - - if (bl->flags & IOBL_INC) { - struct io_uring_buf *buf; - - buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask); - if (WARN_ON_ONCE(len > buf->len)) - len = buf->len; - buf->len -= len; - if (buf->len) { - buf->addr += len; - return false; - } - } - + if (bl->flags & IOBL_INC) + return io_kbuf_inc_commit(bl, len); bl->head += nr; return true; } @@ -226,25 +233,14 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, buf = io_ring_head_to_buf(br, head, bl->mask); if (arg->max_len) { u32 len = READ_ONCE(buf->len); + size_t needed; if (unlikely(!len)) return -ENOBUFS; - /* - * Limit incremental buffers to 1 segment. No point trying - * to peek ahead and map more than we need, when the buffers - * themselves should be large when setup with - * IOU_PBUF_RING_INC. 
- */ - if (bl->flags & IOBL_INC) { - nr_avail = 1; - } else { - size_t needed; - - needed = (arg->max_len + len - 1) / len; - needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT); - if (nr_avail > needed) - nr_avail = needed; - } + needed = (arg->max_len + len - 1) / len; + needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT); + if (nr_avail > needed) + nr_avail = needed; } /* From 8764c1a72bd5019727a451b5ed5e50b0ae5fbb5f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 20 Mar 2025 09:37:33 +0800 Subject: [PATCH 095/108] selftests: ublk: add one dependency header Add one dependency helper which can include new uapi definition which isn't synced from kernel. This way also helps a lot for downstream test deployment. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250320013743.4167489-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.h | 1 + tools/testing/selftests/ublk/ublk_dep.h | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) create mode 100644 tools/testing/selftests/ublk/ublk_dep.h diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 26d9aa9c5ca29..3ff9ac5104a70 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -21,6 +21,7 @@ #include #include #include +#include "ublk_dep.h" #define __maybe_unused __attribute__((unused)) #define MAX_BACK_FILES 4 diff --git a/tools/testing/selftests/ublk/ublk_dep.h b/tools/testing/selftests/ublk/ublk_dep.h new file mode 100644 index 0000000000000..f68fa7eac9397 --- /dev/null +++ b/tools/testing/selftests/ublk/ublk_dep.h @@ -0,0 +1,18 @@ +#ifndef UBLK_DEP_H +#define UBLK_DEP_H + +#ifndef UBLK_U_IO_REGISTER_IO_BUF +#define UBLK_U_IO_REGISTER_IO_BUF \ + _IOWR('u', 0x23, struct ublksrv_io_cmd) +#define UBLK_U_IO_UNREGISTER_IO_BUF \ + _IOWR('u', 0x24, struct ublksrv_io_cmd) +#endif + +#ifndef UBLK_F_USER_RECOVERY_FAIL_IO +#define UBLK_F_USER_RECOVERY_FAIL_IO (1ULL << 9) +#endif + +#ifndef UBLK_F_ZONED +#define UBLK_F_ZONED (1ULL << 8) +#endif +#endif From fe2230d9216093c984f75594aee97811faa2d59e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 20 Mar 2025 09:37:34 +0800 Subject: [PATCH 096/108] selftests: ublk: don't show `modprobe` failure ublk_drv may be built-in, so don't show modprobe failure, and we do check `/dev/ublk-control` for skipping test if ublk_drv isn't enabled. 
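The intended pattern is roughly the following (sketch of the intent only, the
actual hunks follow):

	modprobe ublk_drv > /dev/null 2>&1	# may fail when ublk_drv is built-in
	[ -c /dev/ublk-control ] || exit "$UBLK_SKIP_CODE"	# skip only if ublk is really unavailable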
Reported-by: Jens Axboe Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250320013743.4167489-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 350380facd9ff..c86363c5cc7ea 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -64,7 +64,7 @@ _check_root() { _remove_ublk_devices() { ${UBLK_PROG} del -a - modprobe -r ublk_drv + modprobe -r ublk_drv > /dev/null 2>&1 } _get_ublk_dev_state() { @@ -79,7 +79,7 @@ _prep_test() { _check_root local type=$1 shift 1 - modprobe ublk_drv + modprobe ublk_drv > /dev/null 2>&1 [ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*" } From beb31982ad6b77249bf8535e71da2629af92b458 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 20 Mar 2025 09:37:35 +0800 Subject: [PATCH 097/108] selftests: ublk: add variable for user to not show test result Some user decides test result by exit code only, and wouldn't like to be bothered by the test result. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250320013743.4167489-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index c86363c5cc7ea..48fca609e741a 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -94,12 +94,14 @@ _remove_test_files() _show_result() { - if [ "$2" -eq 0 ]; then - echo "$1 : [PASS]" - elif [ "$2" -eq 4 ]; then - echo "$1 : [SKIP]" - else - echo "$1 : [FAIL]" + if [ "$UBLK_TEST_SHOW_RESULT" -ne 0 ]; then + if [ "$2" -eq 0 ]; then + echo "$1 : [PASS]" + elif [ "$2" -eq 4 ]; then + echo "$1 : [SKIP]" + else + echo "$1 : [FAIL]" + fi fi [ "$2" -ne 0 ] && exit "$2" return 0 @@ -216,5 +218,7 @@ _ublk_test_top_dir() UBLK_PROG=$(_ublk_test_top_dir)/kublk UBLK_TEST_QUIET=1 +UBLK_TEST_SHOW_RESULT=1 export UBLK_PROG export UBLK_TEST_QUIET +export UBLK_TEST_SHOW_RESULT From 96af5af47b5407972689929543c73a39b477c8ba Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 21 Mar 2025 08:47:58 +0800 Subject: [PATCH 098/108] selftests: ublk: fix write cache implementation For loop target, write cache isn't enabled, and each write isn't be marked as DSYNC too. Fix it by enabling write cache, meantime fix FLUSH implementation by not taking LBA range into account, and there isn't such info for FLUSH command. 
Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250321004758.152572-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/file_backed.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index 38e68b414962d..8a07356eccaf6 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -123,10 +123,7 @@ static int loop_queue_tgt_io(struct ublk_queue *q, int tag) sqe = ublk_queue_alloc_sqe(q); if (!sqe) return -ENOMEM; - io_uring_prep_sync_file_range(sqe, 1 /*fds[1]*/, - iod->nr_sectors << 9, - iod->start_sector << 9, - IORING_FSYNC_DATASYNC); + io_uring_prep_fsync(sqe, 1 /*fds[1]*/, IORING_FSYNC_DATASYNC); io_uring_sqe_set_flags(sqe, IOSQE_FIXED_FILE); q->io_inflight++; sqe->user_data = build_user_data(tag, ublk_op, UBLK_IO_TGT_NORMAL, 1); @@ -187,6 +184,7 @@ static int ublk_loop_tgt_init(struct ublk_dev *dev) struct ublk_params p = { .types = UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DMA_ALIGN, .basic = { + .attrs = UBLK_ATTR_VOLATILE_CACHE, .logical_bs_shift = 9, .physical_bs_shift = 12, .io_opt_shift = 12, From 07754bfd9aee59063f8549f6e4d455eae636ecc7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 14 Mar 2025 10:19:01 -0600 Subject: [PATCH 099/108] io_uring: enable toggle of iowait usage when waiting on CQEs By default, io_uring marks a waiting task as being in iowait, if it's sleeping waiting on events and there are pending requests. This isn't necessarily always useful, and may be confusing on non-storage setups where iowait isn't expected. It can also cause extra power usage, by preventing the CPU from entering lower sleep states. This adds a new enter flag, IORING_ENTER_NO_IOWAIT. If set, then io_uring will not account the sleeping task as being in iowait. If the kernel supports this feature, then it will be marked by having the IORING_FEAT_NO_IOWAIT feature flag set. As the kernel currently does not support separating the iowait accounting and CPU frequency boosting, the IORING_ENTER_NO_IOWAIT controls both of these at the same time. In the future, if those do end up being split, then it'd be possible to control them separately. However, it seems more likely that the kernel will decouple iowait and CPU frequency boosting anyway. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 ++ io_uring/io_uring.c | 32 +++++++++++++++++++------------- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 050fa8eb2e8f8..0d6c83c8d1cf6 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -541,6 +541,7 @@ struct io_cqring_offsets { #define IORING_ENTER_REGISTERED_RING (1U << 4) #define IORING_ENTER_ABS_TIMER (1U << 5) #define IORING_ENTER_EXT_ARG_REG (1U << 6) +#define IORING_ENTER_NO_IOWAIT (1U << 7) /* * Passed in for io_uring_setup(2). 
Copied back with updated info on success @@ -578,6 +579,7 @@ struct io_uring_params { #define IORING_FEAT_RECVSEND_BUNDLE (1U << 14) #define IORING_FEAT_MIN_TIMEOUT (1U << 15) #define IORING_FEAT_RW_ATTR (1U << 16) +#define IORING_FEAT_NO_IOWAIT (1U << 17) /* * io_uring_register(2) opcodes and arguments diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 58003fa6b327b..d975e68e91f2a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2485,8 +2485,18 @@ static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0; } +struct ext_arg { + size_t argsz; + struct timespec64 ts; + const sigset_t __user *sig; + ktime_t min_time; + bool ts_set; + bool iowait; +}; + static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, + struct ext_arg *ext_arg, ktime_t start_time) { int ret = 0; @@ -2496,7 +2506,7 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, * can take into account that the task is waiting for IO - turns out * to be important for low QD IO. */ - if (current_pending_io()) + if (ext_arg->iowait && current_pending_io()) current->in_iowait = 1; if (iowq->timeout != KTIME_MAX || iowq->min_timeout) ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time); @@ -2509,6 +2519,7 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, /* If this returns > 0, the caller should retry */ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, + struct ext_arg *ext_arg, ktime_t start_time) { if (unlikely(READ_ONCE(ctx->check_cq))) @@ -2522,17 +2533,9 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, if (unlikely(io_should_wake(iowq))) return 0; - return __io_cqring_wait_schedule(ctx, iowq, start_time); + return __io_cqring_wait_schedule(ctx, iowq, ext_arg, start_time); } -struct ext_arg { - size_t argsz; - struct timespec64 ts; - const sigset_t __user *sig; - ktime_t min_time; - bool ts_set; -}; - /* * Wait until events become available, if we don't already have some. The * application must reap them itself, as they reside on the shared cq ring. @@ -2610,7 +2613,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, TASK_INTERRUPTIBLE); } - ret = io_cqring_wait_schedule(ctx, &iowq, start_time); + ret = io_cqring_wait_schedule(ctx, &iowq, ext_arg, start_time); __set_current_state(TASK_RUNNING); atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); @@ -3261,6 +3264,8 @@ static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned flags, const struct io_uring_getevents_arg __user *uarg = argp; struct io_uring_getevents_arg arg; + ext_arg->iowait = !(flags & IORING_ENTER_NO_IOWAIT); + /* * If EXT_ARG isn't set, then we have no timespec and the argp pointer * is just a pointer to the sigset_t. 
@@ -3338,7 +3343,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | IORING_ENTER_REGISTERED_RING | IORING_ENTER_ABS_TIMER | - IORING_ENTER_EXT_ARG_REG))) + IORING_ENTER_EXT_ARG_REG | + IORING_ENTER_NO_IOWAIT))) return -EINVAL; /* @@ -3752,7 +3758,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING | IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT | - IORING_FEAT_RW_ATTR; + IORING_FEAT_RW_ATTR | IORING_FEAT_NO_IOWAIT; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; From ffde32a49a145a27af07e6acfb0c1d83c26479c4 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 21 Mar 2025 21:53:24 +0800 Subject: [PATCH 100/108] selftests: ublk: fix starting ublk device Firstly ublk char device node may not be created by udev yet, so wait a while until it can be opened or timeout. Secondly delete created ublk device in case of start failure, otherwise the device becomes zombie. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250321135324.259677-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/file_backed.c | 4 ++- tools/testing/selftests/ublk/kublk.c | 30 ++++++++++++++-------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index 8a07356eccaf6..570a5158b6655 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -196,11 +196,13 @@ static int ublk_loop_tgt_init(struct ublk_dev *dev) }, }; - assert(dev->tgt.nr_backing_files == 1); ret = backing_file_tgt_init(dev); if (ret) return ret; + if (dev->tgt.nr_backing_files != 1) + return -EINVAL; + bytes = dev->tgt.backing_file_size[0]; dev->tgt.dev_size = bytes; p.basic.dev_sectors = bytes >> 9; diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 148355717ee72..11005a87bcfaa 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -379,26 +379,34 @@ static int ublk_queue_init(struct ublk_queue *q) return -ENOMEM; } +#define WAIT_USEC 100000 +#define MAX_WAIT_USEC (3 * 1000000) static int ublk_dev_prep(struct ublk_dev *dev) { int dev_id = dev->dev_info.dev_id; + unsigned int wait_usec = 0; + int ret = 0, fd = -1; char buf[64]; - int ret = 0; snprintf(buf, 64, "%s%d", UBLKC_DEV, dev_id); - dev->fds[0] = open(buf, O_RDWR); - if (dev->fds[0] < 0) { - ret = -EBADF; - ublk_err("can't open %s, ret %d\n", buf, dev->fds[0]); - goto fail; + + while (wait_usec < MAX_WAIT_USEC) { + fd = open(buf, O_RDWR); + if (fd >= 0) + break; + usleep(WAIT_USEC); + wait_usec += WAIT_USEC; + } + if (fd < 0) { + ublk_err("can't open %s %s\n", buf, strerror(errno)); + return -1; } + dev->fds[0] = fd; if (dev->tgt.ops->init_tgt) ret = dev->tgt.ops->init_tgt(dev); - - return ret; -fail: - close(dev->fds[0]); + if (ret) + close(dev->fds[0]); return ret; } @@ -856,6 +864,8 @@ static int __cmd_dev_add(const struct dev_ctx *ctx) ret = ublk_start_daemon(ctx, dev); ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\b", ret); + if (ret < 0) + ublk_ctrl_del_dev(dev); fail: if (ret < 0) From 723977cab4c0fdcf5ba08da9e30a6ad72efa2464 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 22 Mar 2025 17:32:09 +0800 Subject: [PATCH 101/108] selftests: ublk: add generic_01 for verifying sequential IO order block layer, ublk and io_uring 
might re-order IO in the past - plug - queue ublk io command via task work Add one test for verifying if sequential WRITE IO is dispatched in order. - null target is taken, so we can just observe io order from `tracepoint:block:block_rq_complete` which represents the dispatch order - WRITE IO is taken because READ may come from system-wide utility Cc: Uday Shankar Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250322093218.431419-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 4 +- tools/testing/selftests/ublk/test_common.sh | 22 ++++++++++ .../testing/selftests/ublk/test_generic_01.sh | 44 +++++++++++++++++++ tools/testing/selftests/ublk/trace/seq_io.bt | 25 +++++++++++ 4 files changed, 94 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/ublk/test_generic_01.sh create mode 100644 tools/testing/selftests/ublk/trace/seq_io.bt diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 5d8d5939f051e..652ab40adb733 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -3,7 +3,9 @@ CFLAGS += -O3 -Wl,-no-as-needed -Wall -I $(top_srcdir) LDLIBS += -lpthread -lm -luring -TEST_PROGS := test_null_01.sh +TEST_PROGS := test_generic_01.sh + +TEST_PROGS += test_null_01.sh TEST_PROGS += test_loop_01.sh TEST_PROGS += test_loop_02.sh TEST_PROGS += test_loop_03.sh diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 48fca609e741a..75f54ac6b1c4d 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -3,6 +3,26 @@ UBLK_SKIP_CODE=4 +_have_program() { + if command -v "$1" >/dev/null 2>&1; then + return 0 + fi + return 1 +} + +_get_disk_dev_t() { + local dev_id=$1 + local dev + local major + local minor + + dev=/dev/ublkb"${dev_id}" + major=$(stat -c '%Hr' "$dev") + minor=$(stat -c '%Lr' "$dev") + + echo $(( (major & 0xfff) << 20 | (minor & 0xfffff) )) +} + _create_backfile() { local my_size=$1 local my_file @@ -121,6 +141,7 @@ _check_add_dev() _cleanup_test() { "${UBLK_PROG}" del -a + rm -f "$UBLK_TMP" } _have_feature() @@ -216,6 +237,7 @@ _ublk_test_top_dir() cd "$(dirname "$0")" && pwd } +UBLK_TMP=$(mktemp ublk_test_XXXXX) UBLK_PROG=$(_ublk_test_top_dir)/kublk UBLK_TEST_QUIET=1 UBLK_TEST_SHOW_RESULT=1 diff --git a/tools/testing/selftests/ublk/test_generic_01.sh b/tools/testing/selftests/ublk/test_generic_01.sh new file mode 100755 index 0000000000000..9227a208ba531 --- /dev/null +++ b/tools/testing/selftests/ublk/test_generic_01.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="generic_01" +ERR_CODE=0 + +if ! _have_program bpftrace; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "null" "sequential io order" + +dev_id=$(_add_ublk_dev -t null) +_check_add_dev $TID $? + +dev_t=$(_get_disk_dev_t "$dev_id") +bpftrace trace/seq_io.bt "$dev_t" "W" 1 > "$UBLK_TMP" 2>&1 & +btrace_pid=$! +sleep 2 + +if ! kill -0 "$btrace_pid" > /dev/null 2>&1; then + _cleanup_test "null" + exit "$UBLK_SKIP_CODE" +fi + +# run fio over this ublk disk +fio --name=write_seq \ + --filename=/dev/ublkb"${dev_id}" \ + --ioengine=libaio --iodepth=16 \ + --rw=write \ + --size=512M \ + --direct=1 \ + --bs=4k > /dev/null 2>&1 +ERR_CODE=$? 
+kill "$btrace_pid" +wait +if grep -q "io_out_of_order" "$UBLK_TMP"; then + cat "$UBLK_TMP" + ERR_CODE=255 +fi +_cleanup_test "null" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/trace/seq_io.bt b/tools/testing/selftests/ublk/trace/seq_io.bt new file mode 100644 index 0000000000000..272ac54c9d5fa --- /dev/null +++ b/tools/testing/selftests/ublk/trace/seq_io.bt @@ -0,0 +1,25 @@ +/* + $1: dev_t + $2: RWBS + $3: strlen($2) +*/ +BEGIN { + @last_rw[$1, str($2)] = 0; +} +tracepoint:block:block_rq_complete +{ + $dev = $1; + if ((int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)) { + $last = @last_rw[$dev, str($2)]; + if ((uint64)args.sector != $last) { + printf("io_out_of_order: exp %llu actual %llu\n", + args.sector, $last); + } + @last_rw[$dev, str($2)] = (args.sector + args.nr_sector); + } + @ios = count(); +} + +END { + clear(@last_rw); +} From f2639ed11e256b957690e241bb04ec9912367d60 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 22 Mar 2025 17:32:10 +0800 Subject: [PATCH 102/108] selftests: ublk: add single sqe allocator helper Unify the sqe allocator helper, and we will use it for supporting more cases, such as ublk stripe, in which variable sqe allocation is required. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250322093218.431419-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/file_backed.c | 50 +++++++++++----------- tools/testing/selftests/ublk/kublk.c | 20 ++++----- tools/testing/selftests/ublk/kublk.h | 26 +++++------ 3 files changed, 44 insertions(+), 52 deletions(-) diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index 570a5158b6655..f58fa4ec9b514 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -69,44 +69,42 @@ static int loop_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_de { int zc = ublk_queue_use_zc(q); enum io_uring_op op = ublk_to_uring_op(iod, zc); - struct io_uring_sqe *reg; - struct io_uring_sqe *rw; - struct io_uring_sqe *ureg; + struct io_uring_sqe *sqe[3]; if (!zc) { - rw = ublk_queue_alloc_sqe(q); - if (!rw) + ublk_queue_alloc_sqes(q, sqe, 1); + if (!sqe[0]) return -ENOMEM; - io_uring_prep_rw(op, rw, 1 /*fds[1]*/, + io_uring_prep_rw(op, sqe[0], 1 /*fds[1]*/, (void *)iod->addr, iod->nr_sectors << 9, iod->start_sector << 9); - io_uring_sqe_set_flags(rw, IOSQE_FIXED_FILE); + io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); q->io_inflight++; /* bit63 marks us as tgt io */ - rw->user_data = build_user_data(tag, op, UBLK_IO_TGT_NORMAL, 1); + sqe[0]->user_data = build_user_data(tag, op, UBLK_IO_TGT_NORMAL, 1); return 0; } - ublk_queue_alloc_sqe3(q, ®, &rw, &ureg); + ublk_queue_alloc_sqes(q, sqe, 3); - io_uring_prep_buf_register(reg, 0, tag, q->q_id, tag); - reg->user_data = build_user_data(tag, 0xfe, 1, 1); - reg->flags |= IOSQE_CQE_SKIP_SUCCESS; - reg->flags |= IOSQE_IO_LINK; + io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag); + sqe[0]->user_data = build_user_data(tag, 0xfe, 1, 1); + sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS; + sqe[0]->flags |= IOSQE_IO_LINK; - io_uring_prep_rw(op, rw, 1 /*fds[1]*/, 0, + io_uring_prep_rw(op, sqe[1], 1 /*fds[1]*/, 0, iod->nr_sectors << 9, iod->start_sector << 9); - rw->buf_index = tag; - rw->flags |= IOSQE_FIXED_FILE; - rw->flags |= IOSQE_IO_LINK; - rw->user_data = build_user_data(tag, op, UBLK_IO_TGT_ZC_OP, 1); + sqe[1]->buf_index = tag; + sqe[1]->flags |= IOSQE_FIXED_FILE; + sqe[1]->flags |= IOSQE_IO_LINK; + sqe[1]->user_data 
= build_user_data(tag, op, UBLK_IO_TGT_ZC_OP, 1); q->io_inflight++; - io_uring_prep_buf_unregister(ureg, 0, tag, q->q_id, tag); - ureg->user_data = build_user_data(tag, 0xff, UBLK_IO_TGT_ZC_BUF, 1); + io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, tag); + sqe[2]->user_data = build_user_data(tag, 0xff, UBLK_IO_TGT_ZC_BUF, 1); q->io_inflight++; return 0; @@ -116,17 +114,17 @@ static int loop_queue_tgt_io(struct ublk_queue *q, int tag) { const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); unsigned ublk_op = ublksrv_get_op(iod); - struct io_uring_sqe *sqe; + struct io_uring_sqe *sqe[1]; switch (ublk_op) { case UBLK_IO_OP_FLUSH: - sqe = ublk_queue_alloc_sqe(q); - if (!sqe) + ublk_queue_alloc_sqes(q, sqe, 1); + if (!sqe[0]) return -ENOMEM; - io_uring_prep_fsync(sqe, 1 /*fds[1]*/, IORING_FSYNC_DATASYNC); - io_uring_sqe_set_flags(sqe, IOSQE_FIXED_FILE); + io_uring_prep_fsync(sqe[0], 1 /*fds[1]*/, IORING_FSYNC_DATASYNC); + io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); q->io_inflight++; - sqe->user_data = build_user_data(tag, ublk_op, UBLK_IO_TGT_NORMAL, 1); + sqe[0]->user_data = build_user_data(tag, ublk_op, UBLK_IO_TGT_NORMAL, 1); break; case UBLK_IO_OP_WRITE_ZEROES: case UBLK_IO_OP_DISCARD: diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 11005a87bcfaa..0080cad1f3aec 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -420,7 +420,7 @@ static void ublk_dev_unprep(struct ublk_dev *dev) int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag) { struct ublksrv_io_cmd *cmd; - struct io_uring_sqe *sqe; + struct io_uring_sqe *sqe[1]; unsigned int cmd_op = 0; __u64 user_data; @@ -441,24 +441,24 @@ int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag) if (io_uring_sq_space_left(&q->ring) < 1) io_uring_submit(&q->ring); - sqe = ublk_queue_alloc_sqe(q); - if (!sqe) { + ublk_queue_alloc_sqes(q, sqe, 1); + if (!sqe[0]) { ublk_err("%s: run out of sqe %d, tag %d\n", __func__, q->q_id, tag); return -1; } - cmd = (struct ublksrv_io_cmd *)ublk_get_sqe_cmd(sqe); + cmd = (struct ublksrv_io_cmd *)ublk_get_sqe_cmd(sqe[0]); if (cmd_op == UBLK_U_IO_COMMIT_AND_FETCH_REQ) cmd->result = io->result; /* These fields should be written once, never change */ - ublk_set_sqe_cmd_op(sqe, cmd_op); - sqe->fd = 0; /* dev->fds[0] */ - sqe->opcode = IORING_OP_URING_CMD; - sqe->flags = IOSQE_FIXED_FILE; - sqe->rw_flags = 0; + ublk_set_sqe_cmd_op(sqe[0], cmd_op); + sqe[0]->fd = 0; /* dev->fds[0] */ + sqe[0]->opcode = IORING_OP_URING_CMD; + sqe[0]->flags = IOSQE_FIXED_FILE; + sqe[0]->rw_flags = 0; cmd->tag = tag; cmd->q_id = q->q_id; if (!(q->state & UBLKSRV_NO_BUF)) @@ -467,7 +467,7 @@ int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag) cmd->addr = 0; user_data = build_user_data(tag, _IOC_NR(cmd_op), 0, 0); - io_uring_sqe_set_data64(sqe, user_data); + io_uring_sqe_set_data64(sqe[0], user_data); io->flags = 0; diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 3ff9ac5104a70..9cd7ab62f2582 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -221,28 +221,22 @@ static inline void ublk_dbg(int level, const char *fmt, ...) 
} } -static inline struct io_uring_sqe *ublk_queue_alloc_sqe(struct ublk_queue *q) +static inline int ublk_queue_alloc_sqes(struct ublk_queue *q, + struct io_uring_sqe *sqes[], int nr_sqes) { unsigned left = io_uring_sq_space_left(&q->ring); + int i; - if (left < 1) + if (left < nr_sqes) io_uring_submit(&q->ring); - return io_uring_get_sqe(&q->ring); -} - -static inline void ublk_queue_alloc_sqe3(struct ublk_queue *q, - struct io_uring_sqe **sqe1, struct io_uring_sqe **sqe2, - struct io_uring_sqe **sqe3) -{ - struct io_uring *r = &q->ring; - unsigned left = io_uring_sq_space_left(r); - if (left < 3) - io_uring_submit(r); + for (i = 0; i < nr_sqes; i++) { + sqes[i] = io_uring_get_sqe(&q->ring); + if (!sqes[i]) + return i; + } - *sqe1 = io_uring_get_sqe(r); - *sqe2 = io_uring_get_sqe(r); - *sqe3 = io_uring_get_sqe(r); + return nr_sqes; } static inline void io_uring_prep_buf_register(struct io_uring_sqe *sqe, From 9413c0ca8e455efb16b81f2c99061f6eb3d38281 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 22 Mar 2025 17:32:11 +0800 Subject: [PATCH 103/108] selftests: ublk: increase max buffer size to 1MB Increase max buffer size to 1MB, and 64KB is too small to evaluate performance with builtin ublk server implementation. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250322093218.431419-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 9cd7ab62f2582..40b89dcf07048 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -40,7 +40,7 @@ /* queue idle timeout */ #define UBLKSRV_IO_IDLE_SECS 20 -#define UBLK_IO_MAX_BYTES 65536 +#define UBLK_IO_MAX_BYTES (1 << 20) #define UBLK_MAX_QUEUES 4 #define UBLK_QUEUE_DEPTH 128 From 10d962dae2f6b4a7d86579cc6fe9d8987117fa8f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 22 Mar 2025 17:32:12 +0800 Subject: [PATCH 104/108] selftests: ublk: move common code into common.c Move two functions for initializing & de-initializing backing file into common.c. Also move one common helper into kublk.h. Prepare for supporting ublk-stripe. 
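As a rough illustration only (not part of this patch: the "demo" names below are hypothetical, while backing_file_tgt_init(), backing_file_tgt_deinit(), struct ublk_dev and dev->fds[] are taken from this series), a new target built on the shared helpers would simply wrap them from its init/deinit hooks:

    /* illustrative sketch: "demo" is a made-up target reusing the shared helpers */
    #include "kublk.h"

    static int demo_tgt_init(struct ublk_dev *dev)
    {
    	int ret = backing_file_tgt_init(dev);	/* open backing files into dev->fds[] */

    	if (ret)
    		return ret;

    	/* target-specific device size / params setup would follow here */
    	return 0;
    }

    static void demo_tgt_deinit(struct ublk_dev *dev)
    {
    	backing_file_tgt_deinit(dev);	/* fsync and close the backing fds */
    }
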
Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250322093218.431419-5-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 2 +- tools/testing/selftests/ublk/common.c | 55 ++++++++++++++++++++++ tools/testing/selftests/ublk/file_backed.c | 52 -------------------- tools/testing/selftests/ublk/kublk.h | 2 + 4 files changed, 58 insertions(+), 53 deletions(-) create mode 100644 tools/testing/selftests/ublk/common.c diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 652ab40adb733..03dae5184d08e 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -18,7 +18,7 @@ TEST_GEN_PROGS_EXTENDED = kublk include ../lib.mk -$(TEST_GEN_PROGS_EXTENDED): kublk.c null.c file_backed.c +$(TEST_GEN_PROGS_EXTENDED): kublk.c null.c file_backed.c common.c check: shellcheck -x -f gcc *.sh diff --git a/tools/testing/selftests/ublk/common.c b/tools/testing/selftests/ublk/common.c new file mode 100644 index 0000000000000..01580a6f85196 --- /dev/null +++ b/tools/testing/selftests/ublk/common.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "kublk.h" + +void backing_file_tgt_deinit(struct ublk_dev *dev) +{ + int i; + + for (i = 1; i < dev->nr_fds; i++) { + fsync(dev->fds[i]); + close(dev->fds[i]); + } +} + +int backing_file_tgt_init(struct ublk_dev *dev) +{ + int fd, i; + + assert(dev->nr_fds == 1); + + for (i = 0; i < dev->tgt.nr_backing_files; i++) { + char *file = dev->tgt.backing_file[i]; + unsigned long bytes; + struct stat st; + + ublk_dbg(UBLK_DBG_DEV, "%s: file %d: %s\n", __func__, i, file); + + fd = open(file, O_RDWR | O_DIRECT); + if (fd < 0) { + ublk_err("%s: backing file %s can't be opened: %s\n", + __func__, file, strerror(errno)); + return -EBADF; + } + + if (fstat(fd, &st) < 0) { + close(fd); + return -EBADF; + } + + if (S_ISREG(st.st_mode)) + bytes = st.st_size; + else if (S_ISBLK(st.st_mode)) { + if (ioctl(fd, BLKGETSIZE64, &bytes) != 0) + return -1; + } else { + return -EINVAL; + } + + dev->tgt.backing_file_size[i] = bytes; + dev->fds[dev->nr_fds] = fd; + dev->nr_fds += 1; + } + + return 0; +} diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index f58fa4ec9b514..a2e8793390a87 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -2,58 +2,6 @@ #include "kublk.h" -static void backing_file_tgt_deinit(struct ublk_dev *dev) -{ - int i; - - for (i = 1; i < dev->nr_fds; i++) { - fsync(dev->fds[i]); - close(dev->fds[i]); - } -} - -static int backing_file_tgt_init(struct ublk_dev *dev) -{ - int fd, i; - - assert(dev->nr_fds == 1); - - for (i = 0; i < dev->tgt.nr_backing_files; i++) { - char *file = dev->tgt.backing_file[i]; - unsigned long bytes; - struct stat st; - - ublk_dbg(UBLK_DBG_DEV, "%s: file %d: %s\n", __func__, i, file); - - fd = open(file, O_RDWR | O_DIRECT); - if (fd < 0) { - ublk_err("%s: backing file %s can't be opened: %s\n", - __func__, file, strerror(errno)); - return -EBADF; - } - - if (fstat(fd, &st) < 0) { - close(fd); - return -EBADF; - } - - if (S_ISREG(st.st_mode)) - bytes = st.st_size; - else if (S_ISBLK(st.st_mode)) { - if (ioctl(fd, BLKGETSIZE64, &bytes) != 0) - return -1; - } else { - return -EINVAL; - } - - dev->tgt.backing_file_size[i] = bytes; - dev->fds[dev->nr_fds] = fd; - dev->nr_fds += 1; - } - - return 0; -} - static enum io_uring_op ublk_to_uring_op(const struct ublksrv_io_desc *iod, int zc) { unsigned ublk_op = 
ublksrv_get_op(iod); diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 40b89dcf07048..eaadd7364e258 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -320,4 +320,6 @@ static inline int ublk_queue_use_zc(const struct ublk_queue *q) extern const struct ublk_tgt_ops null_tgt_ops; extern const struct ublk_tgt_ops loop_tgt_ops; +void backing_file_tgt_deinit(struct ublk_dev *dev); +int backing_file_tgt_init(struct ublk_dev *dev); #endif From 8842b72a821d4cd49281fa096c35f9fa630ec981 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 22 Mar 2025 17:32:13 +0800 Subject: [PATCH 105/108] selftests: ublk: prepare for supporting stripe target - pass 'struct dev_ctx *ctx' to target init function - add 'private_data' to 'struct ublk_dev' for storing target specific data - add 'private_data' to 'struct ublk_io' for storing per-IO data - add 'tgt_ios' to 'struct ublk_io' for counting how many io_uring ios for handling the current io command - add helper ublk_get_io() for supporting stripe target - add two helpers for simplifying target io handling Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250322093218.431419-6-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/file_backed.c | 2 +- tools/testing/selftests/ublk/kublk.c | 6 ++-- tools/testing/selftests/ublk/kublk.h | 34 +++++++++++++++++++++- tools/testing/selftests/ublk/null.c | 2 +- 4 files changed, 38 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index a2e8793390a87..e2287eedaac8a 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -123,7 +123,7 @@ static void ublk_loop_io_done(struct ublk_queue *q, int tag, q->io_inflight--; } -static int ublk_loop_tgt_init(struct ublk_dev *dev) +static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) { unsigned long long bytes; int ret; diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 0080cad1f3aec..2dd17663ef300 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -381,7 +381,7 @@ static int ublk_queue_init(struct ublk_queue *q) #define WAIT_USEC 100000 #define MAX_WAIT_USEC (3 * 1000000) -static int ublk_dev_prep(struct ublk_dev *dev) +static int ublk_dev_prep(const struct dev_ctx *ctx, struct ublk_dev *dev) { int dev_id = dev->dev_info.dev_id; unsigned int wait_usec = 0; @@ -404,7 +404,7 @@ static int ublk_dev_prep(struct ublk_dev *dev) dev->fds[0] = fd; if (dev->tgt.ops->init_tgt) - ret = dev->tgt.ops->init_tgt(dev); + ret = dev->tgt.ops->init_tgt(ctx, dev); if (ret) close(dev->fds[0]); return ret; @@ -666,7 +666,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__); - ret = ublk_dev_prep(dev); + ret = ublk_dev_prep(ctx, dev); if (ret) return ret; diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index eaadd7364e258..4eee9ad2beadb 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -94,11 +94,14 @@ struct ublk_io { unsigned short refs; /* used by target code only */ int result; + + unsigned short tgt_ios; + void *private_data; }; struct ublk_tgt_ops { const char *name; - int (*init_tgt)(struct ublk_dev *); + int (*init_tgt)(const struct dev_ctx *ctx, struct ublk_dev *); void
(*deinit_tgt)(struct ublk_dev *); int (*queue_io)(struct ublk_queue *, int tag); @@ -146,6 +149,8 @@ struct ublk_dev { int nr_fds; int ctrl_fd; struct io_uring ring; + + void *private_data; }; #ifndef offsetof @@ -303,6 +308,11 @@ static inline void ublk_set_sqe_cmd_op(struct io_uring_sqe *sqe, __u32 cmd_op) addr[1] = 0; } +static inline struct ublk_io *ublk_get_io(struct ublk_queue *q, unsigned tag) +{ + return &q->ios[tag]; +} + static inline int ublk_complete_io(struct ublk_queue *q, unsigned tag, int res) { struct ublk_io *io = &q->ios[tag]; @@ -312,6 +322,28 @@ static inline int ublk_complete_io(struct ublk_queue *q, unsigned tag, int res) return ublk_queue_io_cmd(q, io, tag); } +static inline void ublk_queued_tgt_io(struct ublk_queue *q, unsigned tag, int queued) +{ + if (queued < 0) + ublk_complete_io(q, tag, queued); + else { + struct ublk_io *io = ublk_get_io(q, tag); + + q->io_inflight += queued; + io->tgt_ios = queued; + io->result = 0; + } +} + +static inline int ublk_completed_tgt_io(struct ublk_queue *q, unsigned tag) +{ + struct ublk_io *io = ublk_get_io(q, tag); + + q->io_inflight--; + + return --io->tgt_ios == 0; +} + static inline int ublk_queue_use_zc(const struct ublk_queue *q) { return q->state & UBLKSRV_ZC; diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c index b6ef16a8f5145..975a11db22fda 100644 --- a/tools/testing/selftests/ublk/null.c +++ b/tools/testing/selftests/ublk/null.c @@ -2,7 +2,7 @@ #include "kublk.h" -static int ublk_null_tgt_init(struct ublk_dev *dev) +static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) { const struct ublksrv_ctrl_dev_info *info = &dev->dev_info; unsigned long dev_size = 250UL << 30; From 8cb9b971e2b6103c72faf765f64239f86ec9328f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 22 Mar 2025 17:32:14 +0800 Subject: [PATCH 106/108] selftests: ublk: enable zero copy for null target Enable zero copy for null target so that we can evaluate performance from zero copy or not. Also this should be the simplest ublk zero copy implementation, which can be served as zc example. Add test for covering 'add -t null -z'. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250322093218.431419-7-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 1 + tools/testing/selftests/ublk/kublk.h | 5 ++ tools/testing/selftests/ublk/null.c | 70 +++++++++++++++++++- tools/testing/selftests/ublk/test_null_02.sh | 20 ++++++ 4 files changed, 95 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/ublk/test_null_02.sh diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 03dae5184d08e..36f50c000e556 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -6,6 +6,7 @@ LDLIBS += -lpthread -lm -luring TEST_PROGS := test_generic_01.sh TEST_PROGS += test_null_01.sh +TEST_PROGS += test_null_02.sh TEST_PROGS += test_loop_01.sh TEST_PROGS += test_loop_02.sh TEST_PROGS += test_loop_03.sh diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 4eee9ad2beadb..48ca160557102 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -198,6 +198,11 @@ static inline unsigned int user_data_to_tgt_data(__u64 user_data) return (user_data >> 24) & 0xffff; } +static inline unsigned short ublk_cmd_op_nr(unsigned int op) +{ + return _IOC_NR(op); +} + static inline void ublk_err(const char *fmt, ...) 
{ va_list ap; diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c index 975a11db22fda..899875ff50fea 100644 --- a/tools/testing/selftests/ublk/null.c +++ b/tools/testing/selftests/ublk/null.c @@ -2,6 +2,14 @@ #include "kublk.h" +#ifndef IORING_NOP_INJECT_RESULT +#define IORING_NOP_INJECT_RESULT (1U << 0) +#endif + +#ifndef IORING_NOP_FIXED_BUFFER +#define IORING_NOP_FIXED_BUFFER (1U << 3) +#endif + static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) { const struct ublksrv_ctrl_dev_info *info = &dev->dev_info; @@ -20,14 +28,73 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) }, }; + if (info->flags & UBLK_F_SUPPORT_ZERO_COPY) + dev->tgt.sq_depth = dev->tgt.cq_depth = 2 * info->queue_depth; return 0; } +static int null_queue_zc_io(struct ublk_queue *q, int tag) +{ + const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); + unsigned ublk_op = ublksrv_get_op(iod); + struct io_uring_sqe *sqe[3]; + + ublk_queue_alloc_sqes(q, sqe, 3); + + io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag); + sqe[0]->user_data = build_user_data(tag, + ublk_cmd_op_nr(sqe[0]->cmd_op), 0, 1); + sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; + + io_uring_prep_nop(sqe[1]); + sqe[1]->buf_index = tag; + sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK; + sqe[1]->rw_flags = IORING_NOP_FIXED_BUFFER | IORING_NOP_INJECT_RESULT; + sqe[1]->len = iod->nr_sectors << 9; /* injected result */ + sqe[1]->user_data = build_user_data(tag, ublk_op, 0, 1); + + io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, tag); + sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, 1); + + // buf register is marked as IOSQE_CQE_SKIP_SUCCESS + return 2; +} + +static void ublk_null_io_done(struct ublk_queue *q, int tag, + const struct io_uring_cqe *cqe) +{ + unsigned op = user_data_to_op(cqe->user_data); + struct ublk_io *io = ublk_get_io(q, tag); + + if (cqe->res < 0 || op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) { + if (!io->result) + io->result = cqe->res; + if (cqe->res < 0) + ublk_err("%s: io failed op %x user_data %lx\n", + __func__, op, cqe->user_data); + } + + /* buffer register op is IOSQE_CQE_SKIP_SUCCESS */ + if (op == ublk_cmd_op_nr(UBLK_U_IO_REGISTER_IO_BUF)) + io->tgt_ios += 1; + + if (ublk_completed_tgt_io(q, tag)) + ublk_complete_io(q, tag, io->result); +} + static int ublk_null_queue_io(struct ublk_queue *q, int tag) { const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); + int zc = ublk_queue_use_zc(q); + int queued; + + if (!zc) { + ublk_complete_io(q, tag, iod->nr_sectors << 9); + return 0; + } - ublk_complete_io(q, tag, iod->nr_sectors << 9); + queued = null_queue_zc_io(q, tag); + ublk_queued_tgt_io(q, tag, queued); return 0; } @@ -35,4 +102,5 @@ const struct ublk_tgt_ops null_tgt_ops = { .name = "null", .init_tgt = ublk_null_tgt_init, .queue_io = ublk_null_queue_io, + .tgt_io_done = ublk_null_io_done, }; diff --git a/tools/testing/selftests/ublk/test_null_02.sh b/tools/testing/selftests/ublk/test_null_02.sh new file mode 100755 index 0000000000000..5633ca8766554 --- /dev/null +++ b/tools/testing/selftests/ublk/test_null_02.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="null_02" +ERR_CODE=0 + +_prep_test "null" "basic IO test with zero copy" + +dev_id=$(_add_ublk_dev -t null -z) +_check_add_dev $TID $? 
+ +# run fio over the two disks +fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio --rw=readwrite --iodepth=32 --size=256M > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "null" + +_show_result $TID $ERR_CODE From 263846eb431f31ca3f38846c374377b732abb26e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 22 Mar 2025 17:32:15 +0800 Subject: [PATCH 107/108] selftests: ublk: simplify loop io completion Use the added target io handling helpers for simplifying loop io completion. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250322093218.431419-8-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/file_backed.c | 91 +++++++++++----------- tools/testing/selftests/ublk/kublk.h | 4 - 2 files changed, 47 insertions(+), 48 deletions(-) diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index e2287eedaac8a..6f34eabfae979 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -13,8 +13,22 @@ static enum io_uring_op ublk_to_uring_op(const struct ublksrv_io_desc *iod, int assert(0); } +static int loop_queue_flush_io(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) +{ + unsigned ublk_op = ublksrv_get_op(iod); + struct io_uring_sqe *sqe[1]; + + ublk_queue_alloc_sqes(q, sqe, 1); + io_uring_prep_fsync(sqe[0], 1 /*fds[1]*/, IORING_FSYNC_DATASYNC); + io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); + /* bit63 marks us as tgt io */ + sqe[0]->user_data = build_user_data(tag, ublk_op, 0, 1); + return 1; +} + static int loop_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) { + unsigned ublk_op = ublksrv_get_op(iod); int zc = ublk_queue_use_zc(q); enum io_uring_op op = ublk_to_uring_op(iod, zc); struct io_uring_sqe *sqe[3]; @@ -29,98 +43,87 @@ static int loop_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_de iod->nr_sectors << 9, iod->start_sector << 9); io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); - q->io_inflight++; /* bit63 marks us as tgt io */ - sqe[0]->user_data = build_user_data(tag, op, UBLK_IO_TGT_NORMAL, 1); - return 0; + sqe[0]->user_data = build_user_data(tag, ublk_op, 0, 1); + return 1; } ublk_queue_alloc_sqes(q, sqe, 3); io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag); - sqe[0]->user_data = build_user_data(tag, 0xfe, 1, 1); - sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS; - sqe[0]->flags |= IOSQE_IO_LINK; + sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; + sqe[0]->user_data = build_user_data(tag, + ublk_cmd_op_nr(sqe[0]->cmd_op), 0, 1); io_uring_prep_rw(op, sqe[1], 1 /*fds[1]*/, 0, iod->nr_sectors << 9, iod->start_sector << 9); sqe[1]->buf_index = tag; - sqe[1]->flags |= IOSQE_FIXED_FILE; - sqe[1]->flags |= IOSQE_IO_LINK; - sqe[1]->user_data = build_user_data(tag, op, UBLK_IO_TGT_ZC_OP, 1); - q->io_inflight++; + sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK; + sqe[1]->user_data = build_user_data(tag, ublk_op, 0, 1); io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, tag); - sqe[2]->user_data = build_user_data(tag, 0xff, UBLK_IO_TGT_ZC_BUF, 1); - q->io_inflight++; + sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, 1); - return 0; + return 2; } static int loop_queue_tgt_io(struct ublk_queue *q, int tag) { const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); unsigned ublk_op = ublksrv_get_op(iod); - struct io_uring_sqe *sqe[1]; + int ret; switch (ublk_op) { case UBLK_IO_OP_FLUSH: - ublk_queue_alloc_sqes(q, sqe, 1); - if 
(!sqe[0]) - return -ENOMEM; - io_uring_prep_fsync(sqe[0], 1 /*fds[1]*/, IORING_FSYNC_DATASYNC); - io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); - q->io_inflight++; - sqe[0]->user_data = build_user_data(tag, ublk_op, UBLK_IO_TGT_NORMAL, 1); + ret = loop_queue_flush_io(q, iod, tag); break; case UBLK_IO_OP_WRITE_ZEROES: case UBLK_IO_OP_DISCARD: - return -ENOTSUP; + ret = -ENOTSUP; + break; case UBLK_IO_OP_READ: case UBLK_IO_OP_WRITE: - loop_queue_tgt_rw_io(q, iod, tag); + ret = loop_queue_tgt_rw_io(q, iod, tag); break; default: - return -EINVAL; + ret = -EINVAL; + break; } ublk_dbg(UBLK_DBG_IO, "%s: tag %d ublk io %x %llx %u\n", __func__, tag, iod->op_flags, iod->start_sector, iod->nr_sectors << 9); - return 1; + return ret; } static int ublk_loop_queue_io(struct ublk_queue *q, int tag) { int queued = loop_queue_tgt_io(q, tag); - if (queued < 0) - ublk_complete_io(q, tag, queued); - + ublk_queued_tgt_io(q, tag, queued); return 0; } static void ublk_loop_io_done(struct ublk_queue *q, int tag, const struct io_uring_cqe *cqe) { - int cqe_tag = user_data_to_tag(cqe->user_data); - unsigned tgt_data = user_data_to_tgt_data(cqe->user_data); - int res = cqe->res; + unsigned op = user_data_to_op(cqe->user_data); + struct ublk_io *io = ublk_get_io(q, tag); + + if (cqe->res < 0 || op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) { + if (!io->result) + io->result = cqe->res; + if (cqe->res < 0) + ublk_err("%s: io failed op %x user_data %lx\n", + __func__, op, cqe->user_data); + } - if (res < 0 || tgt_data == UBLK_IO_TGT_NORMAL) - goto complete; + /* buffer register op is IOSQE_CQE_SKIP_SUCCESS */ + if (op == ublk_cmd_op_nr(UBLK_U_IO_REGISTER_IO_BUF)) + io->tgt_ios += 1; - if (tgt_data == UBLK_IO_TGT_ZC_OP) { - ublk_set_io_res(q, tag, cqe->res); - goto exit; - } - assert(tgt_data == UBLK_IO_TGT_ZC_BUF); - res = ublk_get_io_res(q, tag); -complete: - assert(tag == cqe_tag); - ublk_complete_io(q, tag, res); -exit: - q->io_inflight--; + if (ublk_completed_tgt_io(q, tag)) + ublk_complete_io(q, tag, io->result); } static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 48ca160557102..02f0bff7918c7 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -44,10 +44,6 @@ #define UBLK_MAX_QUEUES 4 #define UBLK_QUEUE_DEPTH 128 -#define UBLK_IO_TGT_NORMAL 0 -#define UBLK_IO_TGT_ZC_BUF 1 -#define UBLK_IO_TGT_ZC_OP 2 - #define UBLK_DBG_DEV (1U << 0) #define UBLK_DBG_QUEUE (1U << 1) #define UBLK_DBG_IO_CMD (1U << 2) From 0f3ebf2d4bc0296c61543b2a729151d89c60e1ec Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 22 Mar 2025 17:32:16 +0800 Subject: [PATCH 108/108] selftests: ublk: add stripe target Add ublk stripe target which can take 1~4 underlying backing files or block device, with stripe size 4k ~ 512K. Add two basic tests(write verify & mkfs/mount/umount) over ublk/stripe. This target is helpful to cover multiple IOs aiming at same fixed/registered IO kernel buffer. It is also capable of verifying vectored registered (kernel)buffers in future for zero copy, so far it isn't supported yet. Todo: support vectored registered kernel buffer for ublk/zc. 
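For intuition, a small worked example of the resulting layout, assuming two backing files and the default 64 KiB chunk size (--chunk_size 65536); the arithmetic mirrors calculate_stripe_array() in stripe.c below:

    sectors   0..127  ->  file 0, byte offset 0
    sectors 128..255  ->  file 1, byte offset 0
    sectors 256..383  ->  file 0, byte offset 64 KiB
    sectors 384..511  ->  file 1, byte offset 64 KiB
    ...

i.e. chunk n of the ublk device lands on backing file (n % nr_files) at byte offset (n / nr_files) * chunk_size, and an I/O that crosses chunk boundaries is split into one vectored io_uring request per backing file involved.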
Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250322093218.431419-9-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 4 +- tools/testing/selftests/ublk/kublk.c | 7 +- tools/testing/selftests/ublk/kublk.h | 12 + tools/testing/selftests/ublk/stripe.c | 318 ++++++++++++++++++ .../testing/selftests/ublk/test_stripe_01.sh | 34 ++ .../testing/selftests/ublk/test_stripe_02.sh | 24 ++ 6 files changed, 397 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/ublk/stripe.c create mode 100755 tools/testing/selftests/ublk/test_stripe_01.sh create mode 100755 tools/testing/selftests/ublk/test_stripe_02.sh diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 36f50c000e556..7817afe290053 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -11,6 +11,8 @@ TEST_PROGS += test_loop_01.sh TEST_PROGS += test_loop_02.sh TEST_PROGS += test_loop_03.sh TEST_PROGS += test_loop_04.sh +TEST_PROGS += test_stripe_01.sh +TEST_PROGS += test_stripe_02.sh TEST_PROGS += test_stress_01.sh TEST_PROGS += test_stress_02.sh @@ -19,7 +21,7 @@ TEST_GEN_PROGS_EXTENDED = kublk include ../lib.mk -$(TEST_GEN_PROGS_EXTENDED): kublk.c null.c file_backed.c common.c +$(TEST_GEN_PROGS_EXTENDED): kublk.c null.c file_backed.c common.c stripe.c check: shellcheck -x -f gcc *.sh diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 2dd17663ef300..05147b53c3613 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -9,6 +9,7 @@ unsigned int ublk_dbg_mask = UBLK_LOG; static const struct ublk_tgt_ops *tgt_ops_list[] = { &null_tgt_ops, &loop_tgt_ops, + &stripe_tgt_ops, }; static const struct ublk_tgt_ops *ublk_find_tgt(const char *name) @@ -1060,8 +1061,9 @@ int main(int argc, char *argv[]) { "depth", 1, NULL, 'd' }, { "debug_mask", 1, NULL, 0 }, { "quiet", 0, NULL, 0 }, - { "zero_copy", 1, NULL, 'z' }, + { "zero_copy", 0, NULL, 'z' }, { "foreground", 0, NULL, 0 }, + { "chunk_size", 1, NULL, 0 }, { 0, 0, 0, 0 } }; int option_idx, opt; @@ -1071,6 +1073,7 @@ int main(int argc, char *argv[]) .nr_hw_queues = 2, .dev_id = -1, .tgt_type = "unknown", + .chunk_size = 65536, /* def chunk size is 64K */ }; int ret = -EINVAL, i; @@ -1107,6 +1110,8 @@ int main(int argc, char *argv[]) ublk_dbg_mask = 0; if (!strcmp(longopts[option_idx].name, "foreground")) ctx.fg = 1; + if (!strcmp(longopts[option_idx].name, "chunk_size")) + ctx.chunk_size = strtol(optarg, NULL, 10); } } diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 02f0bff7918c7..f31a5c4d4143e 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include "ublk_dep.h" @@ -66,6 +67,9 @@ struct dev_ctx { unsigned int all:1; unsigned int fg:1; + /* stripe */ + unsigned int chunk_size; + int _evtfd; }; @@ -352,7 +356,15 @@ static inline int ublk_queue_use_zc(const struct ublk_queue *q) extern const struct ublk_tgt_ops null_tgt_ops; extern const struct ublk_tgt_ops loop_tgt_ops; +extern const struct ublk_tgt_ops stripe_tgt_ops; void backing_file_tgt_deinit(struct ublk_dev *dev); int backing_file_tgt_init(struct ublk_dev *dev); + +static inline unsigned int ilog2(unsigned int x) +{ + if (x == 0) + return 0; + return (sizeof(x) * 8 - 1) - __builtin_clz(x); +} #endif diff --git a/tools/testing/selftests/ublk/stripe.c 
b/tools/testing/selftests/ublk/stripe.c new file mode 100644 index 0000000000000..98c564b12f3c9 --- /dev/null +++ b/tools/testing/selftests/ublk/stripe.c @@ -0,0 +1,318 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "kublk.h" + +#define NR_STRIPE MAX_BACK_FILES + +struct stripe_conf { + unsigned nr_files; + unsigned shift; +}; + +struct stripe { + loff_t start; + unsigned nr_sects; + int seq; + + struct iovec *vec; + unsigned nr_vec; + unsigned cap; +}; + +struct stripe_array { + struct stripe s[NR_STRIPE]; + unsigned nr; + struct iovec _vec[]; +}; + +static inline const struct stripe_conf *get_chunk_shift(const struct ublk_queue *q) +{ + return (struct stripe_conf *)q->dev->private_data; +} + +static inline unsigned calculate_nr_vec(const struct stripe_conf *conf, + const struct ublksrv_io_desc *iod) +{ + const unsigned shift = conf->shift - 9; + const unsigned unit_sects = conf->nr_files << shift; + loff_t start = iod->start_sector; + loff_t end = start + iod->nr_sectors; + + return (end / unit_sects) - (start / unit_sects) + 1; +} + +static struct stripe_array *alloc_stripe_array(const struct stripe_conf *conf, + const struct ublksrv_io_desc *iod) +{ + unsigned nr_vecs = calculate_nr_vec(conf, iod); + unsigned total = nr_vecs * conf->nr_files; + struct stripe_array *s; + int i; + + s = malloc(sizeof(*s) + total * sizeof(struct iovec)); + + s->nr = 0; + for (i = 0; i < conf->nr_files; i++) { + struct stripe *t = &s->s[i]; + + t->nr_vec = 0; + t->vec = &s->_vec[i * nr_vecs]; + t->nr_sects = 0; + t->cap = nr_vecs; + } + + return s; +} + +static void free_stripe_array(struct stripe_array *s) +{ + free(s); +} + +static void calculate_stripe_array(const struct stripe_conf *conf, + const struct ublksrv_io_desc *iod, struct stripe_array *s) +{ + const unsigned shift = conf->shift - 9; + const unsigned chunk_sects = 1 << shift; + const unsigned unit_sects = conf->nr_files << shift; + off64_t start = iod->start_sector; + off64_t end = start + iod->nr_sectors; + unsigned long done = 0; + unsigned idx = 0; + + while (start < end) { + unsigned nr_sects = chunk_sects - (start & (chunk_sects - 1)); + loff_t unit_off = (start / unit_sects) * unit_sects; + unsigned seq = (start - unit_off) >> shift; + struct stripe *this = &s->s[idx]; + loff_t stripe_off = (unit_off / conf->nr_files) + + (start & (chunk_sects - 1)); + + if (nr_sects > end - start) + nr_sects = end - start; + if (this->nr_sects == 0) { + this->nr_sects = nr_sects; + this->start = stripe_off; + this->seq = seq; + s->nr += 1; + } else { + assert(seq == this->seq); + assert(this->start + this->nr_sects == stripe_off); + this->nr_sects += nr_sects; + } + + assert(this->nr_vec < this->cap); + this->vec[this->nr_vec].iov_base = (void *)(iod->addr + done); + this->vec[this->nr_vec++].iov_len = nr_sects << 9; + + start += nr_sects; + done += nr_sects << 9; + idx = (idx + 1) % conf->nr_files; + } +} + +static inline enum io_uring_op stripe_to_uring_op(const struct ublksrv_io_desc *iod) +{ + unsigned ublk_op = ublksrv_get_op(iod); + + if (ublk_op == UBLK_IO_OP_READ) + return IORING_OP_READV; + else if (ublk_op == UBLK_IO_OP_WRITE) + return IORING_OP_WRITEV; + assert(0); +} + +static int stripe_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) +{ + const struct stripe_conf *conf = get_chunk_shift(q); + enum io_uring_op op = stripe_to_uring_op(iod); + struct io_uring_sqe *sqe[NR_STRIPE]; + struct stripe_array *s = alloc_stripe_array(conf, iod); + struct ublk_io *io = ublk_get_io(q, tag); + int i; + + 
io->private_data = s; + calculate_stripe_array(conf, iod, s); + + ublk_queue_alloc_sqes(q, sqe, s->nr); + for (i = 0; i < s->nr; i++) { + struct stripe *t = &s->s[i]; + + io_uring_prep_rw(op, sqe[i], + t->seq + 1, + (void *)t->vec, + t->nr_vec, + t->start << 9); + io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE); + /* bit63 marks us as tgt io */ + sqe[i]->user_data = build_user_data(tag, ublksrv_get_op(iod), i, 1); + } + return s->nr; +} + +static int handle_flush(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) +{ + const struct stripe_conf *conf = get_chunk_shift(q); + struct io_uring_sqe *sqe[NR_STRIPE]; + int i; + + ublk_queue_alloc_sqes(q, sqe, conf->nr_files); + for (i = 0; i < conf->nr_files; i++) { + io_uring_prep_fsync(sqe[i], i + 1, IORING_FSYNC_DATASYNC); + io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE); + sqe[i]->user_data = build_user_data(tag, UBLK_IO_OP_FLUSH, 0, 1); + } + return conf->nr_files; +} + +static int stripe_queue_tgt_io(struct ublk_queue *q, int tag) +{ + const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); + unsigned ublk_op = ublksrv_get_op(iod); + int ret = 0; + + switch (ublk_op) { + case UBLK_IO_OP_FLUSH: + ret = handle_flush(q, iod, tag); + break; + case UBLK_IO_OP_WRITE_ZEROES: + case UBLK_IO_OP_DISCARD: + ret = -ENOTSUP; + break; + case UBLK_IO_OP_READ: + case UBLK_IO_OP_WRITE: + ret = stripe_queue_tgt_rw_io(q, iod, tag); + break; + default: + ret = -EINVAL; + break; + } + ublk_dbg(UBLK_DBG_IO, "%s: tag %d ublk io %x %llx %u ret %d\n", __func__, tag, + iod->op_flags, iod->start_sector, iod->nr_sectors << 9, ret); + return ret; +} + +static int ublk_stripe_queue_io(struct ublk_queue *q, int tag) +{ + int queued = stripe_queue_tgt_io(q, tag); + + ublk_queued_tgt_io(q, tag, queued); + return 0; +} + +static void ublk_stripe_io_done(struct ublk_queue *q, int tag, + const struct io_uring_cqe *cqe) +{ + const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); + unsigned op = user_data_to_op(cqe->user_data); + struct ublk_io *io = ublk_get_io(q, tag); + int res = cqe->res; + + if (res < 0) { + if (!io->result) + io->result = res; + ublk_err("%s: io failure %d tag %u\n", __func__, res, tag); + } + + /* fail short READ/WRITE simply */ + if (op == UBLK_IO_OP_READ || op == UBLK_IO_OP_WRITE) { + unsigned seq = user_data_to_tgt_data(cqe->user_data); + struct stripe_array *s = io->private_data; + + if (res < s->s[seq].vec->iov_len) + io->result = -EIO; + } + + if (ublk_completed_tgt_io(q, tag)) { + int res = io->result; + + if (!res) + res = iod->nr_sectors << 9; + + ublk_complete_io(q, tag, res); + + free_stripe_array(io->private_data); + io->private_data = NULL; + } +} + +static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) +{ + struct ublk_params p = { + .types = UBLK_PARAM_TYPE_BASIC, + .basic = { + .attrs = UBLK_ATTR_VOLATILE_CACHE, + .logical_bs_shift = 9, + .physical_bs_shift = 12, + .io_opt_shift = 12, + .io_min_shift = 9, + .max_sectors = dev->dev_info.max_io_buf_bytes >> 9, + }, + }; + unsigned chunk_size = ctx->chunk_size; + struct stripe_conf *conf; + unsigned chunk_shift; + loff_t bytes = 0; + int ret, i; + + if ((chunk_size & (chunk_size - 1)) || !chunk_size) { + ublk_err("invalid chunk size %u\n", chunk_size); + return -EINVAL; + } + + if (chunk_size < 4096 || chunk_size > 512 * 1024) { + ublk_err("invalid chunk size %u\n", chunk_size); + return -EINVAL; + } + + chunk_shift = ilog2(chunk_size); + + ret = backing_file_tgt_init(dev); + if (ret) + return ret; + + if (!dev->tgt.nr_backing_files || 
dev->tgt.nr_backing_files > NR_STRIPE) + return -EINVAL; + + assert(dev->nr_fds == dev->tgt.nr_backing_files + 1); + + for (i = 0; i < dev->tgt.nr_backing_files; i++) + dev->tgt.backing_file_size[i] &= ~((1 << chunk_shift) - 1); + + for (i = 0; i < dev->tgt.nr_backing_files; i++) { + unsigned long size = dev->tgt.backing_file_size[i]; + + if (size != dev->tgt.backing_file_size[0]) + return -EINVAL; + bytes += size; + } + + conf = malloc(sizeof(*conf)); + conf->shift = chunk_shift; + conf->nr_files = dev->tgt.nr_backing_files; + + dev->private_data = conf; + dev->tgt.dev_size = bytes; + p.basic.dev_sectors = bytes >> 9; + dev->tgt.params = p; + dev->tgt.sq_depth = dev->dev_info.queue_depth * conf->nr_files; + dev->tgt.cq_depth = dev->dev_info.queue_depth * conf->nr_files; + + printf("%s: shift %u files %u\n", __func__, conf->shift, conf->nr_files); + + return 0; +} + +static void ublk_stripe_tgt_deinit(struct ublk_dev *dev) +{ + free(dev->private_data); + backing_file_tgt_deinit(dev); +} + +const struct ublk_tgt_ops stripe_tgt_ops = { + .name = "stripe", + .init_tgt = ublk_stripe_tgt_init, + .deinit_tgt = ublk_stripe_tgt_deinit, + .queue_io = ublk_stripe_queue_io, + .tgt_io_done = ublk_stripe_io_done, +}; diff --git a/tools/testing/selftests/ublk/test_stripe_01.sh b/tools/testing/selftests/ublk/test_stripe_01.sh new file mode 100755 index 0000000000000..c01f3dc325ab3 --- /dev/null +++ b/tools/testing/selftests/ublk/test_stripe_01.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="stripe_01" +ERR_CODE=0 + +_prep_test "stripe" "write and verify test" + +backfile_0=$(_create_backfile 256M) +backfile_1=$(_create_backfile 256M) + +dev_id=$(_add_ublk_dev -t stripe "$backfile_0" "$backfile_1") +_check_add_dev $TID $? "${backfile_0}" + +# run fio over the ublk disk +fio --name=write_and_verify \ + --filename=/dev/ublkb"${dev_id}" \ + --ioengine=libaio --iodepth=32 \ + --rw=write \ + --size=512M \ + --direct=1 \ + --verify=crc32c \ + --do_verify=1 \ + --bs=4k > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "stripe" + +_remove_backfile "$backfile_0" +_remove_backfile "$backfile_1" + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_stripe_02.sh b/tools/testing/selftests/ublk/test_stripe_02.sh new file mode 100755 index 0000000000000..e8a45fa82dde0 --- /dev/null +++ b/tools/testing/selftests/ublk/test_stripe_02.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="stripe_02" +ERR_CODE=0 + +_prep_test "stripe" "mkfs & mount & umount" + +backfile_0=$(_create_backfile 256M) +backfile_1=$(_create_backfile 256M) +dev_id=$(_add_ublk_dev -t stripe "$backfile_0" "$backfile_1") +_check_add_dev $TID $? "$backfile_0" "$backfile_1" + +_mkfs_mount_test /dev/ublkb"${dev_id}" +ERR_CODE=$? + +_cleanup_test "stripe" + +_remove_backfile "$backfile_0" +_remove_backfile "$backfile_1" + +_show_result $TID $ERR_CODE
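As a rough manual sketch of exercising the new stripe target outside the test harness (the kublk invocation below is inferred from the option table and the scripts above, and the device ID is assumed to be 0; adjust to whatever ID 'kublk add' actually reports):

    # from tools/testing/selftests/ublk, after building kublk
    truncate -s 256M file0 file1
    ./kublk add -t stripe --chunk_size 65536 file0 file1
    # a /dev/ublkb<ID> node appears; drive I/O against it, e.g.
    fio --name=verify --filename=/dev/ublkb0 --ioengine=libaio --iodepth=32 \
        --rw=write --size=128M --direct=1 --verify=crc32c --do_verify=1 --bs=4k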