From af82425c6a2d2f347c79b63ce74fca6dc6be157f Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 2 Jan 2023 16:49:46 -0700
Subject: [PATCH 1/5] io_uring/io-wq: free worker if task_work creation is
 canceled
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If we cancel the task_work, the worker will never come into existence.
As this is the last reference to it, ensure that we get it freed
appropriately.

Cc: stable@vger.kernel.org
Reported-by: 진호
Signed-off-by: Jens Axboe
---
 io_uring/io-wq.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 6f1d0e5df23ad..992dcd9f8c4cf 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -1230,6 +1230,7 @@ static void io_wq_cancel_tw_create(struct io_wq *wq)
 
 		worker = container_of(cb, struct io_worker, create_work);
 		io_worker_cancel_cb(worker);
+		kfree(worker);
 	}
 }
 

From 9ffa13ff78a0a55df968a72d6f0ebffccee5c9f4 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 4 Jan 2023 01:34:02 +0000
Subject: [PATCH 2/5] io_uring: pin context while queueing deferred tw

Unlike normal tw, nothing prevents deferred tw from being executed right
after a tw item is added to ->work_llist in io_req_local_work_add(). For
instance, the waiting task may get woken up by CQ posting or a normal tw.
Thus we need to pin the ring for the rest of io_req_local_work_add().

Cc: stable@vger.kernel.org
Fixes: c0e0d6ba25f18 ("io_uring: add IORING_SETUP_DEFER_TASKRUN")
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/1a79362b9c10b8523ef70b061d96523650a23344.1672795998.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 58ac13b69dc8d..6bed448556798 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1236,13 +1236,18 @@ static void io_req_local_work_add(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
-	if (!llist_add(&req->io_task_work.node, &ctx->work_llist))
+	percpu_ref_get(&ctx->refs);
+
+	if (!llist_add(&req->io_task_work.node, &ctx->work_llist)) {
+		percpu_ref_put(&ctx->refs);
 		return;
+	}
 
 	/* need it for the following io_cqring_wake() */
 	smp_mb__after_atomic();
 
 	if (unlikely(atomic_read(&req->task->io_uring->in_idle))) {
 		io_move_task_work_from_local(ctx);
+		percpu_ref_put(&ctx->refs);
 		return;
 	}
 
@@ -1252,6 +1257,7 @@ static void io_req_local_work_add(struct io_kiocb *req)
 	if (ctx->has_evfd)
 		io_eventfd_signal(ctx);
 	__io_cqring_wake(ctx);
+	percpu_ref_put(&ctx->refs);
 }
 
 void __io_req_task_work_add(struct io_kiocb *req, bool allow_local)
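
[Editor's note, not part of the series] Patch 2 is the "pin before
publish" pattern: the moment the request is visible on ->work_llist,
another task may run it and drop what would otherwise be the last ring
reference, so io_req_local_work_add() must hold its own reference across
every exit path. Below is a minimal userspace C sketch of that
discipline; the names (ring_get/ring_put, publish_work, add_local_work)
are illustrative stand-ins, not io_uring APIs.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for the ring and its percpu_ref counter. */
struct ring {
	atomic_int refs;
	atomic_bool work_pending;	/* stand-in for ->work_llist */
};

static void ring_get(struct ring *r) { atomic_fetch_add(&r->refs, 1); }
static void ring_put(struct ring *r) { atomic_fetch_sub(&r->refs, 1); }

/* Returns true if no work was pending before, i.e. a wakeup is needed. */
static bool publish_work(struct ring *r)
{
	return !atomic_exchange(&r->work_pending, true);
}

static void add_local_work(struct ring *r)
{
	/*
	 * Pin the ring first: once the work is published, another task
	 * may consume it and drop the last reference, freeing the ring
	 * while we are still using it below.
	 */
	ring_get(r);

	if (!publish_work(r)) {
		/* work already pending, someone else handles the wakeup */
		ring_put(r);
		return;
	}

	/* ... wake up the waiter, signal the eventfd, etc. ... */
	ring_put(r);
}

int main(void)
{
	struct ring r = { .refs = 1, .work_pending = false };

	add_local_work(&r);
	printf("refs after add: %d\n", atomic_load(&r.refs)); /* still 1 */
	return 0;
}
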
From f26cc9593581bd734c846bf827401350b36dc3c9 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 4 Jan 2023 01:34:57 +0000
Subject: [PATCH 3/5] io_uring: lockdep annotate CQ locking

Locking around CQE posting is complex and depends on the options the
ring was created with; add more thorough lockdep annotations checking
all the invariants.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/aa3770b4eacae3915d782cc2ab2f395a99b4b232.1672795976.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c |  5 ++---
 io_uring/io_uring.h | 15 +++++++++++++++
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 6bed448556798..472574192dd63 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -731,6 +731,8 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
 	size_t ocq_size = sizeof(struct io_overflow_cqe);
 	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
 
+	lockdep_assert_held(&ctx->completion_lock);
+
 	if (is_cqe32)
 		ocq_size += sizeof(struct io_uring_cqe);
 
@@ -820,9 +822,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
 {
 	struct io_uring_cqe *cqe;
 
-	if (!ctx->task_complete)
-		lockdep_assert_held(&ctx->completion_lock);
-
 	ctx->cq_extra++;
 
 	/*
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index e9f0d41ebb996..ab4b2a1c3b7e8 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -79,6 +79,19 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
 bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
 			bool cancel_all);
 
+#define io_lockdep_assert_cq_locked(ctx)				\
+	do {								\
+		if (ctx->flags & IORING_SETUP_IOPOLL) {			\
+			lockdep_assert_held(&ctx->uring_lock);		\
+		} else if (!ctx->task_complete) {			\
+			lockdep_assert_held(&ctx->completion_lock);	\
+		} else if (ctx->submitter_task->flags & PF_EXITING) {	\
+			lockdep_assert(current_work());			\
+		} else {						\
+			lockdep_assert(current == ctx->submitter_task);	\
+		}							\
+	} while (0)
+
 static inline void io_req_task_work_add(struct io_kiocb *req)
 {
 	__io_req_task_work_add(req, true);
@@ -92,6 +105,8 @@ void io_cq_unlock_post(struct io_ring_ctx *ctx);
 
 static inline struct io_uring_cqe *io_get_cqe_overflow(struct io_ring_ctx *ctx,
 						       bool overflow)
 {
+	io_lockdep_assert_cq_locked(ctx);
+
 	if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) {
 		struct io_uring_cqe *cqe = ctx->cqe_cached;
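
[Editor's note, not part of the series] The macro added in patch 3
records which synchronization regime protects CQE posting: IOPOLL rings
post under ->uring_lock, rings without task_complete post under
->completion_lock, and DEFER_TASKRUN-style rings are serialized by the
submitter task itself, falling back to a work item once that task is
exiting. A hedged userspace sketch of the same decision tree follows;
the enum, struct and helper names are invented for illustration only.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative labels mirroring the branches of io_lockdep_assert_cq_locked(). */
enum cq_lock_rule {
	CQ_LOCKED_BY_URING_LOCK,	/* IORING_SETUP_IOPOLL */
	CQ_LOCKED_BY_COMPLETION_LOCK,	/* !ctx->task_complete */
	CQ_LOCKED_BY_FALLBACK_WORK,	/* submitter task is exiting */
	CQ_LOCKED_BY_SUBMITTER_TASK,	/* default DEFER_TASKRUN case */
};

struct fake_ctx {
	bool iopoll;
	bool task_complete;
	bool submitter_exiting;
};

static enum cq_lock_rule cq_lock_rule(const struct fake_ctx *ctx)
{
	if (ctx->iopoll)
		return CQ_LOCKED_BY_URING_LOCK;
	if (!ctx->task_complete)
		return CQ_LOCKED_BY_COMPLETION_LOCK;
	if (ctx->submitter_exiting)
		return CQ_LOCKED_BY_FALLBACK_WORK;
	return CQ_LOCKED_BY_SUBMITTER_TASK;
}

int main(void)
{
	struct fake_ctx defer_taskrun = { .task_complete = true };

	/* A live DEFER_TASKRUN ring is serialized by its submitter task. */
	assert(cq_lock_rule(&defer_taskrun) == CQ_LOCKED_BY_SUBMITTER_TASK);
	printf("rule: %d\n", (int)cq_lock_rule(&defer_taskrun));
	return 0;
}
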
From 59b745bb4e0bd445366c45b8df6b51b69134f4f5 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 4 Jan 2023 13:49:54 -0700
Subject: [PATCH 4/5] io_uring: move 'poll_multi_queue' bool in io_ring_ctx

The cacheline section holding this variable has two gaps, one of which
is caused by this bool not packing well with the structs around it.
This causes the section to spill into the next cacheline. Move the
variable, shrinking io_ring_ctx by a full cacheline in size.

Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index dcd8a563ab522..128a67a40065f 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -292,6 +292,8 @@ struct io_ring_ctx {
 	struct {
 		spinlock_t		completion_lock;
 
+		bool			poll_multi_queue;
+
 		/*
 		 * ->iopoll_list is protected by the ctx->uring_lock for
 		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
@@ -300,7 +302,6 @@ struct io_ring_ctx {
 		 */
 		struct io_wq_work_list	iopoll_list;
 		struct io_hash_table	cancel_table;
-		bool			poll_multi_queue;
 
 		struct llist_head	work_llist;
 

From 12521a5d5cb7ff0ad43eadfc9c135d86e1131fa8 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 5 Jan 2023 10:49:15 +0000
Subject: [PATCH 5/5] io_uring: fix CQ waiting timeout handling

The jiffies-to-ktime CQ waiting conversion broke how we treat timeouts;
in particular, we re-arm the timer anew every time we enter
io_cqring_wait_schedule() without adjusting the remaining timeout.
Waiting for 2 CQEs and getting a task_work run in the middle may double
the timeout value, or, even worse, in some cases the task may wait
indefinitely.

Cc: stable@vger.kernel.org
Fixes: 228339662b398 ("io_uring: don't convert to jiffies for waiting on timeouts")
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/f7bffddd71b08f28a877d44d37ac953ddb01590d.1672915663.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 472574192dd63..2ac1cd8d23ea6 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2470,7 +2470,7 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx)
 /* when returns >0, the caller should retry */
 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 					  struct io_wait_queue *iowq,
-					  ktime_t timeout)
+					  ktime_t *timeout)
 {
 	int ret;
 	unsigned long check_cq;
@@ -2488,7 +2488,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 		if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))
 			return -EBADR;
 	}
-	if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
+	if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS))
 		return -ETIME;
 
 	/*
@@ -2564,7 +2564,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		}
 		prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
 					  TASK_INTERRUPTIBLE);
-		ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
+		ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
 		if (__io_cqring_events_user(ctx) >= min_events)
 			break;
 		cond_resched();
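
[Editor's note, not part of the series] The bug patch 5 fixes is that
the wait was re-armed each time around the loop without adjusting the
remaining time, so a task_work run in the middle of the wait could
stretch or even defeat the deadline. The general pattern the fix
restores is: compute one absolute deadline up front and measure every
retry of the wait loop against it. A self-contained userspace C sketch
of that pattern follows (not the kernel code; wait_with_deadline and
got_enough_events are invented names).

#define _POSIX_C_SOURCE 200809L
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

/* Illustrative stand-in for "min_events CQEs have arrived". */
static bool got_enough_events(void) { return false; }

static long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

/*
 * Wait for events against a single absolute deadline.  Each retry
 * compares against the same deadline, so spurious wakeups (the
 * "task_work in the middle" case) cannot stretch the total wait.
 */
static int wait_with_deadline(long long timeout_ns)
{
	const long long deadline = now_ns() + timeout_ns;

	while (!got_enough_events()) {
		long long remaining = deadline - now_ns();

		if (remaining <= 0)
			return -1;	/* timed out, akin to -ETIME */

		/* Sleep for at most the remaining time, then re-check. */
		struct timespec ts = {
			.tv_sec = (time_t)(remaining / 1000000000LL),
			.tv_nsec = (long)(remaining % 1000000000LL),
		};
		nanosleep(&ts, NULL);
	}
	return 0;
}

int main(void)
{
	/* With no events arriving, a 10ms wait times out once. */
	printf("wait returned %d\n", wait_with_deadline(10 * 1000000LL));
	return 0;
}
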