From 38eddb2c75fb99b9cd78445094ca0e1bda08d102 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 16 Oct 2022 21:30:48 +0100 Subject: [PATCH 1/7] io_uring: remove FFS_SCM THe lifetime of SCM'ed files is bound to ring_sock, which is destroyed strictly after we're done with registered file tables. This means there is no need for the FFS_SCM hack, which was not available on 32-bit builds anyway. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/984226a1045adf42dc35d8bd7fb5a8bbfa472ce1.1665891182.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/filetable.h | 15 +-------------- io_uring/io_uring.c | 2 -- io_uring/rsrc.c | 7 ++----- io_uring/rsrc.h | 4 ---- 4 files changed, 3 insertions(+), 25 deletions(-) diff --git a/io_uring/filetable.h b/io_uring/filetable.h index ff3a712e11bf3..19d2aed66c72e 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -5,22 +5,9 @@ #include #include -/* - * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0 - * and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we - * can't safely always dereference the file when the task has exited and ring - * cleanup is done. If a file is tracked and part of SCM, then unix gc on - * process exit may reap it before __io_sqe_files_unregister() is run. - */ #define FFS_NOWAIT 0x1UL #define FFS_ISREG 0x2UL -#if defined(CONFIG_64BIT) -#define FFS_SCM 0x4UL -#else -#define IO_URING_SCM_ALL -#define FFS_SCM 0x0UL -#endif -#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM) +#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files); void io_free_file_tables(struct io_file_table *table); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index de08d9902b30b..18aa39709faec 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1587,8 +1587,6 @@ unsigned int io_file_get_flags(struct file *file) res |= FFS_ISREG; if (__io_file_supports_nowait(file, mode)) res |= FFS_NOWAIT; - if (io_file_need_scm(file)) - res |= FFS_SCM; return res; } diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 012fdb04ec238..55d4ab96fb925 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -757,20 +757,17 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void __io_sqe_files_unregister(struct io_ring_ctx *ctx) { -#if !defined(IO_URING_SCM_ALL) int i; for (i = 0; i < ctx->nr_user_files; i++) { struct file *file = io_file_from_index(&ctx->file_table, i); - if (!file) - continue; - if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM) + /* skip scm accounted files, they'll be freed by ->ring_sock */ + if (!file || io_file_need_scm(file)) continue; io_file_bitmap_clear(&ctx->file_table, i); fput(file); } -#endif #if defined(CONFIG_UNIX) if (ctx->ring_sock) { diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 9bce15665444e..81445a477622b 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -82,11 +82,7 @@ int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file); #if defined(CONFIG_UNIX) static inline bool io_file_need_scm(struct file *filp) { -#if defined(IO_URING_SCM_ALL) - return true; -#else return !!unix_get_socket(filp); -#endif } #else static inline bool io_file_need_scm(struct file *filp) From 4d5059512d283dab7372d282c2fbd43c7f5a2456 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 16 Oct 2022 21:30:49 +0100 Subject: [PATCH 2/7] io_uring: kill hot path fixed file bitmap debug checks We test file_table.bitmap in io_file_get_fixed() to check invariants, don't do it, it's expensive and was showing up in profiles. No reports of this triggering has come in. Move the check to the file clear instead, which will still catch any wrong usage. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/cf77f2ded68d2e5b2bc7355784d969837d48e023.1665891182.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/filetable.h | 1 + io_uring/io_uring.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/filetable.h b/io_uring/filetable.h index 19d2aed66c72e..351111ff88827 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -25,6 +25,7 @@ unsigned int io_file_get_flags(struct file *file); static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) { + WARN_ON_ONCE(!test_bit(bit, table->bitmap)); __clear_bit(bit, table->bitmap); table->alloc_hint = bit; } diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 18aa39709faec..6e50f548de1a7 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1858,7 +1858,6 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, /* mask in overlapping REQ_F and FFS bits */ req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT); io_req_set_rsrc_node(req, ctx, 0); - WARN_ON_ONCE(file && !test_bit(fd, ctx->file_table.bitmap)); out: io_ring_submit_unlock(ctx, issue_flags); return file; From 34f0bc427e94065e7f828e70690f8fe1e01b3a9d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 16 Oct 2022 21:30:50 +0100 Subject: [PATCH 3/7] io_uring: reuse io_alloc_req() Don't duplicate io_alloc_req() in io_req_caches_free() but reuse the helper. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6005fc88274864a49fc3096c22d8bdd605cf8576.1665891182.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 6e50f548de1a7..62be51fbf39c4 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2560,18 +2560,14 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx) static void io_req_caches_free(struct io_ring_ctx *ctx) { - struct io_submit_state *state = &ctx->submit_state; int nr = 0; mutex_lock(&ctx->uring_lock); - io_flush_cached_locked_reqs(ctx, state); + io_flush_cached_locked_reqs(ctx, &ctx->submit_state); while (!io_req_cache_empty(ctx)) { - struct io_wq_work_node *node; - struct io_kiocb *req; + struct io_kiocb *req = io_alloc_req(ctx); - node = wq_stack_extract(&state->free_list); - req = container_of(node, struct io_kiocb, comp_list); kmem_cache_free(req_cachep, req); nr++; } From 02bac94bd8efd75f615ac7515dd2def75b43e5b9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 16 Oct 2022 21:30:51 +0100 Subject: [PATCH 4/7] io_uring: don't iopoll from io_ring_ctx_wait_and_kill() We should not be completing requests from a task context that has already undergone io_uring cancellations, i.e. __io_uring_cancel(), as there are some assumptions, e.g. around cached task refs draining. Remove iopolling from io_ring_ctx_wait_and_kill() as it can be called later after PF_EXITING is set with the last task_work run. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7c03cc91455c4a1af49c6b9cbda4e57ea467aa11.1665891182.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 62be51fbf39c4..6cc16e39b27f0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2804,15 +2804,12 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_poll_remove_all(ctx, NULL, true); mutex_unlock(&ctx->uring_lock); - /* failed during ring init, it couldn't have issued any requests */ - if (ctx->rings) { + /* + * If we failed setting up the ctx, we might not have any rings + * and therefore did not submit any requests + */ + if (ctx->rings) io_kill_timeouts(ctx, NULL, true); - /* if we failed setting up the ctx, we might not have any rings */ - io_iopoll_try_reap_events(ctx); - /* drop cached put refs after potentially doing completions */ - if (current->io_uring) - io_uring_drop_tctx_refs(current); - } INIT_WORK(&ctx->exit_work, io_ring_exit_work); /* From 5c61795ea97c170347c5c4af0c159bd877b8af71 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 16 Oct 2022 17:24:10 -0600 Subject: [PATCH 5/7] io_uring/rw: remove leftover debug statement This debug statement was never meant to go into the upstream release, kill it off before it ends up in a release. It was just part of the testing for the initial version of the patch. Fixes: 2ec33a6c3cca ("io_uring/rw: ensure kiocb_end_write() is always called") Signed-off-by: Jens Axboe --- io_uring/rw.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 100de2626e478..bb47cc4da713c 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -242,8 +242,6 @@ static void io_req_io_end(struct io_kiocb *req) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - WARN_ON(!in_task()); - if (rw->kiocb.ki_flags & IOCB_WRITE) { kiocb_end_write(req); fsnotify_modify(req->file); From 16bbdfe5fb0e78e0acb13e45fc127e9a296913f2 Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Wed, 19 Oct 2022 10:12:18 -0700 Subject: [PATCH 6/7] io_uring/msg_ring: Fix NULL pointer dereference in io_msg_send_fd() Syzkaller produced the below call trace: BUG: KASAN: null-ptr-deref in io_msg_ring+0x3cb/0x9f0 Write of size 8 at addr 0000000000000070 by task repro/16399 CPU: 0 PID: 16399 Comm: repro Not tainted 6.1.0-rc1 #28 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 Call Trace: dump_stack_lvl+0xcd/0x134 ? io_msg_ring+0x3cb/0x9f0 kasan_report+0xbc/0xf0 ? io_msg_ring+0x3cb/0x9f0 kasan_check_range+0x140/0x190 io_msg_ring+0x3cb/0x9f0 ? io_msg_ring_prep+0x300/0x300 io_issue_sqe+0x698/0xca0 io_submit_sqes+0x92f/0x1c30 __do_sys_io_uring_enter+0xae4/0x24b0 .... RIP: 0033:0x7f2eaf8f8289 RSP: 002b:00007fff40939718 EFLAGS: 00000246 ORIG_RAX: 00000000000001aa RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f2eaf8f8289 RDX: 0000000000000000 RSI: 0000000000006f71 RDI: 0000000000000004 RBP: 00007fff409397a0 R08: 0000000000000000 R09: 0000000000000039 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000004006d0 R13: 00007fff40939880 R14: 0000000000000000 R15: 0000000000000000 Kernel panic - not syncing: panic_on_warn set ... We don't have a NULL check on file_ptr in io_msg_send_fd() function, so when file_ptr is NUL src_file is also NULL and get_file() dereferences a NULL pointer and leads to above crash. Add a NULL check to fix this issue. Fixes: e6130eba8a84 ("io_uring: add support for passing fixed file descriptors") Reported-by: syzkaller Signed-off-by: Harshit Mogalapalli Link: https://lore.kernel.org/r/20221019171218.1337614-1-harshit.m.mogalapalli@oracle.com Signed-off-by: Jens Axboe --- io_uring/msg_ring.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 4a7e5d030c782..90d2fc6fd80e4 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -95,6 +95,9 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) msg->src_fd = array_index_nospec(msg->src_fd, ctx->nr_user_files); file_ptr = io_fixed_file_slot(&ctx->file_table, msg->src_fd)->file_ptr; + if (!file_ptr) + goto out_unlock; + src_file = (struct file *) (file_ptr & FFS_MASK); get_file(src_file); From 996d3efeb091c503afd3ee6b5e20eabf446fd955 Mon Sep 17 00:00:00 2001 From: Rafael Mendonca Date: Wed, 19 Oct 2022 22:47:09 -0300 Subject: [PATCH 7/7] io-wq: Fix memory leak in worker creation If the CPU mask allocation for a node fails, then the memory allocated for the 'io_wqe' struct of the current node doesn't get freed on the error handling path, since it has not yet been added to the 'wqes' array. This was spotted when fuzzing v6.1-rc1 with Syzkaller: BUG: memory leak unreferenced object 0xffff8880093d5000 (size 1024): comm "syz-executor.2", pid 7701, jiffies 4295048595 (age 13.900s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000cb463369>] __kmem_cache_alloc_node+0x18e/0x720 [<00000000147a3f9c>] kmalloc_node_trace+0x2a/0x130 [<000000004e107011>] io_wq_create+0x7b9/0xdc0 [<00000000c38b2018>] io_uring_alloc_task_context+0x31e/0x59d [<00000000867399da>] __io_uring_add_tctx_node.cold+0x19/0x1ba [<000000007e0e7a79>] io_uring_setup.cold+0x1b80/0x1dce [<00000000b545e9f6>] __x64_sys_io_uring_setup+0x5d/0x80 [<000000008a8a7508>] do_syscall_64+0x5d/0x90 [<000000004ac08bec>] entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: 0e03496d1967 ("io-wq: use private CPU mask") Cc: stable@vger.kernel.org Signed-off-by: Rafael Mendonca Link: https://lore.kernel.org/r/20221020014710.902201-1-rafaelmendsr@gmail.com Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index c6536d4b2da0b..6f1d0e5df23ad 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -1164,10 +1164,10 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node); if (!wqe) goto err; + wq->wqes[node] = wqe; if (!alloc_cpumask_var(&wqe->cpu_mask, GFP_KERNEL)) goto err; cpumask_copy(wqe->cpu_mask, cpumask_of_node(node)); - wq->wqes[node] = wqe; wqe->node = alloc_node; wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =