From 42fb0a1e84ff525ebe560e2baf9451ab69127e2b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 20 Oct 2022 23:14:27 -0400 Subject: [PATCH 01/13] tracing/ring-buffer: Have polling block on watermark Currently the way polling works on the ring buffer is broken. It will return immediately if there's any data in the ring buffer whereas a read will block until the watermark (defined by the tracefs buffer_percent file) is hit. That is, a select() or poll() will return as if there's data available, but then the following read will block. This is broken for the way select()s and poll()s are supposed to work. Have the polling on the ring buffer also block the same way reads and splice does on the ring buffer. Link: https://lkml.kernel.org/r/20221020231427.41be3f26@gandalf.local.home Cc: Linux Trace Kernel Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Primiano Tucci Cc: stable@vger.kernel.org Fixes: 1e0d6714aceb7 ("ring-buffer: Do not wake up a splice waiter when page is not full") Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 2 +- kernel/trace/ring_buffer.c | 55 ++++++++++++++++++++++++------------- kernel/trace/trace.c | 2 +- 3 files changed, 38 insertions(+), 21 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 2504df9a0453e..3c7d295746f67 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -100,7 +100,7 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full); __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, - struct file *filp, poll_table *poll_table); + struct file *filp, poll_table *poll_table, int full); void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu); #define RING_BUFFER_ALL_CPUS -1 diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9712083832f41..089b1ec9cb3be 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -907,6 +907,21 @@ size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu) return cnt - read; } +static __always_inline bool full_hit(struct trace_buffer *buffer, int cpu, int full) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + size_t nr_pages; + size_t dirty; + + nr_pages = cpu_buffer->nr_pages; + if (!nr_pages || !full) + return true; + + dirty = ring_buffer_nr_dirty_pages(buffer, cpu); + + return (dirty * 100) > (full * nr_pages); +} + /* * rb_wake_up_waiters - wake up tasks waiting for ring buffer input * @@ -1046,22 +1061,20 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) !ring_buffer_empty_cpu(buffer, cpu)) { unsigned long flags; bool pagebusy; - size_t nr_pages; - size_t dirty; + bool done; if (!full) break; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; - nr_pages = cpu_buffer->nr_pages; - dirty = ring_buffer_nr_dirty_pages(buffer, cpu); + done = !pagebusy && full_hit(buffer, cpu, full); + if (!cpu_buffer->shortest_full || cpu_buffer->shortest_full > full) cpu_buffer->shortest_full = full; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (!pagebusy && - (!nr_pages || (dirty * 100) > full * nr_pages)) + if (done) break; } @@ -1087,6 +1100,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) * @cpu: the cpu buffer to wait on * @filp: the file descriptor * @poll_table: The poll descriptor + * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS * * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon * as data is added to any of the @buffer's cpu buffers. Otherwise @@ -1096,14 +1110,15 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) * zero otherwise. */ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, - struct file *filp, poll_table *poll_table) + struct file *filp, poll_table *poll_table, int full) { struct ring_buffer_per_cpu *cpu_buffer; struct rb_irq_work *work; - if (cpu == RING_BUFFER_ALL_CPUS) + if (cpu == RING_BUFFER_ALL_CPUS) { work = &buffer->irq_work; - else { + full = 0; + } else { if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -EINVAL; @@ -1111,8 +1126,14 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, work = &cpu_buffer->irq_work; } - poll_wait(filp, &work->waiters, poll_table); - work->waiters_pending = true; + if (full) { + poll_wait(filp, &work->full_waiters, poll_table); + work->full_waiters_pending = true; + } else { + poll_wait(filp, &work->waiters, poll_table); + work->waiters_pending = true; + } + /* * There's a tight race between setting the waiters_pending and * checking if the ring buffer is empty. Once the waiters_pending bit @@ -1128,6 +1149,9 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, */ smp_mb(); + if (full) + return full_hit(buffer, cpu, full) ? EPOLLIN | EPOLLRDNORM : 0; + if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) return EPOLLIN | EPOLLRDNORM; @@ -3155,10 +3179,6 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, static __always_inline void rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) { - size_t nr_pages; - size_t dirty; - size_t full; - if (buffer->irq_work.waiters_pending) { buffer->irq_work.waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ @@ -3182,10 +3202,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->last_pages_touch = local_read(&cpu_buffer->pages_touched); - full = cpu_buffer->shortest_full; - nr_pages = cpu_buffer->nr_pages; - dirty = ring_buffer_nr_dirty_pages(buffer, cpu_buffer->cpu); - if (full && nr_pages && (dirty * 100) <= full * nr_pages) + if (!full_hit(buffer, cpu_buffer->cpu, cpu_buffer->shortest_full)) return; cpu_buffer->irq_work.wakeup_full = true; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 47a44b055a1d4..c6c7a0af3ed2f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6681,7 +6681,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl return EPOLLIN | EPOLLRDNORM; else return ring_buffer_poll_wait(iter->array_buffer->buffer, iter->cpu_file, - filp, poll_table); + filp, poll_table, iter->tr->buffer_percent); } static __poll_t From 31029a8b2c7e656a0289194ef16415050ae4c4ac Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 21 Oct 2022 12:30:13 -0400 Subject: [PATCH 02/13] ring-buffer: Include dropped pages in counting dirty patches The function ring_buffer_nr_dirty_pages() was created to find out how many pages are filled in the ring buffer. There's two running counters. One is incremented whenever a new page is touched (pages_touched) and the other is whenever a page is read (pages_read). The dirty count is the number touched minus the number read. This is used to determine if a blocked task should be woken up if the percentage of the ring buffer it is waiting for is hit. The problem is that it does not take into account dropped pages (when the new writes overwrite pages that were not read). And then the dirty pages will always be greater than the percentage. This makes the "buffer_percent" file inaccurate, as the number of dirty pages end up always being larger than the percentage, event when it's not and this causes user space to be woken up more than it wants to be. Add a new counter to keep track of lost pages, and include that in the accounting of dirty pages so that it is actually accurate. Link: https://lkml.kernel.org/r/20221021123013.55fb6055@gandalf.local.home Fixes: 2c2b0a78b3739 ("ring-buffer: Add percentage of ring buffer full to wake up reader") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 089b1ec9cb3be..a19369c4d8df3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -519,6 +519,7 @@ struct ring_buffer_per_cpu { local_t committing; local_t commits; local_t pages_touched; + local_t pages_lost; local_t pages_read; long last_pages_touch; size_t shortest_full; @@ -894,10 +895,18 @@ size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu) size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu) { size_t read; + size_t lost; size_t cnt; read = local_read(&buffer->buffers[cpu]->pages_read); + lost = local_read(&buffer->buffers[cpu]->pages_lost); cnt = local_read(&buffer->buffers[cpu]->pages_touched); + + if (WARN_ON_ONCE(cnt < lost)) + return 0; + + cnt -= lost; + /* The reader can read an empty page, but not more than that */ if (cnt < read) { WARN_ON_ONCE(read > cnt + 1); @@ -2031,6 +2040,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) */ local_add(page_entries, &cpu_buffer->overrun); local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); + local_inc(&cpu_buffer->pages_lost); } /* @@ -2515,6 +2525,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, */ local_add(entries, &cpu_buffer->overrun); local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); + local_inc(&cpu_buffer->pages_lost); /* * The entries will be zeroed out when we move the @@ -5265,6 +5276,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->committing, 0); local_set(&cpu_buffer->commits, 0); local_set(&cpu_buffer->pages_touched, 0); + local_set(&cpu_buffer->pages_lost, 0); local_set(&cpu_buffer->pages_read, 0); cpu_buffer->last_pages_touch = 0; cpu_buffer->shortest_full = 0; From 649e72070cbbb8600eb823833e4748f5a0815116 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Mon, 7 Nov 2022 19:04:50 +0800 Subject: [PATCH 03/13] tracing: Fix memory leak in tracing_read_pipe() kmemleak reports this issue: unreferenced object 0xffff888105a18900 (size 128): comm "test_progs", pid 18933, jiffies 4336275356 (age 22801.766s) hex dump (first 32 bytes): 25 73 00 90 81 88 ff ff 26 05 00 00 42 01 58 04 %s......&...B.X. 03 00 00 00 02 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000560143a1>] __kmalloc_node_track_caller+0x4a/0x140 [<000000006af00822>] krealloc+0x8d/0xf0 [<00000000c309be6a>] trace_iter_expand_format+0x99/0x150 [<000000005a53bdb6>] trace_check_vprintf+0x1e0/0x11d0 [<0000000065629d9d>] trace_event_printf+0xb6/0xf0 [<000000009a690dc7>] trace_raw_output_bpf_trace_printk+0x89/0xc0 [<00000000d22db172>] print_trace_line+0x73c/0x1480 [<00000000cdba76ba>] tracing_read_pipe+0x45c/0x9f0 [<0000000015b58459>] vfs_read+0x17b/0x7c0 [<000000004aeee8ed>] ksys_read+0xed/0x1c0 [<0000000063d3d898>] do_syscall_64+0x3b/0x90 [<00000000a06dda7f>] entry_SYSCALL_64_after_hwframe+0x63/0xcd iter->fmt alloced in tracing_read_pipe() -> .. ->trace_iter_expand_format(), but not freed, to fix, add free in tracing_release_pipe() Link: https://lkml.kernel.org/r/1667819090-4643-1-git-send-email-wangyufen@huawei.com Cc: stable@vger.kernel.org Fixes: efbbdaa22bb7 ("tracing: Show real address for trace event arguments") Acked-by: Masami Hiramatsu (Google) Signed-off-by: Wang Yufen Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c6c7a0af3ed2f..5bd202d6d79a1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6657,6 +6657,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) mutex_unlock(&trace_types_lock); free_cpumask_var(iter->started); + kfree(iter->fmt); mutex_destroy(&iter->mutex); kfree(iter); From bedf06833b1f63c2627bd5634602e05592129d7a Mon Sep 17 00:00:00 2001 From: Aashish Sharma Date: Mon, 7 Nov 2022 21:35:56 +0530 Subject: [PATCH 04/13] tracing: Fix warning on variable 'struct trace_array' Move the declaration of 'struct trace_array' out of #ifdef CONFIG_TRACING block, to fix the following warning when CONFIG_TRACING is not set: >> include/linux/trace.h:63:45: warning: 'struct trace_array' declared inside parameter list will not be visible outside of this definition or declaration Link: https://lkml.kernel.org/r/20221107160556.2139463-1-shraash@google.com Fixes: 1a77dd1c2bb5 ("scsi: tracing: Fix compile error in trace_array calls when TRACING is disabled") Cc: "Martin K. Petersen" Cc: Arun Easi Acked-by: Masami Hiramatsu (Google) Reviewed-by: Guenter Roeck Signed-off-by: Aashish Sharma Signed-off-by: Steven Rostedt (Google) --- include/linux/trace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/trace.h b/include/linux/trace.h index b5e16e438448f..80ffda8717491 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -26,13 +26,13 @@ struct trace_export { int flags; }; +struct trace_array; + #ifdef CONFIG_TRACING int register_ftrace_export(struct trace_export *export); int unregister_ftrace_export(struct trace_export *export); -struct trace_array; - void trace_printk_init_buffers(void); __printf(3, 4) int trace_array_printk(struct trace_array *tr, unsigned long ip, From 08948caebe93482db1adfd2154eba124f66d161d Mon Sep 17 00:00:00 2001 From: Wang Wensheng Date: Wed, 9 Nov 2022 09:44:32 +0000 Subject: [PATCH 05/13] ftrace: Fix the possible incorrect kernel message If the number of mcount entries is an integer multiple of ENTRIES_PER_PAGE, the page count showing on the console would be wrong. Link: https://lkml.kernel.org/r/20221109094434.84046-2-wangwensheng4@huawei.com Cc: Cc: Cc: stable@vger.kernel.org Fixes: 5821e1b74f0d0 ("function tracing: fix wrong pos computing when read buffer has been fulfilled") Signed-off-by: Wang Wensheng Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7dc023641bf10..8b13ce2eae705 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7391,7 +7391,7 @@ void __init ftrace_init(void) } pr_info("ftrace: allocating %ld entries in %ld pages\n", - count, count / ENTRIES_PER_PAGE + 1); + count, DIV_ROUND_UP(count, ENTRIES_PER_PAGE)); ret = ftrace_process_locs(NULL, __start_mcount_loc, From bcea02b096333dc74af987cb9685a4dbdd820840 Mon Sep 17 00:00:00 2001 From: Wang Wensheng Date: Wed, 9 Nov 2022 09:44:33 +0000 Subject: [PATCH 06/13] ftrace: Optimize the allocation for mcount entries If we can't allocate this size, try something smaller with half of the size. Its order should be decreased by one instead of divided by two. Link: https://lkml.kernel.org/r/20221109094434.84046-3-wangwensheng4@huawei.com Cc: Cc: Cc: stable@vger.kernel.org Fixes: a79008755497d ("ftrace: Allocate the mcount record pages as groups") Signed-off-by: Wang Wensheng Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8b13ce2eae705..56a168121bfc0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3190,7 +3190,7 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count) /* if we can't allocate this size, try something smaller */ if (!order) return -ENOMEM; - order >>= 1; + order--; goto again; } From 56f4ca0a79a9f1af98f26c54b9b89ba1f9bcc6bd Mon Sep 17 00:00:00 2001 From: Daniil Tatianin Date: Mon, 14 Nov 2022 17:31:29 +0300 Subject: [PATCH 07/13] ring_buffer: Do not deactivate non-existant pages rb_head_page_deactivate() expects cpu_buffer to contain a valid list of ->pages, so verify that the list is actually present before calling it. Found by Linux Verification Center (linuxtesting.org) with the SVACE static analysis tool. Link: https://lkml.kernel.org/r/20221114143129.3534443-1-d-tatianin@yandex-team.ru Cc: stable@vger.kernel.org Fixes: 77ae365eca895 ("ring-buffer: make lockless") Signed-off-by: Daniil Tatianin Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a19369c4d8df3..b21bf14bae9bd 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1802,9 +1802,9 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) free_buffer_page(cpu_buffer->reader_page); - rb_head_page_deactivate(cpu_buffer); - if (head) { + rb_head_page_deactivate(cpu_buffer); + list_for_each_entry_safe(bpage, tmp, head, list) { list_del_init(&bpage->list); free_buffer_page(bpage); From 19ba6c8af9382c4c05dc6a0a79af3013b9a35cd0 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Wed, 16 Nov 2022 09:52:07 +0800 Subject: [PATCH 08/13] ftrace: Fix null pointer dereference in ftrace_add_mod() The @ftrace_mod is allocated by kzalloc(), so both the members {prev,next} of @ftrace_mode->list are NULL, it's not a valid state to call list_del(). If kstrdup() for @ftrace_mod->{func|module} fails, it goes to @out_free tag and calls free_ftrace_mod() to destroy @ftrace_mod, then list_del() will write prev->next and next->prev, where null pointer dereference happens. BUG: kernel NULL pointer dereference, address: 0000000000000008 Oops: 0002 [#1] PREEMPT SMP NOPTI Call Trace: ftrace_mod_callback+0x20d/0x220 ? do_filp_open+0xd9/0x140 ftrace_process_regex.isra.51+0xbf/0x130 ftrace_regex_write.isra.52.part.53+0x6e/0x90 vfs_write+0xee/0x3a0 ? __audit_filter_op+0xb1/0x100 ? auditd_test_task+0x38/0x50 ksys_write+0xa5/0xe0 do_syscall_64+0x3a/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd Kernel panic - not syncing: Fatal exception So call INIT_LIST_HEAD() to initialize the list member to fix this issue. Link: https://lkml.kernel.org/r/20221116015207.30858-1-xiujianfeng@huawei.com Cc: stable@vger.kernel.org Fixes: 673feb9d76ab ("ftrace: Add :mod: caching infrastructure to trace_array") Signed-off-by: Xiu Jianfeng Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 56a168121bfc0..33236241f2364 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1289,6 +1289,7 @@ static int ftrace_add_mod(struct trace_array *tr, if (!ftrace_mod) return -ENOMEM; + INIT_LIST_HEAD(&ftrace_mod->list); ftrace_mod->func = kstrdup(func, GFP_KERNEL); ftrace_mod->module = kstrdup(module, GFP_KERNEL); ftrace_mod->enable = enable; From a4527fef9afe5c903c718d0cd24609fe9c754250 Mon Sep 17 00:00:00 2001 From: Shang XiaoJing Date: Thu, 17 Nov 2022 09:23:45 +0800 Subject: [PATCH 09/13] tracing: Fix memory leak in test_gen_synth_cmd() and test_empty_synth_event() test_gen_synth_cmd() only free buf in fail path, hence buf will leak when there is no failure. Add kfree(buf) to prevent the memleak. The same reason and solution in test_empty_synth_event(). unreferenced object 0xffff8881127de000 (size 2048): comm "modprobe", pid 247, jiffies 4294972316 (age 78.756s) hex dump (first 32 bytes): 20 67 65 6e 5f 73 79 6e 74 68 5f 74 65 73 74 20 gen_synth_test 20 70 69 64 5f 74 20 6e 65 78 74 5f 70 69 64 5f pid_t next_pid_ backtrace: [<000000004254801a>] kmalloc_trace+0x26/0x100 [<0000000039eb1cf5>] 0xffffffffa00083cd [<000000000e8c3bc8>] 0xffffffffa00086ba [<00000000c293d1ea>] do_one_initcall+0xdb/0x480 [<00000000aa189e6d>] do_init_module+0x1cf/0x680 [<00000000d513222b>] load_module+0x6a50/0x70a0 [<000000001fd4d529>] __do_sys_finit_module+0x12f/0x1c0 [<00000000b36c4c0f>] do_syscall_64+0x3f/0x90 [<00000000bbf20cf3>] entry_SYSCALL_64_after_hwframe+0x63/0xcd unreferenced object 0xffff8881127df000 (size 2048): comm "modprobe", pid 247, jiffies 4294972324 (age 78.728s) hex dump (first 32 bytes): 20 65 6d 70 74 79 5f 73 79 6e 74 68 5f 74 65 73 empty_synth_tes 74 20 20 70 69 64 5f 74 20 6e 65 78 74 5f 70 69 t pid_t next_pi backtrace: [<000000004254801a>] kmalloc_trace+0x26/0x100 [<00000000d4db9a3d>] 0xffffffffa0008071 [<00000000c31354a5>] 0xffffffffa00086ce [<00000000c293d1ea>] do_one_initcall+0xdb/0x480 [<00000000aa189e6d>] do_init_module+0x1cf/0x680 [<00000000d513222b>] load_module+0x6a50/0x70a0 [<000000001fd4d529>] __do_sys_finit_module+0x12f/0x1c0 [<00000000b36c4c0f>] do_syscall_64+0x3f/0x90 [<00000000bbf20cf3>] entry_SYSCALL_64_after_hwframe+0x63/0xcd Link: https://lkml.kernel.org/r/20221117012346.22647-2-shangxiaojing@huawei.com Cc: Cc: Cc: Cc: stable@vger.kernel.org Fixes: 9fe41efaca08 ("tracing: Add synth event generation test module") Signed-off-by: Shang XiaoJing Signed-off-by: Steven Rostedt (Google) --- kernel/trace/synth_event_gen_test.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index 0b15e975d2c2c..8d77526892f45 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -120,15 +120,13 @@ static int __init test_gen_synth_cmd(void) /* Now generate a gen_synth_test event */ ret = synth_event_trace_array(gen_synth_test, vals, ARRAY_SIZE(vals)); - out: + free: + kfree(buf); return ret; delete: /* We got an error after creating the event, delete it */ synth_event_delete("gen_synth_test"); - free: - kfree(buf); - - goto out; + goto free; } /* @@ -227,15 +225,13 @@ static int __init test_empty_synth_event(void) /* Now trace an empty_synth_test event */ ret = synth_event_trace_array(empty_synth_test, vals, ARRAY_SIZE(vals)); - out: + free: + kfree(buf); return ret; delete: /* We got an error after creating the event, delete it */ synth_event_delete("empty_synth_test"); - free: - kfree(buf); - - goto out; + goto free; } static struct synth_field_desc create_synth_test_fields[] = { From 1b5f1c34d3f5a664a57a5a7557a50e4e3cc2505c Mon Sep 17 00:00:00 2001 From: Shang XiaoJing Date: Thu, 17 Nov 2022 09:23:46 +0800 Subject: [PATCH 10/13] tracing: Fix wild-memory-access in register_synth_event() In register_synth_event(), if set_synth_event_print_fmt() failed, then both trace_remove_event_call() and unregister_trace_event() will be called, which means the trace_event_call will call __unregister_trace_event() twice. As the result, the second unregister will causes the wild-memory-access. register_synth_event set_synth_event_print_fmt failed trace_remove_event_call event_remove if call->event.funcs then __unregister_trace_event (first call) unregister_trace_event __unregister_trace_event (second call) Fix the bug by avoiding to call the second __unregister_trace_event() by checking if the first one is called. general protection fault, probably for non-canonical address 0xfbd59c0000000024: 0000 [#1] SMP KASAN PTI KASAN: maybe wild-memory-access in range [0xdead000000000120-0xdead000000000127] CPU: 0 PID: 3807 Comm: modprobe Not tainted 6.1.0-rc1-00186-g76f33a7eedb4 #299 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 RIP: 0010:unregister_trace_event+0x6e/0x280 Code: 00 fc ff df 4c 89 ea 48 c1 ea 03 80 3c 02 00 0f 85 0e 02 00 00 48 b8 00 00 00 00 00 fc ff df 4c 8b 63 08 4c 89 e2 48 c1 ea 03 <80> 3c 02 00 0f 85 e2 01 00 00 49 89 2c 24 48 85 ed 74 28 e8 7a 9b RSP: 0018:ffff88810413f370 EFLAGS: 00010a06 RAX: dffffc0000000000 RBX: ffff888105d050b0 RCX: 0000000000000000 RDX: 1bd5a00000000024 RSI: ffff888119e276e0 RDI: ffffffff835a8b20 RBP: dead000000000100 R08: 0000000000000000 R09: fffffbfff0913481 R10: ffffffff8489a407 R11: fffffbfff0913480 R12: dead000000000122 R13: ffff888105d050b8 R14: 0000000000000000 R15: ffff888105d05028 FS: 00007f7823e8d540(0000) GS:ffff888119e00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f7823e7ebec CR3: 000000010a058002 CR4: 0000000000330ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __create_synth_event+0x1e37/0x1eb0 create_or_delete_synth_event+0x110/0x250 synth_event_run_command+0x2f/0x110 test_gen_synth_cmd+0x170/0x2eb [synth_event_gen_test] synth_event_gen_test_init+0x76/0x9bc [synth_event_gen_test] do_one_initcall+0xdb/0x480 do_init_module+0x1cf/0x680 load_module+0x6a50/0x70a0 __do_sys_finit_module+0x12f/0x1c0 do_syscall_64+0x3f/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd Link: https://lkml.kernel.org/r/20221117012346.22647-3-shangxiaojing@huawei.com Fixes: 4b147936fa50 ("tracing: Add support for 'synthetic' events") Signed-off-by: Shang XiaoJing Cc: stable@vger.kernel.org Cc: Cc: Cc: Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_synth.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index e310052dc83ce..29fbfb27c2b2c 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -828,10 +828,9 @@ static int register_synth_event(struct synth_event *event) } ret = set_synth_event_print_fmt(call); - if (ret < 0) { + /* unregister_trace_event() will be called inside */ + if (ret < 0) trace_remove_event_call(call); - goto err; - } out: return ret; err: From b8752064e30697e3982418f4274cc63cfc6f3027 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Fri, 18 Nov 2022 00:44:35 +0800 Subject: [PATCH 11/13] tracing: Remove unused __bad_type_size() method __bad_type_size() is unused after commit 04ae87a52074("ftrace: Rework event_create_dir()"). So, remove it. Link: https://lkml.kernel.org/r/D062EC2E-7DB7-4402-A67E-33C3577F551E@gmail.com Acked-by: Masami Hiramatsu (Google) Signed-off-by: Qiujun Huang Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_syscalls.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index b69e207012c99..942ddbdace4a4 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -201,8 +201,6 @@ print_syscall_exit(struct trace_iterator *iter, int flags, return trace_handle_return(s); } -extern char *__bad_type_size(void); - #define SYSCALL_FIELD(_type, _name) { \ .type = #_type, .name = #_name, \ .size = sizeof(_type), .align = __alignof__(_type), \ From 067df9e0ad48a97382ab2179bbe773a13a846028 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Mon, 14 Nov 2022 18:46:32 +0800 Subject: [PATCH 12/13] tracing: Fix potential null-pointer-access of entry in list 'tr->err_log' Entries in list 'tr->err_log' will be reused after entry number exceed TRACING_LOG_ERRS_MAX. The cmd string of the to be reused entry will be freed first then allocated a new one. If the allocation failed, then the entry will still be in list 'tr->err_log' but its 'cmd' field is set to be NULL, later access of 'cmd' is risky. Currently above problem can cause the loss of 'cmd' information of first entry in 'tr->err_log'. When execute `cat /sys/kernel/tracing/error_log`, reproduce logs like: [ 37.495100] trace_kprobe: error: Maxactive is not for kprobe(null) ^ [ 38.412517] trace_kprobe: error: Maxactive is not for kprobe Command: p4:myprobe2 do_sys_openat2 ^ Link: https://lore.kernel.org/linux-trace-kernel/20221114104632.3547266-1-zhengyejian1@huawei.com Fixes: 1581a884b7ca ("tracing: Remove size restriction on tracing_log_err cmd strings") Signed-off-by: Zheng Yejian Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5bd202d6d79a1..a7fe0e115272e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7803,6 +7803,7 @@ static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr, int len) { struct tracing_log_err *err; + char *cmd; if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) { err = alloc_tracing_log_err(len); @@ -7811,12 +7812,12 @@ static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr, return err; } - + cmd = kzalloc(len, GFP_KERNEL); + if (!cmd) + return ERR_PTR(-ENOMEM); err = list_first_entry(&tr->err_log, struct tracing_log_err, list); kfree(err->cmd); - err->cmd = kzalloc(len, GFP_KERNEL); - if (!err->cmd) - return ERR_PTR(-ENOMEM); + err->cmd = cmd; list_del(&err->list); return err; From 94eedf3dded5fb472ce97bfaf3ac1c6c29c35d26 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 17 Nov 2022 21:42:49 -0500 Subject: [PATCH 13/13] tracing: Fix race where eprobes can be called before the event The flag that tells the event to call its triggers after reading the event is set for eprobes after the eprobe is enabled. This leads to a race where the eprobe may be triggered at the beginning of the event where the record information is NULL. The eprobe then dereferences the NULL record causing a NULL kernel pointer bug. Test for a NULL record to keep this from happening. Link: https://lore.kernel.org/linux-trace-kernel/20221116192552.1066630-1-rafaelmendsr@gmail.com/ Link: https://lore.kernel.org/linux-trace-kernel/20221117214249.2addbe10@gandalf.local.home Cc: Linux Trace Kernel Cc: Tzvetomir Stoyanov Cc: Tom Zanussi Cc: stable@vger.kernel.org Fixes: 7491e2c442781 ("tracing: Add a probe that attaches to trace events") Acked-by: Masami Hiramatsu (Google) Reported-by: Rafael Mendonca Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_eprobe.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 5dd0617e5df6d..9cda9a38422cc 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -563,6 +563,9 @@ static void eprobe_trigger_func(struct event_trigger_data *data, { struct eprobe_data *edata = data->private_data; + if (unlikely(!rec)) + return; + __eprobe_trace_func(edata, rec); }