From c73f0b69648501978e8b3e8fa7eef7f4197d0481 Mon Sep 17 00:00:00 2001 From: Feng Yang Date: Sun, 23 Feb 2025 15:01:06 +0800 Subject: [PATCH 01/17] ring-buffer: Fix bytes_dropped calculation issue The calculation of bytes_dropped and bytes_dropped_nested is reversed. Although it does not affect the final calculation of total_dropped, it should still be modified. Link: https://lore.kernel.org/20250223070106.6781-1-yangfeng59949@163.com Fixes: 6c43e554a2a5 ("ring-buffer: Add ring buffer startup selftest") Signed-off-by: Feng Yang Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bb6089c2951e..510409f97992 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -7411,9 +7411,9 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested) /* Ignore dropped events before test starts. */ if (started) { if (nested) - data->bytes_dropped += len; - else data->bytes_dropped_nested += len; + else + data->bytes_dropped += len; } return len; } From bcba8d4dbe6880ce9883409df486de35d3946704 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Mar 2025 11:45:40 -0500 Subject: [PATCH 02/17] ring-buffer: Use kaslr address instead of text delta Instead of saving off the text and data pointers and using them to compare with the current boot's text and data pointers, just save off the KASLR offset. Then that can be used to figure out how to read the previous boot's buffer. The last_boot_info will now show this offset, but only if it is for a previous boot: ~# cat instances/boot_mapped/last_boot_info 39000000 [kernel] ~# echo function > instances/boot_mapped/current_tracer ~# cat instances/boot_mapped/last_boot_info # Current If the KASLR offset saved is for the current boot, the last_boot_info will show the value of "current".
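As a rough sketch of how the saved offset is meant to be used (this mirrors the arithmetic in the trace.c hunk below, ignoring the CONFIG_RANDOMIZE_BASE guard the real code uses; the variable names here are only illustrative):

	/* delta between this boot's KASLR offset and the offset saved by the last boot */
	long text_delta = kaslr_offset() - saved_kaslr_addr;

	/* a text address recorded by the previous boot is then adjusted by the delta */
	unsigned long addr = old_addr + text_delta;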
Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250305164608.274956504@goodmis.org Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 3 +-- kernel/trace/ring_buffer.c | 31 ++++++++++++------------------- kernel/trace/trace.c | 30 +++++++++++++++++++++--------- kernel/trace/trace.h | 9 +++++---- 4 files changed, 39 insertions(+), 34 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 17fbb7855295..8de035f4f0d9 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -94,8 +94,7 @@ struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flag unsigned long range_size, struct lock_class_key *key); -bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text, - long *data); +bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, unsigned long *kaslr_addr); /* * Because the ring buffer is generic, if other users of the ring buffer get diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 510409f97992..9aa2e818eced 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -31,6 +31,7 @@ #include #include +#include #include "trace.h" @@ -49,8 +50,7 @@ static void update_pages_handler(struct work_struct *work); struct ring_buffer_meta { int magic; int struct_size; - unsigned long text_addr; - unsigned long data_addr; + unsigned long kaslr_addr; unsigned long first_buffer; unsigned long head_buffer; unsigned long commit_buffer; @@ -550,8 +550,7 @@ struct trace_buffer { unsigned long range_addr_start; unsigned long range_addr_end; - long last_text_delta; - long last_data_delta; + unsigned long kaslr_addr; unsigned int subbuf_size; unsigned int subbuf_order; @@ -1891,16 +1890,13 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) } } -/* Used to calculate data delta */ -static char rb_data_ptr[] = ""; - -#define THIS_TEXT_PTR ((unsigned long)rb_meta_init_text_addr) -#define THIS_DATA_PTR ((unsigned long)rb_data_ptr) - static void rb_meta_init_text_addr(struct ring_buffer_meta *meta) { - meta->text_addr = THIS_TEXT_PTR; - meta->data_addr = THIS_DATA_PTR; +#ifdef CONFIG_RANDOMIZE_BASE + meta->kaslr_addr = kaslr_offset(); +#else + meta->kaslr_addr = 0; +#endif } static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) @@ -1928,8 +1924,7 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) meta->first_buffer += delta; meta->head_buffer += delta; meta->commit_buffer += delta; - buffer->last_text_delta = THIS_TEXT_PTR - meta->text_addr; - buffer->last_data_delta = THIS_DATA_PTR - meta->data_addr; + buffer->kaslr_addr = meta->kaslr_addr; continue; } @@ -2482,17 +2477,15 @@ struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flag * * Returns: The true if the delta is non zero */ -bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text, - long *data) +bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, unsigned long *kaslr_addr) { if (!buffer) return false; - if (!buffer->last_text_delta) + if (!buffer->kaslr_addr) return false; - *text = buffer->last_text_delta; - *data = buffer->last_data_delta; + *kaslr_addr = buffer->kaslr_addr; return true; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0e6d517e74e0..934658cda570 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -50,7 +50,7 @@ #include #include -#include /* 
COMMAND_LINE_SIZE */ +#include /* COMMAND_LINE_SIZE and kaslr_offset() */ #include "trace.h" #include "trace_output.h" @@ -4193,7 +4193,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) * safe to use if the array has delta offsets * Force printing via the fields. */ - if ((tr->text_delta || tr->data_delta) && + if ((tr->text_delta) && event->type > __TRACE_LAST_TYPE) return print_event_fields(iter, event); @@ -5990,7 +5990,7 @@ ssize_t tracing_resize_ring_buffer(struct trace_array *tr, static void update_last_data(struct trace_array *tr) { - if (!tr->text_delta && !tr->data_delta) + if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) return; /* @@ -6003,7 +6003,8 @@ static void update_last_data(struct trace_array *tr) /* Using current data now */ tr->text_delta = 0; - tr->data_delta = 0; + + tr->flags &= ~TRACE_ARRAY_FL_LAST_BOOT; } /** @@ -6821,8 +6822,17 @@ tracing_last_boot_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t seq_buf_init(&seq, buf, 64); - seq_buf_printf(&seq, "text delta:\t%ld\n", tr->text_delta); - seq_buf_printf(&seq, "data delta:\t%ld\n", tr->data_delta); + /* + * Do not leak KASLR address. This only shows the KASLR address of + * the last boot. When the ring buffer is started, the LAST_BOOT + * flag gets cleared, and this should only report "current". + * Otherwise it shows the KASLR address from the previous boot which + * should not be the same as the current boot. + */ + if (tr->flags & TRACE_ARRAY_FL_LAST_BOOT) + seq_buf_printf(&seq, "%lx\t[kernel]\n", tr->kaslr_addr); + else + seq_buf_puts(&seq, "# Current\n"); return simple_read_from_buffer(ubuf, cnt, ppos, buf, seq_buf_used(&seq)); } @@ -9210,8 +9220,10 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size tr->range_addr_start, tr->range_addr_size); - ring_buffer_last_boot_delta(buf->buffer, - &tr->text_delta, &tr->data_delta); +#ifdef CONFIG_RANDOMIZE_BASE + if (ring_buffer_last_boot_delta(buf->buffer, &tr->kaslr_addr)) + tr->text_delta = kaslr_offset() - tr->kaslr_addr; +#endif /* * This is basically the same as a mapped buffer, * with the same restrictions. @@ -10459,7 +10471,7 @@ __init static void enable_instances(void) * to it. */ if (start) { - tr->flags |= TRACE_ARRAY_FL_BOOT; + tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT; tr->ref++; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9c21ba45b7af..abe8169c3e87 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -348,8 +348,8 @@ struct trace_array { unsigned int mapped; unsigned long range_addr_start; unsigned long range_addr_size; + unsigned long kaslr_addr; long text_delta; - long data_delta; struct trace_pid_list __rcu *filtered_pids; struct trace_pid_list __rcu *filtered_no_pids; @@ -433,9 +433,10 @@ struct trace_array { }; enum { - TRACE_ARRAY_FL_GLOBAL = BIT(0), - TRACE_ARRAY_FL_BOOT = BIT(1), - TRACE_ARRAY_FL_MOD_INIT = BIT(2), + TRACE_ARRAY_FL_GLOBAL = BIT(0), + TRACE_ARRAY_FL_BOOT = BIT(1), + TRACE_ARRAY_FL_LAST_BOOT = BIT(2), + TRACE_ARRAY_FL_MOD_INIT = BIT(3), }; #ifdef CONFIG_MODULES From 4009cc31e7813ed66a04237ddff76706ff57a771 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Mar 2025 11:45:41 -0500 Subject: [PATCH 03/17] ring-buffer: Add buffer meta data for persistent ring buffer Instead of just having a meta data at the first page of each sub buffer that has duplicate data, add a new meta page to the entire block of memory that holds the duplicate data and remove it from the sub buffer meta data. 
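Roughly, the persistent region is then laid out as follows (a sketch derived from the diff below, not text from the original patch):

	/*
	 * range_addr_start:
	 *   [ struct ring_buffer_meta ]  - one copy for the whole region
	 *   [ CPU 0: struct ring_buffer_cpu_meta + subbuf index array + sub-buffers ]
	 *   [ CPU 1: struct ring_buffer_cpu_meta + subbuf index array + sub-buffers ]
	 *   ...
	 */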
This will open up the extra memory in this first page to be used by the tracer for its own persistent data. Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250305164608.446351513@goodmis.org Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 171 ++++++++++++++++++++++++++----------- 1 file changed, 119 insertions(+), 52 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9aa2e818eced..0304e431604d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -49,7 +49,12 @@ static void update_pages_handler(struct work_struct *work); struct ring_buffer_meta { int magic; - int struct_size; + int struct_sizes; + unsigned long total_size; + unsigned long buffers_offset; +}; + +struct ring_buffer_cpu_meta { unsigned long kaslr_addr; unsigned long first_buffer; unsigned long head_buffer; @@ -517,7 +522,7 @@ struct ring_buffer_per_cpu { struct mutex mapping_lock; unsigned long *subbuf_ids; /* ID to subbuf VA */ struct trace_buffer_meta *meta_page; - struct ring_buffer_meta *ring_meta; + struct ring_buffer_cpu_meta *ring_meta; /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; @@ -550,6 +555,8 @@ struct trace_buffer { unsigned long range_addr_start; unsigned long range_addr_end; + struct ring_buffer_meta *meta; + unsigned long kaslr_addr; unsigned int subbuf_size; @@ -1270,7 +1277,7 @@ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer) rb_set_list_to_head(head->list.prev); if (cpu_buffer->ring_meta) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; meta->head_buffer = (unsigned long)head->page; } } @@ -1568,7 +1575,7 @@ static void rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) static unsigned long rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs) { - addr += sizeof(struct ring_buffer_meta) + + addr += sizeof(struct ring_buffer_cpu_meta) + sizeof(int) * nr_subbufs; return ALIGN(addr, subbuf_size); } @@ -1579,19 +1586,22 @@ rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs) static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu) { int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE; - unsigned long ptr = buffer->range_addr_start; - struct ring_buffer_meta *meta; + struct ring_buffer_cpu_meta *meta; + struct ring_buffer_meta *bmeta; + unsigned long ptr; int nr_subbufs; - if (!ptr) + bmeta = buffer->meta; + if (!bmeta) return NULL; + ptr = (unsigned long)bmeta + bmeta->buffers_offset; + meta = (struct ring_buffer_cpu_meta *)ptr; + /* When nr_pages passed in is zero, the first meta has already been initialized */ if (!nr_pages) { - meta = (struct ring_buffer_meta *)ptr; nr_subbufs = meta->nr_subbufs; } else { - meta = NULL; /* Include the reader page */ nr_subbufs = nr_pages + 1; } @@ -1623,7 +1633,7 @@ static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu) } /* Return the start of subbufs given the meta pointer */ -static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta) +static void *rb_subbufs_from_meta(struct ring_buffer_cpu_meta *meta) { int subbuf_size = meta->subbuf_size; unsigned long ptr; @@ -1639,7 +1649,7 @@ static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta) */ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx) { - struct ring_buffer_meta *meta; + struct 
ring_buffer_cpu_meta *meta; unsigned long ptr; int subbuf_size; @@ -1664,15 +1674,74 @@ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx) return (void *)ptr; } +/* + * See if the existing memory contains a valid meta section. + * if so, use that, otherwise initialize it. + */ +static bool rb_meta_init(struct trace_buffer *buffer) +{ + unsigned long ptr = buffer->range_addr_start; + struct ring_buffer_meta *bmeta; + unsigned long total_size; + int struct_sizes; + + bmeta = (struct ring_buffer_meta *)ptr; + buffer->meta = bmeta; + + total_size = buffer->range_addr_end - buffer->range_addr_start; + + struct_sizes = sizeof(struct ring_buffer_cpu_meta); + struct_sizes |= sizeof(*bmeta) << 16; + + /* The first buffer will start word size after the meta page */ + ptr += sizeof(*bmeta); + ptr = ALIGN(ptr, sizeof(long)); + + if (bmeta->magic != RING_BUFFER_META_MAGIC) { + pr_info("Ring buffer boot meta mismatch of magic\n"); + goto init; + } + + if (bmeta->struct_sizes != struct_sizes) { + pr_info("Ring buffer boot meta mismatch of struct size\n"); + goto init; + } + + if (bmeta->total_size != total_size) { + pr_info("Ring buffer boot meta mismatch of total size\n"); + goto init; + } + + if (bmeta->buffers_offset > bmeta->total_size) { + pr_info("Ring buffer boot meta mismatch of offset outside of total size\n"); + goto init; + } + + if (bmeta->buffers_offset != (void *)ptr - (void *)bmeta) { + pr_info("Ring buffer boot meta mismatch of first buffer offset\n"); + goto init; + } + + return true; + + init: + bmeta->magic = RING_BUFFER_META_MAGIC; + bmeta->struct_sizes = struct_sizes; + bmeta->total_size = total_size; + bmeta->buffers_offset = (void *)ptr - (void *)bmeta; + + return false; +} + /* * See if the existing memory contains valid ring buffer data. * As the previous kernel must be the same as this kernel, all * the calculations (size of buffers and number of buffers) * must be the same. 
*/ -static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, - struct trace_buffer *buffer, int nr_pages, - unsigned long *subbuf_mask) +static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu, + struct trace_buffer *buffer, int nr_pages, + unsigned long *subbuf_mask) { int subbuf_size = PAGE_SIZE; struct buffer_data_page *subbuf; @@ -1683,20 +1752,6 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, if (!subbuf_mask) return false; - /* Check the meta magic and meta struct size */ - if (meta->magic != RING_BUFFER_META_MAGIC || - meta->struct_size != sizeof(*meta)) { - pr_info("Ring buffer boot meta[%d] mismatch of magic or struct size\n", cpu); - return false; - } - - /* The subbuffer's size and number of subbuffers must match */ - if (meta->subbuf_size != subbuf_size || - meta->nr_subbufs != nr_pages + 1) { - pr_info("Ring buffer boot meta [%d] mismatch of subbuf_size/nr_pages\n", cpu); - return false; - } - buffers_start = meta->first_buffer; buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs); @@ -1742,7 +1797,7 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, return true; } -static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf); +static int rb_meta_subbuf_idx(struct ring_buffer_cpu_meta *meta, void *subbuf); static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu, unsigned long long *timestamp, u64 *delta_ptr) @@ -1809,7 +1864,7 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu) /* If the meta data has been validated, now validate the events */ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; struct buffer_page *head_page; unsigned long entry_bytes = 0; unsigned long entries = 0; @@ -1890,7 +1945,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) } } -static void rb_meta_init_text_addr(struct ring_buffer_meta *meta) +static void rb_meta_init_text_addr(struct ring_buffer_cpu_meta *meta) { #ifdef CONFIG_RANDOMIZE_BASE meta->kaslr_addr = kaslr_offset(); @@ -1901,10 +1956,12 @@ static void rb_meta_init_text_addr(struct ring_buffer_meta *meta) static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) { - struct ring_buffer_meta *meta; + struct ring_buffer_cpu_meta *meta; + struct ring_buffer_meta *bmeta; unsigned long *subbuf_mask; unsigned long delta; void *subbuf; + bool valid = false; int cpu; int i; @@ -1912,12 +1969,17 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) subbuf_mask = bitmap_alloc(nr_pages + 1, GFP_KERNEL); /* If subbuf_mask fails to allocate, then rb_meta_valid() will return false */ + if (rb_meta_init(buffer)) + valid = true; + + bmeta = buffer->meta; + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { void *next_meta; meta = rb_range_meta(buffer, nr_pages, cpu); - if (rb_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) { + if (valid && rb_cpu_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) { /* Make the mappings match the current address */ subbuf = rb_subbufs_from_meta(meta); delta = (unsigned long)subbuf - meta->first_buffer; @@ -1935,9 +1997,6 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) memset(meta, 0, next_meta - (void *)meta); - meta->magic = RING_BUFFER_META_MAGIC; - meta->struct_size = sizeof(*meta); - meta->nr_subbufs = nr_pages + 1; meta->subbuf_size = PAGE_SIZE; 
@@ -1966,7 +2025,7 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) static void *rbm_start(struct seq_file *m, loff_t *pos) { struct ring_buffer_per_cpu *cpu_buffer = m->private; - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; unsigned long val; if (!meta) @@ -1991,7 +2050,7 @@ static void *rbm_next(struct seq_file *m, void *v, loff_t *pos) static int rbm_show(struct seq_file *m, void *v) { struct ring_buffer_per_cpu *cpu_buffer = m->private; - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; unsigned long val = (unsigned long)v; if (val == 1) { @@ -2040,7 +2099,7 @@ int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, in static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; if (meta->head_buffer == (unsigned long)bpage->page) cpu_buffer->head_page = bpage; @@ -2055,7 +2114,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, long nr_pages, struct list_head *pages) { struct trace_buffer *buffer = cpu_buffer->buffer; - struct ring_buffer_meta *meta = NULL; + struct ring_buffer_cpu_meta *meta = NULL; struct buffer_page *bpage, *tmp; bool user_thread = current->mm != NULL; gfp_t mflags; @@ -2179,7 +2238,7 @@ static struct ring_buffer_per_cpu * rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_meta *meta; + struct ring_buffer_cpu_meta *meta; struct buffer_page *bpage; struct page *page; int ret; @@ -2350,10 +2409,18 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, /* If start/end are specified, then that overrides size */ if (start && end) { + unsigned long buffers_start; unsigned long ptr; int n; - size = end - start; + /* Make sure that start is word aligned */ + start = ALIGN(start, sizeof(long)); + + /* Subtract the buffer meta data and word aligned */ + buffers_start = start + sizeof(struct ring_buffer_cpu_meta); + buffers_start = ALIGN(buffers_start, sizeof(long)); + + size = end - buffers_start; size = size / nr_cpu_ids; /* @@ -2363,7 +2430,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, * needed, plus account for the integer array index that * will be appended to the meta data. 
*/ - nr_pages = (size - sizeof(struct ring_buffer_meta)) / + nr_pages = (size - sizeof(struct ring_buffer_cpu_meta)) / (subbuf_size + sizeof(int)); /* Need at least two pages plus the reader page */ if (nr_pages < 3) @@ -2371,8 +2438,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, again: /* Make sure that the size fits aligned */ - for (n = 0, ptr = start; n < nr_cpu_ids; n++) { - ptr += sizeof(struct ring_buffer_meta) + + for (n = 0, ptr = buffers_start; n < nr_cpu_ids; n++) { + ptr += sizeof(struct ring_buffer_cpu_meta) + sizeof(int) * nr_pages; ptr = ALIGN(ptr, subbuf_size); ptr += subbuf_size * nr_pages; @@ -3098,7 +3165,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) } /* Return the index into the sub-buffers for a given sub-buffer */ -static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf) +static int rb_meta_subbuf_idx(struct ring_buffer_cpu_meta *meta, void *subbuf) { void *subbuf_array; @@ -3110,7 +3177,7 @@ static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf) static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *next_page) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; unsigned long old_head = (unsigned long)next_page->page; unsigned long new_head; @@ -3127,7 +3194,7 @@ static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer, static void rb_update_meta_reader(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *reader) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; void *old_reader = cpu_buffer->reader_page->page; void *new_reader = reader->page; int id; @@ -3756,7 +3823,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) rb_page_write(cpu_buffer->commit_page)); rb_inc_page(&cpu_buffer->commit_page); if (cpu_buffer->ring_meta) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; meta->commit_buffer = (unsigned long)cpu_buffer->commit_page->page; } /* add barrier to keep gcc from optimizing too much */ @@ -6009,7 +6076,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) if (cpu_buffer->mapped) { rb_update_meta_page(cpu_buffer); if (cpu_buffer->ring_meta) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; meta->commit_buffer = meta->head_buffer; } } @@ -6043,7 +6110,7 @@ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; - struct ring_buffer_meta *meta; + struct ring_buffer_cpu_meta *meta; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; @@ -6081,7 +6148,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_meta *meta; + struct ring_buffer_cpu_meta *meta; int cpu; /* prevent another thread from changing buffer sizes */ From 4af0a9c518522892b36cb7ecedf0c6004dc0a581 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Mar 2025 11:45:42 -0500 Subject: [PATCH 04/17] ring-buffer: Add ring_buffer_meta_scratch() Now that there's one meta data at the start of the persistent memory used by the ring buffer, allow the caller to request some memory right after that data 
that it can use as its own persistent memory. Also fix some white space issues with ring_buffer_alloc(). Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250305164608.619631731@goodmis.org Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 6 ++++-- kernel/trace/ring_buffer.c | 35 +++++++++++++++++++++++++++++------ kernel/trace/trace.c | 2 +- 3 files changed, 34 insertions(+), 9 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 8de035f4f0d9..019b59a0bbc9 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -92,9 +92,11 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long range_size, + unsigned long scratch_size, struct lock_class_key *key); bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, unsigned long *kaslr_addr); +void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size); /* * Because the ring buffer is generic, if other users of the ring buffer get @@ -112,11 +114,11 @@ bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, unsigned long *kas * traced by ftrace, it can produce lockdep warnings. We need to keep each * ring buffer's lock class separate. */ -#define ring_buffer_alloc_range(size, flags, order, start, range_size) \ +#define ring_buffer_alloc_range(size, flags, order, start, range_size, s_size) \ ({ \ static struct lock_class_key __key; \ __ring_buffer_alloc_range((size), (flags), (order), (start), \ - (range_size), &__key); \ + (range_size), (s_size), &__key); \ }) typedef bool (*ring_buffer_cond_fn)(void *data); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 0304e431604d..c511b3283dc6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1678,7 +1678,7 @@ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx) * See if the existing memory contains a valid meta section. * if so, use that, otherwise initialize it. 
*/ -static bool rb_meta_init(struct trace_buffer *buffer) +static bool rb_meta_init(struct trace_buffer *buffer, int scratch_size) { unsigned long ptr = buffer->range_addr_start; struct ring_buffer_meta *bmeta; @@ -1696,6 +1696,7 @@ static bool rb_meta_init(struct trace_buffer *buffer) /* The first buffer will start word size after the meta page */ ptr += sizeof(*bmeta); ptr = ALIGN(ptr, sizeof(long)); + ptr += scratch_size; if (bmeta->magic != RING_BUFFER_META_MAGIC) { pr_info("Ring buffer boot meta mismatch of magic\n"); @@ -1730,6 +1731,9 @@ static bool rb_meta_init(struct trace_buffer *buffer) bmeta->total_size = total_size; bmeta->buffers_offset = (void *)ptr - (void *)bmeta; + /* Zero out the scatch pad */ + memset((void *)bmeta + sizeof(*bmeta), 0, bmeta->buffers_offset - sizeof(*bmeta)); + return false; } @@ -1954,7 +1958,7 @@ static void rb_meta_init_text_addr(struct ring_buffer_cpu_meta *meta) #endif } -static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) +static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages, int scratch_size) { struct ring_buffer_cpu_meta *meta; struct ring_buffer_meta *bmeta; @@ -1969,7 +1973,7 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) subbuf_mask = bitmap_alloc(nr_pages + 1, GFP_KERNEL); /* If subbuf_mask fails to allocate, then rb_meta_valid() will return false */ - if (rb_meta_init(buffer)) + if (rb_meta_init(buffer, scratch_size)) valid = true; bmeta = buffer->meta; @@ -2367,6 +2371,7 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long end, + unsigned long scratch_size, struct lock_class_key *key) { struct trace_buffer *buffer; @@ -2416,10 +2421,15 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, /* Make sure that start is word aligned */ start = ALIGN(start, sizeof(long)); + /* scratch_size needs to be aligned too */ + scratch_size = ALIGN(scratch_size, sizeof(long)); + /* Subtract the buffer meta data and word aligned */ buffers_start = start + sizeof(struct ring_buffer_cpu_meta); buffers_start = ALIGN(buffers_start, sizeof(long)); + buffers_start += scratch_size; + /* Calculate the size for the per CPU data */ size = end - buffers_start; size = size / nr_cpu_ids; @@ -2456,7 +2466,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, buffer->range_addr_start = start; buffer->range_addr_end = end; - rb_range_meta_init(buffer, nr_pages); + rb_range_meta_init(buffer, nr_pages, scratch_size); } else { /* need at least two pages */ @@ -2509,7 +2519,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key) { /* Default buffer page size - one system page */ - return alloc_buffer(size, flags, 0, 0, 0,key); + return alloc_buffer(size, flags, 0, 0, 0, 0, key); } EXPORT_SYMBOL_GPL(__ring_buffer_alloc); @@ -2521,6 +2531,7 @@ EXPORT_SYMBOL_GPL(__ring_buffer_alloc); * @order: sub-buffer order * @start: start of allocated range * @range_size: size of allocated range + * @scratch_size: size of scratch area (for preallocated memory buffers) * @key: ring buffer reader_lock_key. 
* * Currently the only flag that is available is the RB_FL_OVERWRITE @@ -2531,9 +2542,11 @@ EXPORT_SYMBOL_GPL(__ring_buffer_alloc); struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long range_size, + unsigned long scratch_size, struct lock_class_key *key) { - return alloc_buffer(size, flags, order, start, start + range_size, key); + return alloc_buffer(size, flags, order, start, start + range_size, + scratch_size, key); } /** @@ -2557,6 +2570,16 @@ bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, unsigned long *kas return true; } +void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size) +{ + if (!buffer || !buffer->meta) + return NULL; + + *size = PAGE_SIZE - sizeof(*buffer->meta); + + return (void *)buffer->meta + sizeof(*buffer->meta); +} + /** * ring_buffer_free - free a ring buffer. * @buffer: the buffer to free. diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 934658cda570..73c8df12c33c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9218,7 +9218,7 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size if (tr->range_addr_start && tr->range_addr_size) { buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0, tr->range_addr_start, - tr->range_addr_size); + tr->range_addr_size, 0); #ifdef CONFIG_RANDOMIZE_BASE if (ring_buffer_last_boot_delta(buf->buffer, &tr->kaslr_addr)) From b65334825fb14cae15e93d627c3cfc2986c7eea6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Mar 2025 11:45:43 -0500 Subject: [PATCH 05/17] tracing: Have persistent trace instances save KASLR offset There's no reason to save the KASLR offset for the ring buffer itself. That is used by the tracer. Now that the tracer has a way to save data in the persistent memory of the ring buffer, have the tracing infrastructure take care of the saving of the KASLR offset. 
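A minimal sketch of the idea, using only the ring_buffer_meta_scratch() interface added earlier in this series and ignoring the CONFIG_RANDOMIZE_BASE guard (the real code is in the trace.c hunk below; the size check is just defensive):

	struct trace_scratch { unsigned long kaslr_addr; };

	unsigned int size;
	struct trace_scratch *tscratch;

	tscratch = ring_buffer_meta_scratch(buffer, &size);
	if (tscratch && size >= sizeof(*tscratch))
		tscratch->kaslr_addr = kaslr_offset(); /* read back on the next boot */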
Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250305164608.792722274@goodmis.org Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 1 - kernel/trace/ring_buffer.c | 59 +++++++------------------------------ kernel/trace/trace.c | 37 +++++++++++++++++++---- kernel/trace/trace.h | 6 ++-- 4 files changed, 46 insertions(+), 57 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 019b59a0bbc9..56e27263acf8 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -95,7 +95,6 @@ struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flag unsigned long scratch_size, struct lock_class_key *key); -bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, unsigned long *kaslr_addr); void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size); /* diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c511b3283dc6..b981eb20c206 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -55,7 +55,6 @@ struct ring_buffer_meta { }; struct ring_buffer_cpu_meta { - unsigned long kaslr_addr; unsigned long first_buffer; unsigned long head_buffer; unsigned long commit_buffer; @@ -557,8 +556,6 @@ struct trace_buffer { struct ring_buffer_meta *meta; - unsigned long kaslr_addr; - unsigned int subbuf_size; unsigned int subbuf_order; unsigned int max_data_size; @@ -1949,15 +1946,6 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) } } -static void rb_meta_init_text_addr(struct ring_buffer_cpu_meta *meta) -{ -#ifdef CONFIG_RANDOMIZE_BASE - meta->kaslr_addr = kaslr_offset(); -#else - meta->kaslr_addr = 0; -#endif -} - static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages, int scratch_size) { struct ring_buffer_cpu_meta *meta; @@ -1990,7 +1978,6 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages, int sc meta->first_buffer += delta; meta->head_buffer += delta; meta->commit_buffer += delta; - buffer->kaslr_addr = meta->kaslr_addr; continue; } @@ -2007,7 +1994,6 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages, int sc subbuf = rb_subbufs_from_meta(meta); meta->first_buffer = (unsigned long)subbuf; - rb_meta_init_text_addr(meta); /* * The buffers[] array holds the order of the sub-buffers @@ -2549,35 +2535,22 @@ struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flag scratch_size, key); } -/** - * ring_buffer_last_boot_delta - return the delta offset from last boot - * @buffer: The buffer to return the delta from - * @text: Return text delta - * @data: Return data delta - * - * Returns: The true if the delta is non zero - */ -bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, unsigned long *kaslr_addr) -{ - if (!buffer) - return false; - - if (!buffer->kaslr_addr) - return false; - - *kaslr_addr = buffer->kaslr_addr; - - return true; -} - void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size) { + struct ring_buffer_meta *meta; + void *ptr; + if (!buffer || !buffer->meta) return NULL; - *size = PAGE_SIZE - sizeof(*buffer->meta); + meta = buffer->meta; - return (void *)buffer->meta + sizeof(*buffer->meta); + ptr = (void *)ALIGN((unsigned long)meta + sizeof(*meta), sizeof(long)); + + if (size) + *size = (void *)meta + meta->buffers_offset - ptr; + + return ptr; } /** @@ -6133,7 +6106,6 @@ static void 
reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; - struct ring_buffer_cpu_meta *meta; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; @@ -6152,11 +6124,6 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) atomic_dec(&cpu_buffer->record_disabled); atomic_dec(&cpu_buffer->resize_disabled); - /* Make sure persistent meta now uses this buffer's addresses */ - meta = rb_range_meta(buffer, 0, cpu_buffer->cpu); - if (meta) - rb_meta_init_text_addr(meta); - mutex_unlock(&buffer->mutex); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -6171,7 +6138,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_cpu_meta *meta; int cpu; /* prevent another thread from changing buffer sizes */ @@ -6199,11 +6165,6 @@ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) reset_disabled_cpu_buffer(cpu_buffer); - /* Make sure persistent meta now uses this buffer's addresses */ - meta = rb_range_meta(buffer, 0, cpu_buffer->cpu); - if (meta) - rb_meta_init_text_addr(meta); - atomic_dec(&cpu_buffer->record_disabled); atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 73c8df12c33c..d4df56657b9d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5988,8 +5988,14 @@ ssize_t tracing_resize_ring_buffer(struct trace_array *tr, return __tracing_resize_ring_buffer(tr, size, cpu_id); } +struct trace_scratch { + unsigned long kaslr_addr; +}; + static void update_last_data(struct trace_array *tr) { + struct trace_scratch *tscratch; + if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) return; @@ -6004,6 +6010,17 @@ static void update_last_data(struct trace_array *tr) /* Using current data now */ tr->text_delta = 0; + if (!tr->scratch) + return; + + tscratch = tr->scratch; + + /* Set the persistent ring buffer meta data to this address */ +#ifdef CONFIG_RANDOMIZE_BASE + tscratch->kaslr_addr = kaslr_offset(); +#else + tscratch->kaslr_addr = 0; +#endif tr->flags &= ~TRACE_ARRAY_FL_LAST_BOOT; } @@ -6817,6 +6834,7 @@ static ssize_t tracing_last_boot_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; + struct trace_scratch *tscratch = tr->scratch; struct seq_buf seq; char buf[64]; @@ -6829,8 +6847,8 @@ tracing_last_boot_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t * Otherwise it shows the KASLR address from the previous boot which * should not be the same as the current boot. */ - if (tr->flags & TRACE_ARRAY_FL_LAST_BOOT) - seq_buf_printf(&seq, "%lx\t[kernel]\n", tr->kaslr_addr); + if (tscratch && (tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + seq_buf_printf(&seq, "%lx\t[kernel]\n", tscratch->kaslr_addr); else seq_buf_puts(&seq, "# Current\n"); @@ -9210,6 +9228,8 @@ static int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size) { enum ring_buffer_flags rb_flags; + struct trace_scratch *tscratch; + unsigned int scratch_size; rb_flags = tr->trace_flags & TRACE_ITER_OVERWRITE ? 
RB_FL_OVERWRITE : 0; @@ -9218,12 +9238,19 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size if (tr->range_addr_start && tr->range_addr_size) { buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0, tr->range_addr_start, - tr->range_addr_size, 0); + tr->range_addr_size, + sizeof(*tscratch)); + + tscratch = ring_buffer_meta_scratch(buf->buffer, &scratch_size); + if (tscratch) { + tr->scratch = tscratch; + tr->scratch_size = scratch_size; #ifdef CONFIG_RANDOMIZE_BASE - if (ring_buffer_last_boot_delta(buf->buffer, &tr->kaslr_addr)) - tr->text_delta = kaslr_offset() - tr->kaslr_addr; + if (tscratch->kaslr_addr) + tr->text_delta = kaslr_offset() - tscratch->kaslr_addr; #endif + } /* * This is basically the same as a mapped buffer, * with the same restrictions. diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index abe8169c3e87..3a020fb82a34 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -348,8 +348,11 @@ struct trace_array { unsigned int mapped; unsigned long range_addr_start; unsigned long range_addr_size; - unsigned long kaslr_addr; long text_delta; + void *scratch; /* pointer in persistent memory */ + int scratch_size; + + int buffer_disabled; struct trace_pid_list __rcu *filtered_pids; struct trace_pid_list __rcu *filtered_no_pids; @@ -367,7 +370,6 @@ struct trace_array { * CONFIG_TRACER_MAX_TRACE. */ arch_spinlock_t max_lock; - int buffer_disabled; #ifdef CONFIG_FTRACE_SYSCALLS int sys_refcount_enter; int sys_refcount_exit; From 966b7d0e524da03a18ba1b111d0fa0d81e840f77 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Mar 2025 11:45:44 -0500 Subject: [PATCH 06/17] module: Add module_for_each_mod() function The tracing system needs a way to save all the currently loaded modules and their addresses into persistent memory so that it can evaluate the addresses on a reboot from a crash. When the persistent memory trace starts, it will load the module addresses and names into the persistent memory. To do so, it will call the module_for_each_mod() function and pass it a function and data structure to get called on each loaded module. Then it can record the memory. This only implements that function. Cc: Luis Chamberlain Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Sami Tolvanen Cc: Daniel Gomez Cc: linux-modules@vger.kernel.org Link: https://lore.kernel.org/20250305164608.962615966@goodmis.org Acked-by: Petr Pavlu Signed-off-by: Steven Rostedt (Google) --- include/linux/module.h | 6 ++++++ kernel/module/main.c | 13 +++++++++++++ 2 files changed, 19 insertions(+) diff --git a/include/linux/module.h b/include/linux/module.h index 30e5b19bafa9..9a71dd2cb11f 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -782,6 +782,8 @@ static inline void *module_writable_address(struct module *mod, void *loc) return __module_writable_address(mod, loc); } +void module_for_each_mod(int(*func)(struct module *mod, void *data), void *data); + #else /* !CONFIG_MODULES... 
*/ static inline struct module *__module_address(unsigned long addr) @@ -894,6 +896,10 @@ static inline void *module_writable_address(struct module *mod, void *loc) { return loc; } + +static inline void module_for_each_mod(int(*func)(struct module *mod, void *data), void *data) +{ +} #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS diff --git a/kernel/module/main.c b/kernel/module/main.c index 1fb9ad289a6f..927a2e0ffd5f 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -3809,6 +3809,19 @@ bool is_module_text_address(unsigned long addr) return ret; } +void module_for_each_mod(int(*func)(struct module *mod, void *data), void *data) +{ + struct module *mod; + + guard(rcu)(); + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; + if (func(mod, data)) + break; + } +} + /** * __module_text_address() - get the module whose code contains an address. * @addr: the address. From fd39e48fe8eddfa84a2dbaf468fde8c2b2679211 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Mar 2025 11:45:45 -0500 Subject: [PATCH 07/17] tracing: Have persistent trace instances save module addresses For trace instances that are mapped to persistent memory, have them use the scratch area to save the currently loaded modules. This will allow where the modules have been loaded on the next boot so that their addresses can be deciphered by using where they were loaded previously. Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250305164609.129741650@goodmis.org Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 98 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 89 insertions(+), 9 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d4df56657b9d..a75e03994312 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5988,14 +5988,60 @@ ssize_t tracing_resize_ring_buffer(struct trace_array *tr, return __tracing_resize_ring_buffer(tr, size, cpu_id); } +struct trace_mod_entry { + unsigned long mod_addr; + char mod_name[MODULE_NAME_LEN]; +}; + struct trace_scratch { unsigned long kaslr_addr; + unsigned long nr_entries; + struct trace_mod_entry entries[]; }; +static int save_mod(struct module *mod, void *data) +{ + struct trace_array *tr = data; + struct trace_scratch *tscratch; + struct trace_mod_entry *entry; + unsigned int size; + + tscratch = tr->scratch; + if (!tscratch) + return -1; + size = tr->scratch_size; + + if (struct_size(tscratch, entries, tscratch->nr_entries + 1) > size) + return -1; + + entry = &tscratch->entries[tscratch->nr_entries]; + + tscratch->nr_entries++; + + entry->mod_addr = (unsigned long)mod->mem[MOD_TEXT].base; + strscpy(entry->mod_name, mod->name); + + return 0; +} + static void update_last_data(struct trace_array *tr) { struct trace_scratch *tscratch; + if (!(tr->flags & TRACE_ARRAY_FL_BOOT)) + return; + + /* Reset the module list and reload them */ + if (tr->scratch) { + struct trace_scratch *tscratch = tr->scratch; + + memset(tscratch->entries, 0, + flex_array_size(tscratch, entries, tscratch->nr_entries)); + tscratch->nr_entries = 0; + + module_for_each_mod(save_mod, tr); + } + if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) return; @@ -9224,6 +9270,46 @@ static struct dentry *trace_instance_dir; static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); +static void setup_trace_scratch(struct trace_array *tr, + struct trace_scratch *tscratch, unsigned int size) +{ + 
struct trace_mod_entry *entry; + + if (!tscratch) + return; + + tr->scratch = tscratch; + tr->scratch_size = size; + +#ifdef CONFIG_RANDOMIZE_BASE + if (tscratch->kaslr_addr) + tr->text_delta = kaslr_offset() - tscratch->kaslr_addr; +#endif + + if (struct_size(tscratch, entries, tscratch->nr_entries) > size) + goto reset; + + /* Check if each module name is a valid string */ + for (int i = 0; i < tscratch->nr_entries; i++) { + int n; + + entry = &tscratch->entries[i]; + + for (n = 0; n < MODULE_NAME_LEN; n++) { + if (entry->mod_name[n] == '\0') + break; + if (!isprint(entry->mod_name[n])) + goto reset; + } + if (n == MODULE_NAME_LEN) + goto reset; + } + return; + reset: + /* Invalid trace modules */ + memset(tscratch, 0, size); +} + static int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size) { @@ -9236,21 +9322,15 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size buf->tr = tr; if (tr->range_addr_start && tr->range_addr_size) { + /* Add scratch buffer to handle 128 modules */ buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0, tr->range_addr_start, tr->range_addr_size, - sizeof(*tscratch)); + struct_size(tscratch, entries, 128)); tscratch = ring_buffer_meta_scratch(buf->buffer, &scratch_size); - if (tscratch) { - tr->scratch = tscratch; - tr->scratch_size = scratch_size; + setup_trace_scratch(tr, tscratch, scratch_size); -#ifdef CONFIG_RANDOMIZE_BASE - if (tscratch->kaslr_addr) - tr->text_delta = kaslr_offset() - tscratch->kaslr_addr; -#endif - } /* * This is basically the same as a mapped buffer, * with the same restrictions. From 1bd25a6f7173c005844cd0076718b3c87bccd891 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Mar 2025 11:45:46 -0500 Subject: [PATCH 08/17] tracing: Show module names and addresses of last boot Add the last boot module's names and addresses to the last_boot_info file. This only shows the module information from a previous boot. If the buffer is started and is recording the current boot, this file still will only show "current". 
~# cat instances/boot_mapped/last_boot_info 10c00000 [kernel] ffffffffc00ca000 usb_serial_simple ffffffffc00ae000 usbserial ffffffffc008b000 bfq ~# echo function > instances/boot_mapped/current_tracer ~# cat instances/boot_mapped/last_boot_info # Current Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250305164609.299186021@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 102 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 89 insertions(+), 13 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a75e03994312..d22f8d34b18d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5999,6 +5999,8 @@ struct trace_scratch { struct trace_mod_entry entries[]; }; +static DEFINE_MUTEX(scratch_mutex); + static int save_mod(struct module *mod, void *data) { struct trace_array *tr = data; @@ -6039,6 +6041,7 @@ static void update_last_data(struct trace_array *tr) flex_array_size(tscratch, entries, tscratch->nr_entries)); tscratch->nr_entries = 0; + guard(mutex)(&scratch_mutex); module_for_each_mod(save_mod, tr); } @@ -6876,15 +6879,47 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } -static ssize_t -tracing_last_boot_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +#define LAST_BOOT_HEADER ((void *)1) + +static void *l_next(struct seq_file *m, void *v, loff_t *pos) { - struct trace_array *tr = filp->private_data; + struct trace_array *tr = m->private; struct trace_scratch *tscratch = tr->scratch; - struct seq_buf seq; - char buf[64]; + unsigned int index = *pos; + + (*pos)++; - seq_buf_init(&seq, buf, 64); + if (*pos == 1) + return LAST_BOOT_HEADER; + + /* Only show offsets of the last boot data */ + if (!tscratch || !(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + return NULL; + + /* *pos 0 is for the header, 1 is for the first module */ + index--; + + if (index >= tscratch->nr_entries) + return NULL; + + return &tscratch->entries[index]; +} + +static void *l_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&scratch_mutex); + + return l_next(m, NULL, pos); +} + +static void l_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&scratch_mutex); +} + +static void show_last_boot_header(struct seq_file *m, struct trace_array *tr) +{ + struct trace_scratch *tscratch = tr->scratch; /* * Do not leak KASLR address. This only shows the KASLR address of @@ -6894,11 +6929,52 @@ tracing_last_boot_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t * should not be the same as the current boot. 
*/ if (tscratch && (tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) - seq_buf_printf(&seq, "%lx\t[kernel]\n", tscratch->kaslr_addr); + seq_printf(m, "%lx\t[kernel]\n", tscratch->kaslr_addr); else - seq_buf_puts(&seq, "# Current\n"); + seq_puts(m, "# Current\n"); +} - return simple_read_from_buffer(ubuf, cnt, ppos, buf, seq_buf_used(&seq)); +static int l_show(struct seq_file *m, void *v) +{ + struct trace_array *tr = m->private; + struct trace_mod_entry *entry = v; + + if (v == LAST_BOOT_HEADER) { + show_last_boot_header(m, tr); + return 0; + } + + seq_printf(m, "%lx\t%s\n", entry->mod_addr, entry->mod_name); + return 0; +} + +static const struct seq_operations last_boot_seq_ops = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show, +}; + +static int tracing_last_boot_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + struct seq_file *m; + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + ret = seq_open(file, &last_boot_seq_ops); + if (ret) { + trace_array_put(tr); + return ret; + } + + m = file->private_data; + m->private = tr; + + return 0; } static int tracing_buffer_meta_open(struct inode *inode, struct file *filp) @@ -7527,10 +7603,10 @@ static const struct file_operations trace_time_stamp_mode_fops = { }; static const struct file_operations last_boot_fops = { - .open = tracing_open_generic_tr, - .read = tracing_last_boot_read, - .llseek = generic_file_llseek, - .release = tracing_release_generic_tr, + .open = tracing_last_boot_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_seq_release, }; #ifdef CONFIG_TRACER_SNAPSHOT From 5f3719f697c3fdfae5cd6805f10ac7a04b0f4e43 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Mar 2025 11:45:47 -0500 Subject: [PATCH 09/17] tracing: Update modules to persistent instances when loaded When a module is loaded and a persistent buffer is actively tracing, add it to the list of modules in the persistent memory. 
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250305164609.469844721@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 27 +++++++++++++++++++++++++ kernel/trace/trace.h | 2 ++ kernel/trace/trace_events.c | 40 ++++++++++++++++++++++++++----------- 3 files changed, 57 insertions(+), 12 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d22f8d34b18d..66c1683fa1dd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -10087,6 +10087,32 @@ static void trace_module_remove_evals(struct module *mod) static inline void trace_module_remove_evals(struct module *mod) { } #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ +static bool trace_array_active(struct trace_array *tr) +{ + if (tr->current_trace != &nop_trace) + return true; + + /* 0 is no events, 1 is all disabled */ + return trace_events_enabled(tr, NULL) > 1; +} + +static void trace_module_record(struct module *mod) +{ + struct trace_array *tr; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + /* Update any persistent trace array that has already been started */ + if ((tr->flags & (TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT)) == + TRACE_ARRAY_FL_BOOT) { + /* Only update if the trace array is active */ + if (trace_array_active(tr)) { + guard(mutex)(&scratch_mutex); + save_mod(mod, tr); + } + } + } +} + static int trace_module_notify(struct notifier_block *self, unsigned long val, void *data) { @@ -10095,6 +10121,7 @@ static int trace_module_notify(struct notifier_block *self, switch (val) { case MODULE_STATE_COMING: trace_module_add_evals(mod); + trace_module_record(mod); break; case MODULE_STATE_GOING: trace_module_remove_evals(mod); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3a020fb82a34..90493220c362 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -786,6 +786,8 @@ extern void trace_find_cmdline(int pid, char comm[]); extern int trace_find_tgid(int pid); extern void trace_event_follow_fork(struct trace_array *tr, bool enable); +extern int trace_events_enabled(struct trace_array *tr, const char *system); + #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; extern unsigned long ftrace_number_of_pages; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 513de9ceb80e..7b3ef1d26167 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1818,28 +1818,28 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } -static ssize_t -system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) +/* + * Returns: + * 0 : no events exist? 
+ * 1 : all events are disabled + * 2 : all events are enabled + * 3 : some events are enabled and some are enabled + */ +int trace_events_enabled(struct trace_array *tr, const char *system) { - const char set_to_char[4] = { '?', '0', '1', 'X' }; - struct trace_subsystem_dir *dir = filp->private_data; - struct event_subsystem *system = dir->subsystem; struct trace_event_call *call; struct trace_event_file *file; - struct trace_array *tr = dir->tr; - char buf[2]; int set = 0; - int ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); + list_for_each_entry(file, &tr->events, list) { call = file->event_call; if ((call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) || !trace_event_name(call) || !call->class || !call->class->reg) continue; - if (system && strcmp(call->class->system, system->name) != 0) + if (system && strcmp(call->class->system, system) != 0) continue; /* @@ -1855,7 +1855,23 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, if (set == 3) break; } - mutex_unlock(&event_mutex); + + return set; +} + +static ssize_t +system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + const char set_to_char[4] = { '?', '0', '1', 'X' }; + struct trace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; + struct trace_array *tr = dir->tr; + char buf[2]; + int set; + int ret; + + set = trace_events_enabled(tr, system ? system->name : NULL); buf[0] = set_to_char[set]; buf[1] = '\n'; From 74e2498ccf7b303e7fdd881f58a849e884afb486 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 19 Feb 2025 00:08:58 +0900 Subject: [PATCH 10/17] mm/memblock: Add reserved memory release function Add reserve_mem_release_by_name() to release a reserved memory region with a given name. This allows us to release reserved memory which is defined by kernel cmdline, after boot. 
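For example, a caller that is finished with a region reserved via "reserve_mem=20M:2M:trace" could release it like this (a hypothetical usage sketch, not code from this patch):

	phys_addr_t start, size;

	if (reserve_mem_find_by_name("trace", &start, &size)) {
		/* ... done using the reserved region ... */
		reserve_mem_release_by_name("trace"); /* pages become free memory */
	}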
Signed-off-by: Masami Hiramatsu (Google) Acked-by: Mike Rapoport (Microsoft) Cc: Andrew Morton Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: linux-mm@kvack.org Link: https://lore.kernel.org/173989133862.230693.14094993331347437600.stgit@devnote2 Signed-off-by: Steven Rostedt (Google) --- include/linux/mm.h | 1 + mm/memblock.c | 66 +++++++++++++++++++++++++++++++++++++--------- 2 files changed, 55 insertions(+), 12 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b1068ddcbb7..1ee9e7447485 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4123,6 +4123,7 @@ void vma_pgtable_walk_begin(struct vm_area_struct *vma); void vma_pgtable_walk_end(struct vm_area_struct *vma); int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size); +int reserve_mem_release_by_name(const char *name); #ifdef CONFIG_64BIT int do_mseal(unsigned long start, size_t len_in, unsigned long flags); diff --git a/mm/memblock.c b/mm/memblock.c index 95af35fd1389..8cd95f60015d 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -2283,6 +2284,7 @@ struct reserve_mem_table { }; static struct reserve_mem_table reserved_mem_table[RESERVE_MEM_MAX_ENTRIES]; static int reserved_mem_count; +static DEFINE_MUTEX(reserve_mem_lock); /* Add wildcard region with a lookup name */ static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size, @@ -2296,6 +2298,21 @@ static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size, strscpy(map->name, name); } +static struct reserve_mem_table *reserve_mem_find_by_name_nolock(const char *name) +{ + struct reserve_mem_table *map; + int i; + + for (i = 0; i < reserved_mem_count; i++) { + map = &reserved_mem_table[i]; + if (!map->size) + continue; + if (strcmp(name, map->name) == 0) + return map; + } + return NULL; +} + /** * reserve_mem_find_by_name - Find reserved memory region with a given name * @name: The name that is attached to a reserved memory region @@ -2309,22 +2326,47 @@ static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size, int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size) { struct reserve_mem_table *map; - int i; - for (i = 0; i < reserved_mem_count; i++) { - map = &reserved_mem_table[i]; - if (!map->size) - continue; - if (strcmp(name, map->name) == 0) { - *start = map->start; - *size = map->size; - return 1; - } - } - return 0; + guard(mutex)(&reserve_mem_lock); + map = reserve_mem_find_by_name_nolock(name); + if (!map) + return 0; + + *start = map->start; + *size = map->size; + return 1; } EXPORT_SYMBOL_GPL(reserve_mem_find_by_name); +/** + * reserve_mem_release_by_name - Release reserved memory region with a given name + * @name: The name that is attatched to a reserved memory region + * + * Forcibly release the pages in the reserved memory region so that those memory + * can be used as free memory. After released the reserved region size becomes 0. + * + * Returns: 1 if released or 0 if not found. 
+ */ +int reserve_mem_release_by_name(const char *name) +{ + char buf[RESERVE_MEM_NAME_SIZE + 12]; + struct reserve_mem_table *map; + void *start, *end; + + guard(mutex)(&reserve_mem_lock); + map = reserve_mem_find_by_name_nolock(name); + if (!map) + return 0; + + start = phys_to_virt(map->start); + end = start + map->size - 1; + snprintf(buf, sizeof(buf), "reserve_mem:%s", name); + free_reserved_area(start, end, 0, buf); + map->size = 0; + + return 1; +} + /* * Parse reserve_mem=nn:align:name */ From fb6d03238e35f96cc1d6a5411ee1d684221d1c39 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 19 Feb 2025 00:09:08 +0900 Subject: [PATCH 11/17] tracing: Freeable reserved ring buffer Make the ring buffer on reserved memory freeable. This allows us to free the trace instance on the reserved memory without changing the cmdline and rebooting. Even if we cannot change the kernel cmdline for security reasons, we can release the reserved memory for the ring buffer as free (available) memory. For example, boot the kernel with reserved memory: "reserve_mem=20M:2M:trace trace_instance=boot_mapped^traceoff@trace" ~ # free total used free shared buff/cache available Mem: 1995548 50544 1927568 14964 17436 1911480 Swap: 0 0 0 ~ # rmdir /sys/kernel/tracing/instances/boot_mapped/ [ 23.704023] Freeing reserve_mem:trace memory: 20476K ~ # free total used free shared buff/cache available Mem: 2016024 41844 1956740 14968 17440 1940572 Swap: 0 0 0 Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Mike Rapoport Link: https://lore.kernel.org/173989134814.230693.18199312930337815629.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 13 ++++++++++++- kernel/trace/trace.h | 1 + 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 66c1683fa1dd..e7a14bb7d7aa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9479,6 +9479,9 @@ static void free_trace_buffers(struct trace_array *tr) #ifdef CONFIG_TRACER_MAX_TRACE free_trace_buffer(&tr->max_buffer); #endif + + if (tr->range_addr_start) + vunmap((void *)tr->range_addr_start); } static void init_trace_flags_index(struct trace_array *tr) @@ -9640,6 +9643,7 @@ trace_array_create_systems(const char *name, const char *systems, free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); kfree_const(tr->system_names); + kfree(tr->range_name); kfree(tr->name); kfree(tr); @@ -9766,6 +9770,11 @@ static int __remove_instance(struct trace_array *tr) free_trace_buffers(tr); clear_tracing_err_log(tr); + if (tr->range_name) { + reserve_mem_release_by_name(tr->range_name); + kfree(tr->range_name); + } + for (i = 0; i < tr->nr_topts; i++) { kfree(tr->topts[i].topts); } @@ -10590,6 +10599,7 @@ __init static void enable_instances(void) bool traceoff = false; char *flag_delim; char *addr_delim; + char *rname __free(kfree) = NULL; tok = strsep(&curr_str, ","); @@ -10646,6 +10656,7 @@ __init static void enable_instances(void) pr_warn("Failed to map boot instance %s to %s\n", name, tok); continue; } + rname = kstrdup(tok, GFP_KERNEL); } if (start) { @@ -10682,7 +10693,7 @@ __init static void enable_instances(void) */ if (start) { tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT; - tr->ref++; + tr->range_name = no_free_ptr(rname); } while ((tok = strsep(&curr_str, ","))) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 90493220c362..0d6efb8a1179 100644 --- a/kernel/trace/trace.h +++
b/kernel/trace/trace.h @@ -348,6 +348,7 @@ struct trace_array { unsigned int mapped; unsigned long range_addr_start; unsigned long range_addr_size; + char *range_name; long text_delta; void *scratch; /* pointer in persistent memory */ int scratch_size; From f00c9201f942dddb58617c881ffd4e4a1a1c49ab Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 18 Mar 2025 22:39:13 +0900 Subject: [PATCH 12/17] tracing: Fix a compilation error without CONFIG_MODULES There is some code that depends on CONFIG_MODULES; enclose it in #ifdef. Cc: Mathieu Desnoyers Link: https://lore.kernel.org/174230515367.2909896.8132122175220657625.stgit@mhiramat.tok.corp.google.com Fixes: dca91c1c5468 ("tracing: Have persistent trace instances save module addresses") Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e7a14bb7d7aa..7f8b0c43d2a5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6001,6 +6001,7 @@ struct trace_scratch { static DEFINE_MUTEX(scratch_mutex); +#ifdef CONFIG_MODULES static int save_mod(struct module *mod, void *data) { struct trace_array *tr = data; @@ -6025,6 +6026,12 @@ static int save_mod(struct module *mod, void *data) return 0; } +#else +static int save_mod(struct module *mod, void *data) +{ + return 0; +} +#endif static void update_last_data(struct trace_array *tr) { From 5dbeb56bb9589e1051f6af6877cd375f3a901afb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 13 Mar 2025 07:16:18 -0400 Subject: [PATCH 13/17] tracing: Initialize scratch_size to zero to prevent UB In allocate_trace_buffer() the following code: buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0, tr->range_addr_start, tr->range_addr_size, struct_size(tscratch, entries, 128)); tscratch = ring_buffer_meta_scratch(buf->buffer, &scratch_size); setup_trace_scratch(tr, tscratch, scratch_size); has undefined behavior if ring_buffer_alloc_range() fails because "scratch_size" is not initialized. If the allocation fails, then buf->buffer will be NULL. The ring_buffer_meta_scratch() will return NULL immediately if it is passed a NULL buffer and it will not update scratch_size. Then setup_trace_scratch() will return immediately if tscratch is NULL. Although there's no real issue here, it is considered undefined behavior to pass an uninitialized variable to a function as input, and UBSan may complain about it. Just initialize scratch_size to zero to give the code defined behavior and make it a little more robust. Link: https://lore.kernel.org/all/44c5deaa-b094-4852-90f9-52f3fb10e67a@stanley.mountain/ Reported-by: Dan Carpenter Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7f8b0c43d2a5..78ae76666695 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9398,7 +9398,7 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size { enum ring_buffer_flags rb_flags; struct trace_scratch *tscratch; - unsigned int scratch_size; + unsigned int scratch_size = 0; rb_flags = tr->trace_flags & TRACE_ITER_OVERWRITE ?
RB_FL_OVERWRITE : 0; From 486fbcb3806c0c7a5dbeea326c4a146fd4ed4eff Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 11 Mar 2025 10:30:03 +0900 Subject: [PATCH 14/17] tracing: Skip update_last_data() if cleared and remove active check for save_mod() If the last boot data is already cleared, there is no reason to update it again. Skip it if TRACE_ARRAY_FL_LAST_BOOT is cleared. Also, when save_mod() is called on module load, we don't need to check whether the trace is active, because any module address can appear in a stacktrace. Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/174165660328.1173316.15529357882704817499.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 78ae76666695..382c7a562303 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6040,6 +6040,12 @@ static void update_last_data(struct trace_array *tr) if (!(tr->flags & TRACE_ARRAY_FL_BOOT)) return; + if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + return; + + /* Only if the buffer has previous boot data, clear and update it. */ + tr->flags &= ~TRACE_ARRAY_FL_LAST_BOOT; + /* Reset the module list and reload them */ if (tr->scratch) { struct trace_scratch *tscratch = tr->scratch; @@ -6052,9 +6058,6 @@ static void update_last_data(struct trace_array *tr) module_for_each_mod(save_mod, tr); } - if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) - return; - /* * Need to clear all CPU buffers as there cannot be events * from the previous boot mixed with events with this boot @@ -6077,7 +6080,6 @@ static void update_last_data(struct trace_array *tr) #else tscratch->kaslr_addr = 0; #endif - tr->flags &= ~TRACE_ARRAY_FL_LAST_BOOT; } /** @@ -10103,15 +10105,6 @@ static void trace_module_remove_evals(struct module *mod) static inline void trace_module_remove_evals(struct module *mod) { } #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ -static bool trace_array_active(struct trace_array *tr) -{ - if (tr->current_trace != &nop_trace) - return true; - - /* 0 is no events, 1 is all disabled */ - return trace_events_enabled(tr, NULL) > 1; -} - static void trace_module_record(struct module *mod) { struct trace_array *tr; @@ -10120,11 +10113,8 @@ static void trace_module_record(struct module *mod) /* Update any persistent trace array that has already been started */ if ((tr->flags & (TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT)) == TRACE_ARRAY_FL_BOOT) { - /* Only update if the trace array is active */ - if (trace_array_active(tr)) { - guard(mutex)(&scratch_mutex); - save_mod(mod, tr); - } + guard(mutex)(&scratch_mutex); + save_mod(mod, tr); } } } From de48d7fff7b4668a61c3c1d13ca0f6a6b3995519 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Mon, 17 Mar 2025 09:55:24 +0800 Subject: [PATCH 15/17] ring-buffer: Remove the unused variable bmeta MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Variable bmeta is not effectively used, so delete it. kernel/trace/ring_buffer.c:1952:27: warning: variable ‘bmeta’ set but not used.
Link: https://lore.kernel.org/20250317015524.3902-1-jiapeng.chong@linux.alibaba.com Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=19524 Signed-off-by: Jiapeng Chong Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b981eb20c206..f25966b3a1fc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1949,7 +1949,6 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages, int scratch_size) { struct ring_buffer_cpu_meta *meta; - struct ring_buffer_meta *bmeta; unsigned long *subbuf_mask; unsigned long delta; void *subbuf; @@ -1964,8 +1963,6 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages, int sc if (rb_meta_init(buffer, scratch_size)) valid = true; - bmeta = buffer->meta; - for (cpu = 0; cpu < nr_cpu_ids; cpu++) { void *next_meta; From 35a380ddbc653c07ee64e2a74c274b9835b0efc2 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Mon, 24 Mar 2025 23:34:52 +0900 Subject: [PATCH 16/17] tracing: Show last module text symbols in the stacktrace Since the previous boot's trace buffer can include module text addresses in its stacktraces, convert those module text addresses using the module address information, just as is done for kernel text addresses. Cc: Mathieu Desnoyers Link: https://lore.kernel.org/174282689201.356346.17647540360450727687.stgit@mhiramat.tok.corp.google.com Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 133 ++++++++++++++++++++++++++++++++++-- kernel/trace/trace.h | 8 +++ kernel/trace/trace_output.c | 4 +- 3 files changed, 138 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 382c7a562303..fa17397fdc1f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -49,6 +49,7 @@ #include #include #include +#include #include /* COMMAND_LINE_SIZE and kaslr_offset() */ @@ -6001,6 +6002,59 @@ struct trace_scratch { static DEFINE_MUTEX(scratch_mutex); +static int cmp_mod_entry(const void *key, const void *pivot) +{ + unsigned long addr = (unsigned long)key; + const struct trace_mod_entry *ent = pivot; + + if (addr >= ent[0].mod_addr && addr < ent[1].mod_addr) + return 0; + else + return addr - ent->mod_addr; +} + +/** + * trace_adjust_address() - Adjust prev boot address to current address. + * @tr: Persistent ring buffer's trace_array. + * @addr: Address in @tr which is adjusted. + */ +unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr) +{ + struct trace_module_delta *module_delta; + struct trace_scratch *tscratch; + struct trace_mod_entry *entry; + int idx = 0, nr_entries; + + /* If we don't have last boot delta, return the address */ + if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + return addr; + + /* tr->module_delta must be protected by rcu. */ + guard(rcu)(); + tscratch = tr->scratch; + /* If there is no tscratch, module_delta must be NULL. */ + module_delta = READ_ONCE(tr->module_delta); + if (!module_delta || tscratch->entries[0].mod_addr > addr) + return addr + tr->text_delta; + + /* Note that entries must be sorted.
*/ + nr_entries = tscratch->nr_entries; + if (nr_entries == 1 || + tscratch->entries[nr_entries - 1].mod_addr < addr) + idx = nr_entries - 1; + else { + entry = __inline_bsearch((void *)addr, + tscratch->entries, + nr_entries - 1, + sizeof(tscratch->entries[0]), + cmp_mod_entry); + if (entry) + idx = entry - tscratch->entries; + } + + return addr + module_delta->delta[idx]; +} + #ifdef CONFIG_MODULES static int save_mod(struct module *mod, void *data) { @@ -6035,6 +6089,7 @@ static int save_mod(struct module *mod, void *data) static void update_last_data(struct trace_array *tr) { + struct trace_module_delta *module_delta; struct trace_scratch *tscratch; if (!(tr->flags & TRACE_ARRAY_FL_BOOT)) @@ -6073,6 +6128,9 @@ static void update_last_data(struct trace_array *tr) return; tscratch = tr->scratch; + module_delta = READ_ONCE(tr->module_delta); + WRITE_ONCE(tr->module_delta, NULL); + kfree_rcu(module_delta, rcu); /* Set the persistent ring buffer meta data to this address */ #ifdef CONFIG_RANDOMIZE_BASE @@ -9355,10 +9413,51 @@ static struct dentry *trace_instance_dir; static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); +#ifdef CONFIG_MODULES +static int make_mod_delta(struct module *mod, void *data) +{ + struct trace_module_delta *module_delta; + struct trace_scratch *tscratch; + struct trace_mod_entry *entry; + struct trace_array *tr = data; + int i; + + tscratch = tr->scratch; + module_delta = READ_ONCE(tr->module_delta); + for (i = 0; i < tscratch->nr_entries; i++) { + entry = &tscratch->entries[i]; + if (strcmp(mod->name, entry->mod_name)) + continue; + if (mod->state == MODULE_STATE_GOING) + module_delta->delta[i] = 0; + else + module_delta->delta[i] = (unsigned long)mod->mem[MOD_TEXT].base + - entry->mod_addr; + break; + } + return 0; +} +#else +static int make_mod_delta(struct module *mod, void *data) +{ + return 0; +} +#endif + +static int mod_addr_comp(const void *a, const void *b, const void *data) +{ + const struct trace_mod_entry *e1 = a; + const struct trace_mod_entry *e2 = b; + + return e1->mod_addr > e2->mod_addr ? 1 : -1; +} + static void setup_trace_scratch(struct trace_array *tr, struct trace_scratch *tscratch, unsigned int size) { + struct trace_module_delta *module_delta; struct trace_mod_entry *entry; + int i, nr_entries; if (!tscratch) return; @@ -9375,7 +9474,7 @@ static void setup_trace_scratch(struct trace_array *tr, goto reset; /* Check if each module name is a valid string */ - for (int i = 0; i < tscratch->nr_entries; i++) { + for (i = 0; i < tscratch->nr_entries; i++) { int n; entry = &tscratch->entries[i]; @@ -9389,6 +9488,25 @@ static void setup_trace_scratch(struct trace_array *tr, if (n == MODULE_NAME_LEN) goto reset; } + + /* Sort the entries so that we can find appropriate module from address. */ + nr_entries = tscratch->nr_entries; + sort_r(tscratch->entries, nr_entries, sizeof(struct trace_mod_entry), + mod_addr_comp, NULL, NULL); + + if (IS_ENABLED(CONFIG_MODULES)) { + module_delta = kzalloc(struct_size(module_delta, delta, nr_entries), GFP_KERNEL); + if (!module_delta) { + pr_info("module_delta allocation failed. Not able to decode module address."); + goto reset; + } + init_rcu_head(&module_delta->rcu); + } else + module_delta = NULL; + WRITE_ONCE(tr->module_delta, module_delta); + + /* Scan modules to make text delta for modules. 
*/ + module_for_each_mod(make_mod_delta, tr); return; reset: /* Invalid trace modules */ @@ -10105,16 +10223,20 @@ static void trace_module_remove_evals(struct module *mod) static inline void trace_module_remove_evals(struct module *mod) { } #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ -static void trace_module_record(struct module *mod) +static void trace_module_record(struct module *mod, bool add) { struct trace_array *tr; + unsigned long flags; list_for_each_entry(tr, &ftrace_trace_arrays, list) { + flags = tr->flags & (TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT); /* Update any persistent trace array that has already been started */ - if ((tr->flags & (TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT)) == - TRACE_ARRAY_FL_BOOT) { + if (flags == TRACE_ARRAY_FL_BOOT && add) { guard(mutex)(&scratch_mutex); save_mod(mod, tr); + } else if (flags & TRACE_ARRAY_FL_LAST_BOOT) { + /* Update delta if the module loaded in previous boot */ + make_mod_delta(mod, tr); } } } @@ -10127,10 +10249,11 @@ static int trace_module_notify(struct notifier_block *self, switch (val) { case MODULE_STATE_COMING: trace_module_add_evals(mod); - trace_module_record(mod); + trace_module_record(mod, true); break; case MODULE_STATE_GOING: trace_module_remove_evals(mod); + trace_module_record(mod, false); break; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0d6efb8a1179..ab7c7a1930cc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -312,6 +312,11 @@ struct trace_func_repeats { u64 ts_last_call; }; +struct trace_module_delta { + struct rcu_head rcu; + long delta[]; +}; + /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. @@ -350,6 +355,7 @@ struct trace_array { unsigned long range_addr_size; char *range_name; long text_delta; + struct trace_module_delta *module_delta; void *scratch; /* pointer in persistent memory */ int scratch_size; @@ -466,6 +472,8 @@ extern int tracing_set_clock(struct trace_array *tr, const char *clockstr); extern bool trace_clock_in_ns(struct trace_array *tr); +extern unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr); + /* * The global tracer (top) should be the first trace array added, * but we check the flag anyway. diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 03d56f711ad1..1ad54fcf25cb 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -5,6 +5,7 @@ * Copyright (C) 2008 Red Hat Inc, Steven Rostedt * */ +#include "trace.h" #include #include #include @@ -1248,7 +1249,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, struct trace_seq *s = &iter->seq; unsigned long *p; unsigned long *end; - long delta = iter->tr->text_delta; trace_assign_type(field, iter->ent); end = (unsigned long *)((long)iter->ent + iter->ent_size); @@ -1265,7 +1265,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_seq_puts(s, "[FTRACE TRAMPOLINE]\n"); continue; } - seq_print_ip_sym(s, (*p) + delta, flags); + seq_print_ip_sym(s, trace_adjust_address(iter->tr, *p), flags); trace_seq_putc(s, '\n'); } From 028a58ec154257e618c27fb0eba8d9e30379bc3d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 26 Mar 2025 22:03:04 -0400 Subject: [PATCH 17/17] tracing: Use _text and the kernel offset in last_boot_info Instead of using kaslr_offset() just record the location of "_text". 
This makes it possible for user space to use either the System.map or /proc/kallsyms as what to map all addresses to functions with. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250326220304.38dbedcd@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fa17397fdc1f..14c38fcd6f9e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -51,7 +51,7 @@ #include #include -#include /* COMMAND_LINE_SIZE and kaslr_offset() */ +#include /* COMMAND_LINE_SIZE */ #include "trace.h" #include "trace_output.h" @@ -5995,7 +5995,7 @@ struct trace_mod_entry { }; struct trace_scratch { - unsigned long kaslr_addr; + unsigned long text_addr; unsigned long nr_entries; struct trace_mod_entry entries[]; }; @@ -6133,11 +6133,7 @@ static void update_last_data(struct trace_array *tr) kfree_rcu(module_delta, rcu); /* Set the persistent ring buffer meta data to this address */ -#ifdef CONFIG_RANDOMIZE_BASE - tscratch->kaslr_addr = kaslr_offset(); -#else - tscratch->kaslr_addr = 0; -#endif + tscratch->text_addr = (unsigned long)_text; } /** @@ -6996,7 +6992,7 @@ static void show_last_boot_header(struct seq_file *m, struct trace_array *tr) * should not be the same as the current boot. */ if (tscratch && (tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) - seq_printf(m, "%lx\t[kernel]\n", tscratch->kaslr_addr); + seq_printf(m, "%lx\t[kernel]\n", tscratch->text_addr); else seq_puts(m, "# Current\n"); } @@ -9465,10 +9461,8 @@ static void setup_trace_scratch(struct trace_array *tr, tr->scratch = tscratch; tr->scratch_size = size; -#ifdef CONFIG_RANDOMIZE_BASE - if (tscratch->kaslr_addr) - tr->text_delta = kaslr_offset() - tscratch->kaslr_addr; -#endif + if (tscratch->text_addr) + tr->text_delta = (unsigned long)_text - tscratch->text_addr; if (struct_size(tscratch, entries, tscratch->nr_entries) > size) goto reset;