From dd4900d94f2f7bf3ccfdd4479a3d159e93cca2de Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 12 Jun 2024 19:19:35 -0400 Subject: [PATCH 01/28] ring-buffer: Allow mapped field to be set without mapping In preparation for having the ring buffer mapped to a dedicated location, which will have the same restrictions as user space memory mapped buffers, allow it to use the "mapped" field of the ring_buffer_per_cpu structure without having the user space meta page mapping. When this starts using the mapped field, it will need to handle adding a user space mapping (and removing it) from a ring buffer that is using a dedicated memory range. Link: https://lkml.kernel.org/r/20240612232025.190908567@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Joel Fernandes Cc: Daniel Bristot de Oliveira Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vineeth Pillai Cc: Youssef Esmat Cc: Beau Belgrave Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Petkov Cc: "Paul E. McKenney" Cc: David Howells Cc: Mike Rapoport Cc: Dave Hansen Cc: Tony Luck Cc: Guenter Roeck Cc: Ross Zwisler Cc: Kees Cook Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 28853966aa9af..a240bdc0f2d87 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -491,6 +491,7 @@ struct ring_buffer_per_cpu { unsigned long pages_removed; unsigned int mapped; + unsigned int user_mapped; /* user space mapping */ struct mutex mapping_lock; unsigned long *subbuf_ids; /* ID to subbuf VA */ struct trace_buffer_meta *meta_page; @@ -5224,6 +5225,9 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) { struct trace_buffer_meta *meta = cpu_buffer->meta_page; + if (!meta) + return; + meta->reader.read = cpu_buffer->reader_page->read; meta->reader.id = cpu_buffer->reader_page->id; meta->reader.lost_events = cpu_buffer->lost_events; @@ -5280,7 +5284,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->lost_events = 0; cpu_buffer->last_overrun = 0; - if (cpu_buffer->mapped) + if (cpu_buffer->user_mapped) rb_update_meta_page(cpu_buffer); rb_head_page_activate(cpu_buffer); @@ -6167,7 +6171,7 @@ rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu) mutex_lock(&cpu_buffer->mapping_lock); - if (!cpu_buffer->mapped) { + if (!cpu_buffer->user_mapped) { mutex_unlock(&cpu_buffer->mapping_lock); return ERR_PTR(-ENODEV); } @@ -6191,19 +6195,26 @@ static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer, lockdep_assert_held(&cpu_buffer->mapping_lock); + /* mapped is always greater or equal to user_mapped */ + if (WARN_ON(cpu_buffer->mapped < cpu_buffer->user_mapped)) + return -EINVAL; + if (inc && cpu_buffer->mapped == UINT_MAX) return -EBUSY; - if (WARN_ON(!inc && cpu_buffer->mapped == 0)) + if (WARN_ON(!inc && cpu_buffer->user_mapped == 0)) return -EINVAL; mutex_lock(&cpu_buffer->buffer->mutex); raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - if (inc) + if (inc) { + cpu_buffer->user_mapped++; cpu_buffer->mapped++; - else + } else { + cpu_buffer->user_mapped--; cpu_buffer->mapped--; + } raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); mutex_unlock(&cpu_buffer->buffer->mutex); @@ -6328,7 +6339,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, mutex_lock(&cpu_buffer->mapping_lock); - if 
(cpu_buffer->mapped) { + if (cpu_buffer->user_mapped) { err = __rb_map_vma(cpu_buffer, vma); if (!err) err = __rb_inc_dec_mapped(cpu_buffer, true); @@ -6359,12 +6370,15 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, */ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_setup_ids_meta_page(cpu_buffer, subbuf_ids); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); err = __rb_map_vma(cpu_buffer, vma); if (!err) { raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - cpu_buffer->mapped = 1; + /* This is the first time it is mapped by user */ + cpu_buffer->mapped++; + cpu_buffer->user_mapped = 1; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } else { kfree(cpu_buffer->subbuf_ids); @@ -6392,10 +6406,10 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) mutex_lock(&cpu_buffer->mapping_lock); - if (!cpu_buffer->mapped) { + if (!cpu_buffer->user_mapped) { err = -ENODEV; goto out; - } else if (cpu_buffer->mapped > 1) { + } else if (cpu_buffer->user_mapped > 1) { __rb_inc_dec_mapped(cpu_buffer, false); goto out; } @@ -6403,7 +6417,10 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) mutex_lock(&buffer->mutex); raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - cpu_buffer->mapped = 0; + /* This is the last user space mapping */ + if (!WARN_ON_ONCE(cpu_buffer->mapped < cpu_buffer->user_mapped)) + cpu_buffer->mapped--; + cpu_buffer->user_mapped = 0; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); From be68d63a139bd4a0eae44d06234eaff8c07d207c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 12 Jun 2024 19:19:36 -0400 Subject: [PATCH 02/28] ring-buffer: Add ring_buffer_alloc_range() In preparation to allowing the trace ring buffer to be allocated in a range of memory that is persistent across reboots, add ring_buffer_alloc_range(). It takes a contiguous range of memory and will split it up evenly for the per CPU ring buffers. If there's not enough memory to handle all CPUs with the minimum size, it will fail to allocate the ring buffer. Link: https://lkml.kernel.org/r/20240612232025.363998725@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Joel Fernandes Cc: Daniel Bristot de Oliveira Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vineeth Pillai Cc: Youssef Esmat Cc: Beau Belgrave Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Petkov Cc: "Paul E. McKenney" Cc: David Howells Cc: Mike Rapoport Cc: Dave Hansen Cc: Tony Luck Cc: Guenter Roeck Cc: Ross Zwisler Cc: Kees Cook Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 17 +++ kernel/trace/ring_buffer.c | 239 ++++++++++++++++++++++++++++++------ 2 files changed, 220 insertions(+), 36 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 96d2140b471ed..a50b0223b1d32 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -89,6 +89,11 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer, struct trace_buffer * __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key); +struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags, + int order, unsigned long start, + unsigned long range_size, + struct lock_class_key *key); + /* * Because the ring buffer is generic, if other users of the ring buffer get * traced by ftrace, it can produce lockdep warnings. 
We need to keep each @@ -100,6 +105,18 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k __ring_buffer_alloc((size), (flags), &__key); \ }) +/* + * Because the ring buffer is generic, if other users of the ring buffer get + * traced by ftrace, it can produce lockdep warnings. We need to keep each + * ring buffer's lock class separate. + */ +#define ring_buffer_alloc_range(size, flags, order, start, range_size) \ +({ \ + static struct lock_class_key __key; \ + __ring_buffer_alloc_range((size), (flags), (order), (start), \ + (range_size), &__key); \ +}) + typedef bool (*ring_buffer_cond_fn)(void *data); int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full, ring_buffer_cond_fn cond, void *data); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a240bdc0f2d87..6edd70e458071 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -42,6 +42,9 @@ static void update_pages_handler(struct work_struct *work); +struct ring_buffer_meta { +}; + /* * The ring buffer header is special. We must manually up keep it. */ @@ -342,7 +345,8 @@ struct buffer_page { local_t entries; /* entries on this page */ unsigned long real_end; /* real end of data */ unsigned order; /* order of the page */ - u32 id; /* ID for external mapping */ + u32 id:30; /* ID for external mapping */ + u32 range:1; /* Mapped via a range */ struct buffer_data_page *page; /* Actual data page */ }; @@ -373,7 +377,9 @@ static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage) static void free_buffer_page(struct buffer_page *bpage) { - free_pages((unsigned long)bpage->page, bpage->order); + /* Range pages are not to be freed */ + if (!bpage->range) + free_pages((unsigned long)bpage->page, bpage->order); kfree(bpage); } @@ -524,6 +530,9 @@ struct trace_buffer { struct rb_irq_work irq_work; bool time_stamp_abs; + unsigned long range_addr_start; + unsigned long range_addr_end; + unsigned int subbuf_size; unsigned int subbuf_order; unsigned int max_data_size; @@ -1491,9 +1500,70 @@ static void rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) } } +/* + * Take an address, add the meta data size as well as the array of + * array subbuffer indexes, then align it to a subbuffer size. + * + * This is used to help find the next per cpu subbuffer within a mapped range. + */ +static unsigned long +rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs) +{ + addr += sizeof(struct ring_buffer_meta) + + sizeof(int) * nr_subbufs; + return ALIGN(addr, subbuf_size); +} + +/* + * Return a specific sub-buffer for a given @cpu defined by @idx. + */ +static void *rb_range_buffer(struct trace_buffer *buffer, int cpu, int nr_pages, int idx) +{ + unsigned long ptr; + int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE; + int nr_subbufs; + + /* Include the reader page */ + nr_subbufs = nr_pages + 1; + + /* + * The first chunk may not be subbuffer aligned, where as + * the rest of the chunks are. 
+ */ + ptr = buffer->range_addr_start; + ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs); + if (cpu) { + unsigned long p; + + ptr += subbuf_size * nr_subbufs; + + /* Save the beginning of this CPU chunk */ + p = ptr; + + ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs); + + /* We can use multiplication to find chunks greater than 1 */ + if (cpu > 1) { + unsigned long size; + + ptr += subbuf_size * nr_subbufs; + + /* Now all chunks after this are the same size */ + size = ptr - p; + ptr += size * (cpu - 2); + + ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs); + } + } + if (ptr + subbuf_size * nr_subbufs > buffer->range_addr_end) + return NULL; + return (void *)ptr; +} + static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, long nr_pages, struct list_head *pages) { + struct trace_buffer *buffer = cpu_buffer->buffer; struct buffer_page *bpage, *tmp; bool user_thread = current->mm != NULL; gfp_t mflags; @@ -1530,6 +1600,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, set_current_oom_origin(); for (i = 0; i < nr_pages; i++) { struct page *page; + int cpu = cpu_buffer->cpu; bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), mflags, cpu_to_node(cpu_buffer->cpu)); @@ -1538,14 +1609,26 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_check_bpage(cpu_buffer, bpage); - list_add(&bpage->list, pages); - - page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), - mflags | __GFP_COMP | __GFP_ZERO, - cpu_buffer->buffer->subbuf_order); - if (!page) - goto free_pages; - bpage->page = page_address(page); + /* + * Append the pages as for mapped buffers we want to keep + * the order + */ + list_add_tail(&bpage->list, pages); + + if (buffer->range_addr_start) { + /* A range was given. Use that for the buffer page */ + bpage->page = rb_range_buffer(buffer, cpu, nr_pages, i + 1); + if (!bpage->page) + goto free_pages; + bpage->range = 1; + } else { + page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), + mflags | __GFP_COMP | __GFP_ZERO, + cpu_buffer->buffer->subbuf_order); + if (!page) + goto free_pages; + bpage->page = page_address(page); + } bpage->order = cpu_buffer->buffer->subbuf_order; rb_init_page(bpage->page); @@ -1627,11 +1710,19 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) cpu_buffer->reader_page = bpage; - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_COMP | __GFP_ZERO, - cpu_buffer->buffer->subbuf_order); - if (!page) - goto fail_free_reader; - bpage->page = page_address(page); + if (buffer->range_addr_start) { + bpage->page = rb_range_buffer(buffer, cpu, nr_pages, 0); + if (!bpage->page) + goto fail_free_reader; + bpage->range = 1; + } else { + page = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL | __GFP_COMP | __GFP_ZERO, + cpu_buffer->buffer->subbuf_order); + if (!page) + goto fail_free_reader; + bpage->page = page_address(page); + } rb_init_page(bpage->page); INIT_LIST_HEAD(&cpu_buffer->reader_page->list); @@ -1682,22 +1773,14 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) kfree(cpu_buffer); } -/** - * __ring_buffer_alloc - allocate a new ring_buffer - * @size: the size in bytes per cpu that is needed. - * @flags: attributes to set for the ring buffer. - * @key: ring buffer reader_lock_key. - * - * Currently the only flag that is available is the RB_FL_OVERWRITE - * flag. This flag means that the buffer will overwrite old data - * when the buffer wraps. 
If this flag is not set, the buffer will - * drop data when the tail hits the head. - */ -struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, - struct lock_class_key *key) +static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, + int order, unsigned long start, + unsigned long end, + struct lock_class_key *key) { struct trace_buffer *buffer; long nr_pages; + int subbuf_size; int bsize; int cpu; int ret; @@ -1711,14 +1794,13 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) goto fail_free_buffer; - /* Default buffer page size - one system page */ - buffer->subbuf_order = 0; - buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE; + buffer->subbuf_order = order; + subbuf_size = (PAGE_SIZE << order); + buffer->subbuf_size = subbuf_size - BUF_PAGE_HDR_SIZE; /* Max payload is buffer page size - header (8bytes) */ buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2); - nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size); buffer->flags = flags; buffer->clock = trace_clock_local; buffer->reader_lock_key = key; @@ -1726,10 +1808,6 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters); init_waitqueue_head(&buffer->irq_work.waiters); - /* need at least two pages */ - if (nr_pages < 2) - nr_pages = 2; - buffer->cpus = nr_cpu_ids; bsize = sizeof(void *) * nr_cpu_ids; @@ -1738,6 +1816,54 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (!buffer->buffers) goto fail_free_cpumask; + /* If start/end are specified, then that overrides size */ + if (start && end) { + unsigned long ptr; + int n; + + size = end - start; + size = size / nr_cpu_ids; + + /* + * The number of sub-buffers (nr_pages) is determined by the + * total size allocated minus the meta data size. + * Then that is divided by the number of per CPU buffers + * needed, plus account for the integer array index that + * will be appended to the meta data. + */ + nr_pages = (size - sizeof(struct ring_buffer_meta)) / + (subbuf_size + sizeof(int)); + /* Need at least two pages plus the reader page */ + if (nr_pages < 3) + goto fail_free_buffers; + + again: + /* Make sure that the size fits aligned */ + for (n = 0, ptr = start; n < nr_cpu_ids; n++) { + ptr += sizeof(struct ring_buffer_meta) + + sizeof(int) * nr_pages; + ptr = ALIGN(ptr, subbuf_size); + ptr += subbuf_size * nr_pages; + } + if (ptr > end) { + if (nr_pages <= 3) + goto fail_free_buffers; + nr_pages--; + goto again; + } + + /* nr_pages should not count the reader page */ + nr_pages--; + buffer->range_addr_start = start; + buffer->range_addr_end = end; + } else { + + /* need at least two pages */ + nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size); + if (nr_pages < 2) + nr_pages = 2; + } + cpu = raw_smp_processor_id(); cpumask_set_cpu(cpu, buffer->cpumask); buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); @@ -1766,8 +1892,49 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, kfree(buffer); return NULL; } + +/** + * __ring_buffer_alloc - allocate a new ring_buffer + * @size: the size in bytes per cpu that is needed. + * @flags: attributes to set for the ring buffer. + * @key: ring buffer reader_lock_key. + * + * Currently the only flag that is available is the RB_FL_OVERWRITE + * flag. This flag means that the buffer will overwrite old data + * when the buffer wraps. 
If this flag is not set, the buffer will + * drop data when the tail hits the head. + */ +struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, + struct lock_class_key *key) +{ + /* Default buffer page size - one system page */ + return alloc_buffer(size, flags, 0, 0, 0,key); + +} EXPORT_SYMBOL_GPL(__ring_buffer_alloc); +/** + * __ring_buffer_alloc_range - allocate a new ring_buffer from existing memory + * @size: the size in bytes per cpu that is needed. + * @flags: attributes to set for the ring buffer. + * @start: start of allocated range + * @range_size: size of allocated range + * @order: sub-buffer order + * @key: ring buffer reader_lock_key. + * + * Currently the only flag that is available is the RB_FL_OVERWRITE + * flag. This flag means that the buffer will overwrite old data + * when the buffer wraps. If this flag is not set, the buffer will + * drop data when the tail hits the head. + */ +struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags, + int order, unsigned long start, + unsigned long range_size, + struct lock_class_key *key) +{ + return alloc_buffer(size, flags, order, start, start + range_size, key); +} + /** * ring_buffer_free - free a ring buffer. * @buffer: the buffer to free. From b14d032973d4e6c36e025e96ea3437de53d072d9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 12 Jun 2024 19:19:37 -0400 Subject: [PATCH 03/28] ring-buffer: Add ring_buffer_meta data Populate the ring_buffer_meta array. It holds the pointer to the head_buffer (next to read), the commit_buffer (next to write) the size of the sub-buffers, number of sub-buffers and an array that keeps track of the order of the sub-buffers. This information will be stored in the persistent memory to help on reboot to reconstruct the ring buffer. Link: https://lkml.kernel.org/r/20240612232025.530733577@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Joel Fernandes Cc: Daniel Bristot de Oliveira Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vineeth Pillai Cc: Youssef Esmat Cc: Beau Belgrave Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Petkov Cc: "Paul E. McKenney" Cc: David Howells Cc: Mike Rapoport Cc: Dave Hansen Cc: Tony Luck Cc: Guenter Roeck Cc: Ross Zwisler Cc: Kees Cook Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 209 ++++++++++++++++++++++++++++++++----- 1 file changed, 184 insertions(+), 25 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6edd70e458071..0e3ed7f1cc5df 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -43,6 +43,11 @@ static void update_pages_handler(struct work_struct *work); struct ring_buffer_meta { + unsigned long head_buffer; + unsigned long commit_buffer; + __u32 subbuf_size; + __u32 nr_subbufs; + int buffers[]; }; /* @@ -501,6 +506,7 @@ struct ring_buffer_per_cpu { struct mutex mapping_lock; unsigned long *subbuf_ids; /* ID to subbuf VA */ struct trace_buffer_meta *meta_page; + struct ring_buffer_meta *ring_meta; /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; @@ -1261,6 +1267,11 @@ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer) * Set the previous list pointer to have the HEAD flag. 
*/ rb_set_list_to_head(head->list.prev); + + if (cpu_buffer->ring_meta) { + struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + meta->head_buffer = (unsigned long)head->page; + } } static void rb_list_head_clear(struct list_head *list) @@ -1515,51 +1526,127 @@ rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs) } /* - * Return a specific sub-buffer for a given @cpu defined by @idx. + * Return the ring_buffer_meta for a given @cpu. */ -static void *rb_range_buffer(struct trace_buffer *buffer, int cpu, int nr_pages, int idx) +static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu) { - unsigned long ptr; int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE; + unsigned long ptr = buffer->range_addr_start; + struct ring_buffer_meta *meta; int nr_subbufs; - /* Include the reader page */ - nr_subbufs = nr_pages + 1; + if (!ptr) + return NULL; + + /* When nr_pages passed in is zero, the first meta has already been initialized */ + if (!nr_pages) { + meta = (struct ring_buffer_meta *)ptr; + nr_subbufs = meta->nr_subbufs; + } else { + meta = NULL; + /* Include the reader page */ + nr_subbufs = nr_pages + 1; + } /* * The first chunk may not be subbuffer aligned, where as * the rest of the chunks are. */ - ptr = buffer->range_addr_start; - ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs); if (cpu) { - unsigned long p; - - ptr += subbuf_size * nr_subbufs; - - /* Save the beginning of this CPU chunk */ - p = ptr; - ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs); + ptr += subbuf_size * nr_subbufs; /* We can use multiplication to find chunks greater than 1 */ if (cpu > 1) { unsigned long size; + unsigned long p; + /* Save the beginning of this CPU chunk */ + p = ptr; + ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs); ptr += subbuf_size * nr_subbufs; /* Now all chunks after this are the same size */ size = ptr - p; ptr += size * (cpu - 2); - - ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs); } } - if (ptr + subbuf_size * nr_subbufs > buffer->range_addr_end) + return (void *)ptr; +} + +/* Return the start of subbufs given the meta pointer */ +static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta) +{ + int subbuf_size = meta->subbuf_size; + unsigned long ptr; + + ptr = (unsigned long)meta; + ptr = rb_range_align_subbuf(ptr, subbuf_size, meta->nr_subbufs); + + return (void *)ptr; +} + +/* + * Return a specific sub-buffer for a given @cpu defined by @idx. 
+ */ +static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx) +{ + struct ring_buffer_meta *meta; + unsigned long ptr; + int subbuf_size; + + meta = rb_range_meta(cpu_buffer->buffer, 0, cpu_buffer->cpu); + if (!meta) + return NULL; + + if (WARN_ON_ONCE(idx >= meta->nr_subbufs)) return NULL; + + subbuf_size = meta->subbuf_size; + + /* Map this buffer to the order that's in meta->buffers[] */ + idx = meta->buffers[idx]; + + ptr = (unsigned long)rb_subbufs_from_meta(meta); + + ptr += subbuf_size * idx; + if (ptr + subbuf_size > cpu_buffer->buffer->range_addr_end) + return NULL; + return (void *)ptr; } +static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) +{ + struct ring_buffer_meta *meta; + void *subbuf; + int cpu; + int i; + + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { + meta = rb_range_meta(buffer, nr_pages, cpu); + + meta->nr_subbufs = nr_pages + 1; + meta->subbuf_size = PAGE_SIZE; + + subbuf = rb_subbufs_from_meta(meta); + + /* + * The buffers[] array holds the order of the sub-buffers + * that are after the meta data. The sub-buffers may + * be swapped out when read and inserted into a different + * location of the ring buffer. Although their addresses + * remain the same, the buffers[] array contains the + * index into the sub-buffers holding their actual order. + */ + for (i = 0; i < meta->nr_subbufs; i++) { + meta->buffers[i] = i; + rb_init_page(subbuf); + subbuf += meta->subbuf_size; + } + } +} + static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, long nr_pages, struct list_head *pages) { @@ -1600,7 +1687,6 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, set_current_oom_origin(); for (i = 0; i < nr_pages; i++) { struct page *page; - int cpu = cpu_buffer->cpu; bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), mflags, cpu_to_node(cpu_buffer->cpu)); @@ -1617,10 +1703,11 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, if (buffer->range_addr_start) { /* A range was given. Use that for the buffer page */ - bpage->page = rb_range_buffer(buffer, cpu, nr_pages, i + 1); + bpage->page = rb_range_buffer(cpu_buffer, i + 1); if (!bpage->page) goto free_pages; bpage->range = 1; + bpage->id = i + 1; } else { page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags | __GFP_COMP | __GFP_ZERO, @@ -1628,9 +1715,9 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, if (!page) goto free_pages; bpage->page = page_address(page); + rb_init_page(bpage->page); } bpage->order = cpu_buffer->buffer->subbuf_order; - rb_init_page(bpage->page); if (user_thread && fatal_signal_pending(current)) goto free_pages; @@ -1711,7 +1798,13 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) cpu_buffer->reader_page = bpage; if (buffer->range_addr_start) { - bpage->page = rb_range_buffer(buffer, cpu, nr_pages, 0); + /* + * Range mapped buffers have the same restrictions as memory + * mapped ones do. 
+ */ + cpu_buffer->mapped = 1; + cpu_buffer->ring_meta = rb_range_meta(buffer, nr_pages, cpu); + bpage->page = rb_range_buffer(cpu_buffer, 0); if (!bpage->page) goto fail_free_reader; bpage->range = 1; @@ -1722,8 +1815,8 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) if (!page) goto fail_free_reader; bpage->page = page_address(page); + rb_init_page(bpage->page); } - rb_init_page(bpage->page); INIT_LIST_HEAD(&cpu_buffer->reader_page->list); INIT_LIST_HEAD(&cpu_buffer->new_pages); @@ -1737,6 +1830,10 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; rb_head_page_activate(cpu_buffer); + if (cpu_buffer->ring_meta) { + struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + meta->commit_buffer = meta->head_buffer; + } return cpu_buffer; @@ -1856,6 +1953,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, nr_pages--; buffer->range_addr_start = start; buffer->range_addr_end = end; + + rb_range_meta_init(buffer, nr_pages); } else { /* need at least two pages */ @@ -2544,6 +2643,52 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) iter->next_event = 0; } +/* Return the index into the sub-buffers for a given sub-buffer */ +static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf) +{ + void *subbuf_array; + + subbuf_array = (void *)meta + sizeof(int) * meta->nr_subbufs; + subbuf_array = (void *)ALIGN((unsigned long)subbuf_array, meta->subbuf_size); + return (subbuf - subbuf_array) / meta->subbuf_size; +} + +static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *next_page) +{ + struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + unsigned long old_head = (unsigned long)next_page->page; + unsigned long new_head; + + rb_inc_page(&next_page); + new_head = (unsigned long)next_page->page; + + /* + * Only move it forward once, if something else came in and + * moved it forward, then we don't want to touch it. + */ + (void)cmpxchg(&meta->head_buffer, old_head, new_head); +} + +static void rb_update_meta_reader(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *reader) +{ + struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + void *old_reader = cpu_buffer->reader_page->page; + void *new_reader = reader->page; + int id; + + id = reader->id; + cpu_buffer->reader_page->id = id; + reader->id = 0; + + meta->buffers[0] = rb_meta_subbuf_idx(meta, new_reader); + meta->buffers[id] = rb_meta_subbuf_idx(meta, old_reader); + + /* The head pointer is the one after the reader */ + rb_update_meta_head(cpu_buffer, reader); +} + /* * rb_handle_head_page - writer hit the head page * @@ -2593,6 +2738,8 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes); local_inc(&cpu_buffer->pages_lost); + if (cpu_buffer->ring_meta) + rb_update_meta_head(cpu_buffer, next_page); /* * The entries will be zeroed out when we move the * tail page. 
@@ -3154,6 +3301,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->commit_page->page->commit, rb_page_write(cpu_buffer->commit_page)); rb_inc_page(&cpu_buffer->commit_page); + if (cpu_buffer->ring_meta) { + struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + meta->commit_buffer = (unsigned long)cpu_buffer->commit_page->page; + } /* add barrier to keep gcc from optimizing too much */ barrier(); } @@ -4771,6 +4922,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) if (!ret) goto spin; + if (cpu_buffer->ring_meta) + rb_update_meta_reader(cpu_buffer, reader); + /* * Yay! We succeeded in replacing the page. * @@ -5451,11 +5605,16 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->lost_events = 0; cpu_buffer->last_overrun = 0; - if (cpu_buffer->user_mapped) - rb_update_meta_page(cpu_buffer); - rb_head_page_activate(cpu_buffer); cpu_buffer->pages_removed = 0; + + if (cpu_buffer->mapped) { + rb_update_meta_page(cpu_buffer); + if (cpu_buffer->ring_meta) { + struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + meta->commit_buffer = meta->head_buffer; + } + } } /* Must have disabled the cpu buffer then done a synchronize_rcu */ From 2124de79adaa0a0bd1795f1996774e2e69157766 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 12 Jun 2024 19:19:38 -0400 Subject: [PATCH 04/28] tracing: Implement creating an instance based on a given memory region Allow for creating a new instance by passing in an address and size to map the ring buffer for the instance to. This will allow features like a pstore memory mapped region to be used for an tracing instance ring buffer that can be retrieved from one boot to the next. Link: https://lkml.kernel.org/r/20240612232025.692086240@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Joel Fernandes Cc: Daniel Bristot de Oliveira Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vineeth Pillai Cc: Youssef Esmat Cc: Beau Belgrave Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Petkov Cc: "Paul E. 
McKenney" Cc: David Howells Cc: Mike Rapoport Cc: Dave Hansen Cc: Tony Luck Cc: Guenter Roeck Cc: Ross Zwisler Cc: Kees Cook Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 50 +++++++++++++++++++++++++++++++++++--------- kernel/trace/trace.h | 6 +++++- 2 files changed, 45 insertions(+), 11 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 578a49ff5c32e..ff2b504fbe002 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4921,6 +4921,11 @@ static int tracing_open(struct inode *inode, struct file *file) static bool trace_ok_for_array(struct tracer *t, struct trace_array *tr) { +#ifdef CONFIG_TRACER_SNAPSHOT + /* arrays with mapped buffer range do not have snapshots */ + if (tr->range_addr_start && t->use_max_tr) + return false; +#endif return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances; } @@ -8664,11 +8669,13 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) tr, cpu, &tracing_entries_fops); #ifdef CONFIG_TRACER_SNAPSHOT - trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu, - tr, cpu, &snapshot_fops); + if (!tr->range_addr_start) { + trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu, + tr, cpu, &snapshot_fops); - trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu, - tr, cpu, &snapshot_raw_fops); + trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu, + tr, cpu, &snapshot_raw_fops); + } #endif } @@ -9205,7 +9212,18 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size buf->tr = tr; - buf->buffer = ring_buffer_alloc(size, rb_flags); + if (tr->range_addr_start && tr->range_addr_size) { + buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0, + tr->range_addr_start, + tr->range_addr_size); + /* + * This is basically the same as a mapped buffer, + * with the same restrictions. + */ + tr->mapped++; + } else { + buf->buffer = ring_buffer_alloc(size, rb_flags); + } if (!buf->buffer) return -ENOMEM; @@ -9242,6 +9260,10 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) return ret; #ifdef CONFIG_TRACER_MAX_TRACE + /* Fix mapped buffer trace arrays do not have snapshot buffers */ + if (tr->range_addr_start) + return 0; + ret = allocate_trace_buffer(tr, &tr->max_buffer, allocate_snapshot ? 
size : 1); if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) { @@ -9342,7 +9364,9 @@ static int trace_array_create_dir(struct trace_array *tr) } static struct trace_array * -trace_array_create_systems(const char *name, const char *systems) +trace_array_create_systems(const char *name, const char *systems, + unsigned long range_addr_start, + unsigned long range_addr_size) { struct trace_array *tr; int ret; @@ -9368,6 +9392,10 @@ trace_array_create_systems(const char *name, const char *systems) goto out_free_tr; } + /* Only for boot up memory mapped ring buffers */ + tr->range_addr_start = range_addr_start; + tr->range_addr_size = range_addr_size; + tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS; cpumask_copy(tr->tracing_cpumask, cpu_all_mask); @@ -9425,7 +9453,7 @@ trace_array_create_systems(const char *name, const char *systems) static struct trace_array *trace_array_create(const char *name) { - return trace_array_create_systems(name, NULL); + return trace_array_create_systems(name, NULL, 0, 0); } static int instance_mkdir(const char *name) @@ -9479,7 +9507,7 @@ struct trace_array *trace_array_get_by_name(const char *name, const char *system goto out_unlock; } - tr = trace_array_create_systems(name, systems); + tr = trace_array_create_systems(name, systems, 0, 0); if (IS_ERR(tr)) tr = NULL; @@ -9672,8 +9700,10 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) MEM_FAIL(1, "Could not allocate function filter files"); #ifdef CONFIG_TRACER_SNAPSHOT - trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer, - tr, &snapshot_fops); + if (!tr->range_addr_start) { + trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer, + tr, &snapshot_fops); + } #endif trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 749a182dab480..5dd48932509c7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -336,7 +336,6 @@ struct trace_array { bool allocated_snapshot; spinlock_t snapshot_trigger_lock; unsigned int snapshot; - unsigned int mapped; unsigned long max_latency; #ifdef CONFIG_FSNOTIFY struct dentry *d_max_latency; @@ -344,6 +343,11 @@ struct trace_array { struct irq_work fsnotify_irqwork; #endif #endif + /* The below is for memory mapped ring buffer */ + unsigned int mapped; + unsigned long range_addr_start; + unsigned long range_addr_size; + struct trace_pid_list __rcu *filtered_pids; struct trace_pid_list __rcu *filtered_no_pids; /* From 950032ffcee7a43c960951940059afc5a2bdbcbc Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 12 Jun 2024 19:19:39 -0400 Subject: [PATCH 05/28] ring-buffer: Add output of ring buffer meta page Add a buffer_meta per-cpu file for the trace instance that is mapped to boot memory. This shows the current meta-data and can be used by user space tools to record off the current mappings to help reconstruct the ring buffer after a reboot. It does not expose any virtual addresses, just indexes into the sub-buffer pages. Link: https://lkml.kernel.org/r/20240612232025.854471446@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Joel Fernandes Cc: Daniel Bristot de Oliveira Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vineeth Pillai Cc: Youssef Esmat Cc: Beau Belgrave Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Petkov Cc: "Paul E. 
McKenney" Cc: David Howells Cc: Mike Rapoport Cc: Dave Hansen Cc: Tony Luck Cc: Guenter Roeck Cc: Ross Zwisler Cc: Kees Cook Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 77 ++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.c | 30 ++++++++++++++- kernel/trace/trace.h | 2 + 3 files changed, 107 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 0e3ed7f1cc5df..25b0e61e8c766 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -32,6 +32,8 @@ #include #include +#include "trace.h" + /* * The "absolute" timestamp in the buffer is only 59 bits. * If a clock has the 5 MSBs set, it needs to be saved and @@ -1647,6 +1649,81 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) } } +static void *rbm_start(struct seq_file *m, loff_t *pos) +{ + struct ring_buffer_per_cpu *cpu_buffer = m->private; + struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + unsigned long val; + + if (!meta) + return NULL; + + if (*pos > meta->nr_subbufs) + return NULL; + + val = *pos; + val++; + + return (void *)val; +} + +static void *rbm_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + + return rbm_start(m, pos); +} + +static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf); + +static int rbm_show(struct seq_file *m, void *v) +{ + struct ring_buffer_per_cpu *cpu_buffer = m->private; + struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + unsigned long val = (unsigned long)v; + + if (val == 1) { + seq_printf(m, "head_buffer: %d\n", + rb_meta_subbuf_idx(meta, (void *)meta->head_buffer)); + seq_printf(m, "commit_buffer: %d\n", + rb_meta_subbuf_idx(meta, (void *)meta->commit_buffer)); + seq_printf(m, "subbuf_size: %d\n", meta->subbuf_size); + seq_printf(m, "nr_subbufs: %d\n", meta->nr_subbufs); + return 0; + } + + val -= 2; + seq_printf(m, "buffer[%ld]: %d\n", val, meta->buffers[val]); + + return 0; +} + +static void rbm_stop(struct seq_file *m, void *p) +{ +} + +static const struct seq_operations rb_meta_seq_ops = { + .start = rbm_start, + .next = rbm_next, + .show = rbm_show, + .stop = rbm_stop, +}; + +int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, int cpu) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, &rb_meta_seq_ops); + if (ret) + return ret; + + m = file->private_data; + m->private = buffer->buffers[cpu]; + + return 0; +} + static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, long nr_pages, struct list_head *pages) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ff2b504fbe002..622fe670949d7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5018,7 +5018,7 @@ static int show_traces_open(struct inode *inode, struct file *file) return 0; } -static int show_traces_release(struct inode *inode, struct file *file) +static int tracing_seq_release(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; @@ -5059,7 +5059,7 @@ static const struct file_operations show_traces_fops = { .open = show_traces_open, .read = seq_read, .llseek = seq_lseek, - .release = show_traces_release, + .release = tracing_seq_release, }; static ssize_t @@ -6860,6 +6860,22 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } +static int tracing_buffer_meta_open(struct inode *inode, struct file *filp) +{ + struct trace_array *tr = inode->i_private; + int cpu = tracing_get_cpu(inode); + int 
ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + ret = ring_buffer_meta_seq_init(filp, tr->array_buffer.buffer, cpu); + if (ret < 0) + __trace_array_put(tr); + return ret; +} + static ssize_t tracing_free_buffer_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -7436,6 +7452,13 @@ static const struct file_operations tracing_entries_fops = { .release = tracing_release_generic_tr, }; +static const struct file_operations tracing_buffer_meta_fops = { + .open = tracing_buffer_meta_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_seq_release, +}; + static const struct file_operations tracing_total_entries_fops = { .open = tracing_open_generic_tr, .read = tracing_total_entries_read, @@ -8668,6 +8691,9 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) trace_create_cpu_file("buffer_size_kb", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_entries_fops); + if (tr->range_addr_start) + trace_create_cpu_file("buffer_meta", TRACE_MODE_READ, d_cpu, + tr, cpu, &tracing_buffer_meta_fops); #ifdef CONFIG_TRACER_SNAPSHOT if (!tr->range_addr_start) { trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5dd48932509c7..3e56d3b222126 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -645,6 +645,8 @@ trace_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long len, unsigned int trace_ctx); +int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, int cpu); + struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); From c76883f18e59b762247ee91d3e4224231711854e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 12 Jun 2024 19:19:40 -0400 Subject: [PATCH 06/28] ring-buffer: Add test if range of boot buffer is valid Add a test against the ring buffer memory range to see if it has valid data. The ring_buffer_meta structure is given a new field called "first_buffer" which holds the address of the first sub-buffer. This is used to both determine if the other fields are valid as well as finding the offset between the old addresses of the sub-buffer from the previous boot to the new addresses of the current boot. Since the values for nr_subbufs and subbuf_size is to be the same, check if the values in the meta page match the values calculated. Take the range of the first_buffer and the total size of all the buffers and make sure the saved head_buffer and commit_buffer fall in the range. Iterate through all the sub-buffers to make sure that the values in the sub-buffer "commit" field (the field that holds the amount of data on the sub-buffer) is within the end of the sub-buffer. Also check the index array to make sure that all the indexes are within nr_subbufs. Link: https://lkml.kernel.org/r/20240612232026.013843655@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Joel Fernandes Cc: Daniel Bristot de Oliveira Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vineeth Pillai Cc: Youssef Esmat Cc: Beau Belgrave Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Petkov Cc: "Paul E. 
McKenney" Cc: David Howells Cc: Mike Rapoport Cc: Dave Hansen Cc: Tony Luck Cc: Guenter Roeck Cc: Ross Zwisler Cc: Kees Cook Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 143 ++++++++++++++++++++++++++++++++++--- 1 file changed, 135 insertions(+), 8 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 25b0e61e8c766..588bc057bad78 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -45,6 +45,7 @@ static void update_pages_handler(struct work_struct *work); struct ring_buffer_meta { + unsigned long first_buffer; unsigned long head_buffer; unsigned long commit_buffer; __u32 subbuf_size; @@ -1618,21 +1619,103 @@ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx) return (void *)ptr; } +/* + * See if the existing memory contains valid ring buffer data. + * As the previous kernel must be the same as this kernel, all + * the calculations (size of buffers and number of buffers) + * must be the same. + */ +static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, + struct trace_buffer *buffer, int nr_pages) +{ + int subbuf_size = PAGE_SIZE; + struct buffer_data_page *subbuf; + unsigned long buffers_start; + unsigned long buffers_end; + int i; + + /* The subbuffer's size and number of subbuffers must match */ + if (meta->subbuf_size != subbuf_size || + meta->nr_subbufs != nr_pages + 1) { + pr_info("Ring buffer boot meta [%d] mismatch of subbuf_size/nr_pages\n", cpu); + return false; + } + + buffers_start = meta->first_buffer; + buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs); + + /* Is the head and commit buffers within the range of buffers? */ + if (meta->head_buffer < buffers_start || + meta->head_buffer >= buffers_end) { + pr_info("Ring buffer boot meta [%d] head buffer out of range\n", cpu); + return false; + } + + if (meta->commit_buffer < buffers_start || + meta->commit_buffer >= buffers_end) { + pr_info("Ring buffer boot meta [%d] commit buffer out of range\n", cpu); + return false; + } + + subbuf = rb_subbufs_from_meta(meta); + + /* Is the meta buffers and the subbufs themselves have correct data? 
*/ + for (i = 0; i < meta->nr_subbufs; i++) { + if (meta->buffers[i] < 0 || + meta->buffers[i] >= meta->nr_subbufs) { + pr_info("Ring buffer boot meta [%d] array out of range\n", cpu); + return false; + } + + if ((unsigned)local_read(&subbuf->commit) > subbuf_size) { + pr_info("Ring buffer boot meta [%d] buffer invalid commit\n", cpu); + return false; + } + + subbuf = (void *)subbuf + subbuf_size; + } + + pr_info("Ring buffer meta is from previous boot!\n"); + return true; +} + static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) { struct ring_buffer_meta *meta; + unsigned long delta; void *subbuf; int cpu; int i; for (cpu = 0; cpu < nr_cpu_ids; cpu++) { + void *next_meta; + meta = rb_range_meta(buffer, nr_pages, cpu); + if (rb_meta_valid(meta, cpu, buffer, nr_pages)) { + /* Make the mappings match the current address */ + subbuf = rb_subbufs_from_meta(meta); + delta = (unsigned long)subbuf - meta->first_buffer; + meta->first_buffer += delta; + meta->head_buffer += delta; + meta->commit_buffer += delta; + continue; + } + + if (cpu < nr_cpu_ids - 1) + next_meta = rb_range_meta(buffer, nr_pages, cpu + 1); + else + next_meta = (void *)buffer->range_addr_end; + + memset(meta, 0, next_meta - (void *)meta); + meta->nr_subbufs = nr_pages + 1; meta->subbuf_size = PAGE_SIZE; subbuf = rb_subbufs_from_meta(meta); + meta->first_buffer = (unsigned long)subbuf; + /* * The buffers[] array holds the order of the sub-buffers * that are after the meta data. The sub-buffers may @@ -1724,10 +1807,26 @@ int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, in return 0; } +/* Map the buffer_pages to the previous head and commit pages */ +static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *bpage) +{ + struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + + if (meta->head_buffer == (unsigned long)bpage->page) + cpu_buffer->head_page = bpage; + + if (meta->commit_buffer == (unsigned long)bpage->page) { + cpu_buffer->commit_page = bpage; + cpu_buffer->tail_page = bpage; + } +} + static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, long nr_pages, struct list_head *pages) { struct trace_buffer *buffer = cpu_buffer->buffer; + struct ring_buffer_meta *meta = NULL; struct buffer_page *bpage, *tmp; bool user_thread = current->mm != NULL; gfp_t mflags; @@ -1762,6 +1861,10 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, */ if (user_thread) set_current_oom_origin(); + + if (buffer->range_addr_start) + meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu); + for (i = 0; i < nr_pages; i++) { struct page *page; @@ -1778,11 +1881,14 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, */ list_add_tail(&bpage->list, pages); - if (buffer->range_addr_start) { + if (meta) { /* A range was given. 
Use that for the buffer page */ bpage->page = rb_range_buffer(cpu_buffer, i + 1); if (!bpage->page) goto free_pages; + /* If this is valid from a previous boot */ + if (meta->head_buffer) + rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; bpage->id = i + 1; } else { @@ -1844,6 +1950,7 @@ static struct ring_buffer_per_cpu * rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_meta *meta; struct buffer_page *bpage; struct page *page; int ret; @@ -1884,6 +1991,8 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) bpage->page = rb_range_buffer(cpu_buffer, 0); if (!bpage->page) goto fail_free_reader; + if (cpu_buffer->ring_meta->head_buffer) + rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; } else { page = alloc_pages_node(cpu_to_node(cpu), @@ -1902,14 +2011,32 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) if (ret < 0) goto fail_free_reader; - cpu_buffer->head_page - = list_entry(cpu_buffer->pages, struct buffer_page, list); - cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; + /* If the boot meta was valid then this has already been updated */ + meta = cpu_buffer->ring_meta; + if (!meta || !meta->head_buffer || + !cpu_buffer->head_page || !cpu_buffer->commit_page || !cpu_buffer->tail_page) { + if (meta && meta->head_buffer && + (cpu_buffer->head_page || cpu_buffer->commit_page || cpu_buffer->tail_page)) { + pr_warn("Ring buffer meta buffers not all mapped\n"); + if (!cpu_buffer->head_page) + pr_warn(" Missing head_page\n"); + if (!cpu_buffer->commit_page) + pr_warn(" Missing commit_page\n"); + if (!cpu_buffer->tail_page) + pr_warn(" Missing tail_page\n"); + } - rb_head_page_activate(cpu_buffer); - if (cpu_buffer->ring_meta) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; - meta->commit_buffer = meta->head_buffer; + cpu_buffer->head_page + = list_entry(cpu_buffer->pages, struct buffer_page, list); + cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; + + rb_head_page_activate(cpu_buffer); + + if (cpu_buffer->ring_meta) + meta->commit_buffer = meta->head_buffer; + } else { + /* The valid meta buffer still needs to activate the head page */ + rb_head_page_activate(cpu_buffer); } return cpu_buffer; From 5f3b6e839f3ceb8d6ef02231ba9b5aca71b8bf55 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 12 Jun 2024 19:19:41 -0400 Subject: [PATCH 07/28] ring-buffer: Validate boot range memory events Make sure all the events in each of the sub-buffers that were mapped in a memory region are valid. This moves the code that walks the buffers for time-stamp validation out of the CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS ifdef block and is used to validate the content. Only the ring buffer event meta data and time stamps are checked and not the data load. This also has a second purpose. The buffer_page structure that points to the data sub-buffers has accounting that keeps track of the number of events that are on the sub-buffer. This updates that counter as well. That counter is used in reading the buffer and knowing if the ring buffer is empty or not. 
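As a rough standalone illustration of that walk (using simplified stand-in structures, not the kernel's ring_buffer_event layout), the validation amounts to stepping through a sub-buffer event by event, keeping a running timestamp and event count, and rejecting the buffer if a record overruns the commit index or carries an unknown type:

/*
 * Illustrative only: a simplified event record and a walk that mirrors
 * the shape of the validation described above.  The real kernel event
 * encoding (type_len, 27-bit deltas, absolute time stamps) is richer.
 */
#include <stdint.h>
#include <stdio.h>

enum { EV_DATA, EV_TIME_EXTEND, EV_PADDING };

struct sample_event {
	uint8_t		type;	/* assumed simplified type field */
	uint32_t	len;	/* total length of this record in bytes */
	uint64_t	delta;	/* time delta from the previous event */
};

/* Return the number of data events, or -1 if the buffer looks corrupted. */
static int validate_subbuf(const struct sample_event *sub, size_t commit,
			   uint64_t *last_ts)
{
	size_t off = 0;
	int events = 0;

	while (off < commit) {
		const struct sample_event *e =
			(const void *)((const char *)sub + off);

		if (e->len == 0 || off + e->len > commit)
			return -1;	/* record runs past the commit index */

		switch (e->type) {
		case EV_DATA:
			events++;
			/* fall through: data also advances the timestamp */
		case EV_TIME_EXTEND:
			*last_ts += e->delta;
			break;
		case EV_PADDING:
			break;
		default:
			return -1;	/* unknown record type */
		}
		off += e->len;
	}
	return events;
}

int main(void)
{
	struct sample_event buf[3] = {
		{ .type = EV_DATA,        .len = sizeof(buf[0]), .delta = 100 },
		{ .type = EV_TIME_EXTEND, .len = sizeof(buf[0]), .delta = 5000 },
		{ .type = EV_DATA,        .len = sizeof(buf[0]), .delta = 7 },
	};
	uint64_t ts = 0;
	int events = validate_subbuf(buf, sizeof(buf), &ts);

	printf("events=%d last_ts=%llu\n", events, (unsigned long long)ts);
	return events < 0;
}

A buffer that fails this kind of check is treated as invalid and its meta data is reset, exactly as described above for the real validation.
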
Link: https://lkml.kernel.org/r/20240612232026.172503570@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Joel Fernandes Cc: Daniel Bristot de Oliveira Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vineeth Pillai Cc: Youssef Esmat Cc: Beau Belgrave Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Petkov Cc: "Paul E. McKenney" Cc: David Howells Cc: Mike Rapoport Cc: Dave Hansen Cc: Tony Luck Cc: Guenter Roeck Cc: Ross Zwisler Cc: Kees Cook Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 190 +++++++++++++++++++++++++++++-------- 1 file changed, 152 insertions(+), 38 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 588bc057bad78..804dfbdeef84f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1675,10 +1675,152 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, subbuf = (void *)subbuf + subbuf_size; } - pr_info("Ring buffer meta is from previous boot!\n"); return true; } +static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf); + +static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu, + unsigned long long *timestamp, u64 *delta_ptr) +{ + struct ring_buffer_event *event; + u64 ts, delta; + int events = 0; + int e; + + *delta_ptr = 0; + *timestamp = 0; + + ts = dpage->time_stamp; + + for (e = 0; e < tail; e += rb_event_length(event)) { + + event = (struct ring_buffer_event *)(dpage->data + e); + + switch (event->type_len) { + + case RINGBUF_TYPE_TIME_EXTEND: + delta = rb_event_time_stamp(event); + ts += delta; + break; + + case RINGBUF_TYPE_TIME_STAMP: + delta = rb_event_time_stamp(event); + delta = rb_fix_abs_ts(delta, ts); + if (delta < ts) { + *delta_ptr = delta; + *timestamp = ts; + return -1; + } + ts = delta; + break; + + case RINGBUF_TYPE_PADDING: + if (event->time_delta == 1) + break; + fallthrough; + case RINGBUF_TYPE_DATA: + events++; + ts += event->time_delta; + break; + + default: + return -1; + } + } + *timestamp = ts; + return events; +} + +static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu) +{ + unsigned long long ts; + u64 delta; + int tail; + + tail = local_read(&dpage->commit); + return rb_read_data_buffer(dpage, tail, cpu, &ts, &delta); +} + +/* If the meta data has been validated, now validate the events */ +static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct buffer_page *head_page; + unsigned long entry_bytes = 0; + unsigned long entries = 0; + int ret; + int i; + + if (!meta || !meta->head_buffer) + return; + + /* Do the reader page first */ + ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu); + if (ret < 0) { + pr_info("Ring buffer reader page is invalid\n"); + goto invalid; + } + entries += ret; + entry_bytes += local_read(&cpu_buffer->reader_page->page->commit); + local_set(&cpu_buffer->reader_page->entries, ret); + + head_page = cpu_buffer->head_page; + + /* If both the head and commit are on the reader_page then we are done. 
*/ + if (head_page == cpu_buffer->reader_page && + head_page == cpu_buffer->commit_page) + goto done; + + /* Iterate until finding the commit page */ + for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) { + + /* Reader page has already been done */ + if (head_page == cpu_buffer->reader_page) + continue; + + ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu); + if (ret < 0) { + pr_info("Ring buffer meta [%d] invalid buffer page\n", + cpu_buffer->cpu); + goto invalid; + } + entries += ret; + entry_bytes += local_read(&head_page->page->commit); + local_set(&cpu_buffer->head_page->entries, ret); + + if (head_page == cpu_buffer->commit_page) + break; + } + + if (head_page != cpu_buffer->commit_page) { + pr_info("Ring buffer meta [%d] commit page not found\n", + cpu_buffer->cpu); + goto invalid; + } + done: + local_set(&cpu_buffer->entries, entries); + local_set(&cpu_buffer->entries_bytes, entry_bytes); + + pr_info("Ring buffer meta [%d] is from previous boot!\n", cpu_buffer->cpu); + return; + + invalid: + /* The content of the buffers are invalid, reset the meta data */ + meta->head_buffer = 0; + meta->commit_buffer = 0; + + /* Reset the reader page */ + local_set(&cpu_buffer->reader_page->entries, 0); + local_set(&cpu_buffer->reader_page->page->commit, 0); + + /* Reset all the subbuffers */ + for (i = 0; i < meta->nr_subbufs - 1; i++, rb_inc_page(&head_page)) { + local_set(&head_page->entries, 0); + local_set(&head_page->page->commit, 0); + } +} + static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) { struct ring_buffer_meta *meta; @@ -1757,8 +1899,6 @@ static void *rbm_next(struct seq_file *m, void *v, loff_t *pos) return rbm_start(m, pos); } -static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf); - static int rbm_show(struct seq_file *m, void *v) { struct ring_buffer_per_cpu *cpu_buffer = m->private; @@ -2011,6 +2151,8 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) if (ret < 0) goto fail_free_reader; + rb_meta_validate_events(cpu_buffer); + /* If the boot meta was valid then this has already been updated */ meta = cpu_buffer->ring_meta; if (!meta || !meta->head_buffer || @@ -3955,11 +4097,10 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info, unsigned long tail) { - struct ring_buffer_event *event; struct buffer_data_page *bpage; u64 ts, delta; bool full = false; - int e; + int ret; bpage = info->tail_page->page; @@ -3985,39 +4126,12 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, if (atomic_inc_return(this_cpu_ptr(&checking)) != 1) goto out; - ts = bpage->time_stamp; - - for (e = 0; e < tail; e += rb_event_length(event)) { - - event = (struct ring_buffer_event *)(bpage->data + e); - - switch (event->type_len) { - - case RINGBUF_TYPE_TIME_EXTEND: - delta = rb_event_time_stamp(event); - ts += delta; - break; - - case RINGBUF_TYPE_TIME_STAMP: - delta = rb_event_time_stamp(event); - delta = rb_fix_abs_ts(delta, ts); - if (delta < ts) { - buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n", - cpu_buffer->cpu, ts, delta); - } - ts = delta; - break; - - case RINGBUF_TYPE_PADDING: - if (event->time_delta == 1) - break; - fallthrough; - case RINGBUF_TYPE_DATA: - ts += event->time_delta; - break; - - default: - RB_WARN_ON(cpu_buffer, 1); + ret = rb_read_data_buffer(bpage, tail, cpu_buffer->cpu, &ts, &delta); + if (ret < 0) { + if (delta < ts) { + buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT 
BACKWARDS: last ts: %lld absolute ts: %lld\n", + cpu_buffer->cpu, ts, delta); + goto out; } } if ((full && ts > info->ts) || From e645535a954ad5eb53ed2c5752c534a3371547c3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 12 Jun 2024 19:19:42 -0400 Subject: [PATCH 08/28] tracing: Add option to use memmapped memory for trace boot instance Add an option to the trace_instance kernel command line parameter that allows it to use the reserved memory from memmap boot parameter. memmap=12M$0x284500000 trace_instance=boot_mapped@0x284500000:12M The above will reserves 12 megs at the physical address 0x284500000. The second parameter will create a "boot_mapped" instance and use the memory reserved as the memory for the ring buffer. That will create an instance called "boot_mapped": /sys/kernel/tracing/instances/boot_mapped Note, because the ring buffer is using a defined memory ranged, it will act just like a memory mapped ring buffer. It will not have a snapshot buffer, as it can't swap out the buffer. The snapshot files as well as any tracers that uses a snapshot will not be present in the boot_mapped instance. Link: https://lkml.kernel.org/r/20240612232026.329660169@goodmis.org Cc: linux-mm@kvack.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Joel Fernandes Cc: Daniel Bristot de Oliveira Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vineeth Pillai Cc: Youssef Esmat Cc: Beau Belgrave Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Petkov Cc: "Paul E. McKenney" Cc: David Howells Cc: Mike Rapoport Cc: Dave Hansen Cc: Tony Luck Cc: Guenter Roeck Cc: Ross Zwisler Cc: Kees Cook Signed-off-by: Steven Rostedt (Google) --- .../admin-guide/kernel-parameters.txt | 9 +++ kernel/trace/trace.c | 75 +++++++++++++++++-- 2 files changed, 78 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b600df82669db..ff26b6094e793 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6754,6 +6754,15 @@ the same thing would happen if it was left off). The irq_handler_entry event, and all events under the "initcall" system. + If memory has been reserved (see memmap for x86), the instance + can use that memory: + + memmap=12M$0x284500000 trace_instance=boot_map@0x284500000:12M + + The above will create a "boot_map" instance that uses the physical + memory at 0x284500000 that is 12Megs. The per CPU buffers of that + instance will be split up accordingly. + trace_options=[option-list] [FTRACE] Enable or disable tracer options at boot. 
The option-list is a comma delimited list of options diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 622fe670949d7..dfde26aa32118 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9504,6 +9504,31 @@ static int instance_mkdir(const char *name) return ret; } +static u64 map_pages(u64 start, u64 size) +{ + struct page **pages; + phys_addr_t page_start; + unsigned int page_count; + unsigned int i; + void *vaddr; + + page_count = DIV_ROUND_UP(size, PAGE_SIZE); + + page_start = start; + pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return 0; + + for (i = 0; i < page_count; i++) { + phys_addr_t addr = page_start + i * PAGE_SIZE; + pages[i] = pfn_to_page(addr >> PAGE_SHIFT); + } + vaddr = vmap(pages, page_count, VM_MAP, PAGE_KERNEL); + kfree(pages); + + return (u64)(unsigned long)vaddr; +} + /** * trace_array_get_by_name - Create/Lookup a trace array, given its name. * @name: The name of the trace array to be looked up/created. @@ -10350,6 +10375,7 @@ __init static void enable_instances(void) { struct trace_array *tr; char *curr_str; + char *name; char *str; char *tok; @@ -10358,19 +10384,56 @@ __init static void enable_instances(void) str = boot_instance_info; while ((curr_str = strsep(&str, "\t"))) { + unsigned long start = 0; + unsigned long size = 0; + unsigned long addr = 0; tok = strsep(&curr_str, ","); + name = strsep(&tok, "@"); + if (tok) { + start = memparse(tok, &tok); + if (!start) { + pr_warn("Tracing: Invalid boot instance address for %s\n", + name); + continue; + } + } - if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE)) - do_allocate_snapshot(tok); + if (start) { + if (*tok != ':') { + pr_warn("Tracing: No size specified for instance %s\n", name); + continue; + } + tok++; + size = memparse(tok, &tok); + if (!size) { + pr_warn("Tracing: Invalid boot instance size for %s\n", + name); + continue; + } + addr = map_pages(start, size); + if (addr) { + pr_info("Tracing: mapped boot instance %s at physical memory 0x%lx of size 0x%lx\n", + name, start, size); + } else { + pr_warn("Tracing: Failed to map boot instance %s\n", name); + continue; + } + } else { + /* Only non mapped buffers have snapshot buffers */ + if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE)) + do_allocate_snapshot(name); + } - tr = trace_array_get_by_name(tok, NULL); + tr = trace_array_create_systems(name, NULL, addr, size); if (!tr) { - pr_warn("Failed to create instance buffer %s\n", curr_str); + pr_warn("Tracing: Failed to create instance buffer %s\n", curr_str); continue; } - /* Allow user space to delete it */ - trace_array_put(tr); + + /* Only allow non mapped buffers to be deleted */ + if (!start) + trace_array_put(tr); while ((tok = strsep(&curr_str, ","))) { early_enable_events(tr, tok, true); From 8f3e6659656e63bd710fcab4f0cdfb8608bc1b96 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 12 Jun 2024 19:19:43 -0400 Subject: [PATCH 09/28] ring-buffer: Save text and data locations in mapped meta data When a ring buffer is mapped to a specific address, save the address of a text function and some data. This will be used to determine the delta between the last boot and the current boot for pointers to functions as well as to data. 
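Roughly, the idea is the following (the symbol names here are purely
illustrative, not the identifiers used by this patch):

	/* at buffer creation, stamp the meta data with this boot's layout */
	meta->text_addr = (unsigned long)some_text_function;
	meta->data_addr = (unsigned long)some_data_object;

	/* on a later boot, taken against the same symbols */
	text_delta = (unsigned long)some_text_function - meta->text_addr;
	data_delta = (unsigned long)some_data_object - meta->data_addr;

	/* pointers recorded by the previous boot can then be adjusted */
	current_ip = saved_ip + text_delta;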
Link: https://lkml.kernel.org/r/20240612232026.496176678@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Joel Fernandes Cc: Daniel Bristot de Oliveira Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vineeth Pillai Cc: Youssef Esmat Cc: Beau Belgrave Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Petkov Cc: "Paul E. McKenney" Cc: David Howells Cc: Mike Rapoport Cc: Dave Hansen Cc: Tony Luck Cc: Guenter Roeck Cc: Ross Zwisler Cc: Kees Cook Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 804dfbdeef84f..8c287430411d9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -45,6 +45,8 @@ static void update_pages_handler(struct work_struct *work); struct ring_buffer_meta { + unsigned long text_addr; + unsigned long data_addr; unsigned long first_buffer; unsigned long head_buffer; unsigned long commit_buffer; @@ -542,6 +544,9 @@ struct trace_buffer { unsigned long range_addr_start; unsigned long range_addr_end; + long last_text_delta; + long last_data_delta; + unsigned int subbuf_size; unsigned int subbuf_order; unsigned int max_data_size; @@ -1821,10 +1826,15 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) } } +/* Used to calculate data delta */ +static char rb_data_ptr[] = ""; + static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) { struct ring_buffer_meta *meta; unsigned long delta; + unsigned long this_text = (unsigned long)rb_range_meta_init; + unsigned long this_data = (unsigned long)rb_data_ptr; void *subbuf; int cpu; int i; @@ -1841,6 +1851,10 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) meta->first_buffer += delta; meta->head_buffer += delta; meta->commit_buffer += delta; + buffer->last_text_delta = this_text - meta->text_addr; + buffer->last_data_delta = this_data - meta->data_addr; + meta->text_addr = this_text; + meta->data_addr = this_data; continue; } @@ -1857,6 +1871,8 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) subbuf = rb_subbufs_from_meta(meta); meta->first_buffer = (unsigned long)subbuf; + meta->text_addr = this_text; + meta->data_addr = this_data; /* * The buffers[] array holds the order of the sub-buffers From 7a1d1e4b9639ff08b2f42605c2950ae1ba4a43bf Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 12 Jun 2024 19:19:44 -0400 Subject: [PATCH 10/28] tracing/ring-buffer: Add last_boot_info file to boot instance If an instance is mapped to memory on boot up, create a new file called "last_boot_info" that will hold information that can be used to properly parse the raw data in the ring buffer. It will export the delta of the addresses for text and data from what it was from the last boot. It does not expose actually addresses (unless you knew what the actual address was from the last boot). The output will look like: # cat last_boot_info text delta: -268435456 data delta: -268435456 The text and data are kept separate in case they are ever made different. 
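A post-processing tool that reads the raw buffer of the previous boot
would apply these values before resolving any saved addresses, roughly
(illustrative only, not part of this patch):

	resolved_ip  = raw_ip + text_delta;	/* then resolve via kallsyms */
	resolved_str = raw_str + data_delta;	/* for pointers into kernel data */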
Link: https://lkml.kernel.org/r/20240612232026.658680738@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Joel Fernandes Cc: Daniel Bristot de Oliveira Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vineeth Pillai Cc: Youssef Esmat Cc: Beau Belgrave Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Petkov Cc: "Paul E. McKenney" Cc: David Howells Cc: Mike Rapoport Cc: Dave Hansen Cc: Tony Luck Cc: Guenter Roeck Cc: Ross Zwisler Cc: Kees Cook Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 3 +++ kernel/trace/ring_buffer.c | 23 ++++++++++++++++++ kernel/trace/trace.c | 47 ++++++++++++++++++++++++++++++++++++- kernel/trace/trace.h | 2 ++ 4 files changed, 74 insertions(+), 1 deletion(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index a50b0223b1d32..55de3798a9b98 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -94,6 +94,9 @@ struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flag unsigned long range_size, struct lock_class_key *key); +bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text, + long *data); + /* * Because the ring buffer is generic, if other users of the ring buffer get * traced by ftrace, it can produce lockdep warnings. We need to keep each diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8c287430411d9..f3d772461a60f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2396,6 +2396,29 @@ struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flag return alloc_buffer(size, flags, order, start, start + range_size, key); } +/** + * ring_buffer_last_boot_delta - return the delta offset from last boot + * @buffer: The buffer to return the delta from + * @text: Return text delta + * @data: Return data delta + * + * Returns: The true if the delta is non zero + */ +bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text, + long *data) +{ + if (!buffer) + return false; + + if (!buffer->last_text_delta) + return false; + + *text = buffer->last_text_delta; + *data = buffer->last_data_delta; + + return true; +} + /** * ring_buffer_free - free a ring buffer. * @buffer: the buffer to free. 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dfde26aa32118..dc4eee33d920c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6041,6 +6041,18 @@ ssize_t tracing_resize_ring_buffer(struct trace_array *tr, return ret; } +static void update_last_data(struct trace_array *tr) +{ + if (!tr->text_delta && !tr->data_delta) + return; + + /* Clear old data */ + tracing_reset_online_cpus(&tr->array_buffer); + + /* Using current data now */ + tr->text_delta = 0; + tr->data_delta = 0; +} /** * tracing_update_buffers - used by tracing facility to expand ring buffers @@ -6058,6 +6070,9 @@ int tracing_update_buffers(struct trace_array *tr) int ret = 0; mutex_lock(&trace_types_lock); + + update_last_data(tr); + if (!tr->ring_buffer_expanded) ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); @@ -6113,6 +6128,8 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) mutex_lock(&trace_types_lock); + update_last_data(tr); + if (!tr->ring_buffer_expanded) { ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); @@ -6860,6 +6877,21 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } +static ssize_t +tracing_last_boot_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + struct seq_buf seq; + char buf[64]; + + seq_buf_init(&seq, buf, 64); + + seq_buf_printf(&seq, "text delta:\t%ld\n", tr->text_delta); + seq_buf_printf(&seq, "data delta:\t%ld\n", tr->data_delta); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, seq_buf_used(&seq)); +} + static int tracing_buffer_meta_open(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; @@ -7499,6 +7531,13 @@ static const struct file_operations trace_time_stamp_mode_fops = { .release = tracing_single_release_tr, }; +static const struct file_operations last_boot_fops = { + .open = tracing_open_generic_tr, + .read = tracing_last_boot_read, + .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, +}; + #ifdef CONFIG_TRACER_SNAPSHOT static const struct file_operations snapshot_fops = { .open = tracing_snapshot_open, @@ -9242,6 +9281,9 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0, tr->range_addr_start, tr->range_addr_size); + + ring_buffer_last_boot_delta(buf->buffer, + &tr->text_delta, &tr->data_delta); /* * This is basically the same as a mapped buffer, * with the same restrictions. 
@@ -9751,7 +9793,10 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) MEM_FAIL(1, "Could not allocate function filter files"); #ifdef CONFIG_TRACER_SNAPSHOT - if (!tr->range_addr_start) { + if (tr->range_addr_start) { + trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer, + tr, &last_boot_fops); + } else { trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer, tr, &snapshot_fops); } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3e56d3b222126..3dc5c8f14ce94 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -347,6 +347,8 @@ struct trace_array { unsigned int mapped; unsigned long range_addr_start; unsigned long range_addr_size; + long text_delta; + long data_delta; struct trace_pid_list __rcu *filtered_pids; struct trace_pid_list __rcu *filtered_no_pids; From 07714b4bb3f9800261c8b4b2f47e9010ed60979d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 12 Jun 2024 19:19:45 -0400 Subject: [PATCH 11/28] tracing: Handle old buffer mappings for event strings and functions Use the saved text_delta and data_delta of a persistent memory mapped ring buffer that was saved from a previous boot, and use the delta in the trace event print output so that strings and functions show up normally. That is, for an event like trace_kmalloc() that prints the callsite via "%pS", if it used the address saved in the ring buffer it will not match the function that was saved in the previous boot if the kernel remaps itself between boots. For RCU events that point to saved static strings where only the address of the string is saved in the ring buffer, it too will be adjusted to point to where the string is on the current boot. Link: https://lkml.kernel.org/r/20240612232026.821020753@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Joel Fernandes Cc: Daniel Bristot de Oliveira Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vineeth Pillai Cc: Youssef Esmat Cc: Beau Belgrave Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Petkov Cc: "Paul E. 
McKenney" Cc: David Howells Cc: Mike Rapoport Cc: Dave Hansen Cc: Tony Luck Cc: Guenter Roeck Cc: Ross Zwisler Cc: Kees Cook Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dc4eee33d920c..71cca10581d6c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3671,8 +3671,11 @@ static void test_can_verify(void) void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, va_list ap) { + long text_delta = iter->tr->text_delta; + long data_delta = iter->tr->data_delta; const char *p = fmt; const char *str; + bool good; int i, j; if (WARN_ON_ONCE(!fmt)) @@ -3691,7 +3694,10 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, j = 0; - /* We only care about %s and variants */ + /* + * We only care about %s and variants + * as well as %p[sS] if delta is non-zero + */ for (i = 0; p[i]; i++) { if (i + 1 >= iter->fmt_size) { /* @@ -3720,6 +3726,11 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, } if (p[i+j] == 's') break; + + if (text_delta && p[i+1] == 'p' && + ((p[i+2] == 's' || p[i+2] == 'S'))) + break; + star = false; } j = 0; @@ -3733,6 +3744,24 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, iter->fmt[i] = '\0'; trace_seq_vprintf(&iter->seq, iter->fmt, ap); + /* Add delta to %pS pointers */ + if (p[i+1] == 'p') { + unsigned long addr; + char fmt[4]; + + fmt[0] = '%'; + fmt[1] = 'p'; + fmt[2] = p[i+2]; /* Either %ps or %pS */ + fmt[3] = '\0'; + + addr = va_arg(ap, unsigned long); + addr += text_delta; + trace_seq_printf(&iter->seq, fmt, (void *)addr); + + p += i + 3; + continue; + } + /* * If iter->seq is full, the above call no longer guarantees * that ap is in sync with fmt processing, and further calls @@ -3751,6 +3780,14 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, /* The ap now points to the string data of the %s */ str = va_arg(ap, const char *); + good = trace_safe_str(iter, str, star, len); + + /* Could be from the last boot */ + if (data_delta && !good) { + str += data_delta; + good = trace_safe_str(iter, str, star, len); + } + /* * If you hit this warning, it is likely that the * trace event in question used %s on a string that @@ -3760,8 +3797,7 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, * instead. See samples/trace_events/trace-events-sample.h * for reference. */ - if (WARN_ONCE(!trace_safe_str(iter, str, star, len), - "fmt: '%s' current_buffer: '%s'", + if (WARN_ONCE(!good, "fmt: '%s' current_buffer: '%s'", fmt, seq_buf_str(&iter->seq.seq))) { int ret; From 7cfeb9033dd1fbdaacd74b2e6613d7366f515e16 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 12 Jun 2024 19:19:46 -0400 Subject: [PATCH 12/28] tracing: Update function tracing output for previous boot buffer For a persistent ring buffer that is saved across boots, if function tracing was performed in the previous boot, it only saves the address of the functions and uses "%pS" to print their names. But the current boot, those functions may be in different locations. The persistent meta-data saves the text delta between the two boots and can be used to find the address of the saved function of where it is located in the current boot. 
Link: https://lkml.kernel.org/r/20240612232026.988226055@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Joel Fernandes Cc: Daniel Bristot de Oliveira Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vineeth Pillai Cc: Youssef Esmat Cc: Beau Belgrave Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Petkov Cc: "Paul E. McKenney" Cc: David Howells Cc: Mike Rapoport Cc: Dave Hansen Cc: Tony Luck Cc: Guenter Roeck Cc: Ross Zwisler Cc: Kees Cook Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_output.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index d8b302d010830..b9d2c64c06484 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -990,8 +990,11 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, } static void print_fn_trace(struct trace_seq *s, unsigned long ip, - unsigned long parent_ip, int flags) + unsigned long parent_ip, long delta, int flags) { + ip += delta; + parent_ip += delta; + seq_print_ip_sym(s, ip, flags); if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) { @@ -1009,7 +1012,7 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - print_fn_trace(s, field->ip, field->parent_ip, flags); + print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags); trace_seq_putc(s, '\n'); return trace_handle_return(s); @@ -1674,7 +1677,7 @@ trace_func_repeats_print(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - print_fn_trace(s, field->ip, field->parent_ip, flags); + print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags); trace_seq_printf(s, " (repeats: %u, last_ts:", field->count); trace_print_time(s, iter, iter->ts - FUNC_REPEATS_GET_DELTA_TS(field)); From a62b4f6fbdffa8e90959da485b68f844241d300f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 12 Jun 2024 19:19:47 -0400 Subject: [PATCH 13/28] tracing: Add last boot delta offset for stack traces The addresses of a stack trace event are relative to the kallsyms. As that can change between boots, when printing the stack trace from a buffer that was from the last boot, it needs all the addresses to be added to the "text_delta" that gives the delta between the addresses of the functions for the current boot compared to the address of the last boot. Then it can be passed to kallsyms to find the function name, otherwise it just shows a useless list of addresses. Link: https://lkml.kernel.org/r/20240612232027.145807384@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Joel Fernandes Cc: Daniel Bristot de Oliveira Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vineeth Pillai Cc: Youssef Esmat Cc: Beau Belgrave Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Petkov Cc: "Paul E. 
McKenney" Cc: David Howells Cc: Mike Rapoport Cc: Dave Hansen Cc: Tony Luck Cc: Guenter Roeck Cc: Ross Zwisler Cc: Kees Cook Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_output.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index b9d2c64c06484..48de935988972 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1233,6 +1233,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, struct trace_seq *s = &iter->seq; unsigned long *p; unsigned long *end; + long delta = iter->tr->text_delta; trace_assign_type(field, iter->ent); end = (unsigned long *)((long)iter->ent + iter->ent_size); @@ -1245,7 +1246,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, break; trace_seq_puts(s, " => "); - seq_print_ip_sym(s, *p, flags); + seq_print_ip_sym(s, (*p) + delta, flags); trace_seq_putc(s, '\n'); } From 94dfa500e7deddceff22768bb994f0fa67bd2fd0 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 20 Jun 2024 11:49:57 +0300 Subject: [PATCH 14/28] tracing: Fix NULL vs IS_ERR() check in enable_instances() The trace_array_create_systems() function returns error pointers, not NULL. Fix the check to match. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Fixes: e645535a954a ("tracing: Add option to use memmapped memory for trace boot instance") Link: https://lore.kernel.org/9b23ea03-d709-435f-a309-461c3d747457@moroto.mountain Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 71cca10581d6c..5462fb10ff64b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -10507,7 +10507,7 @@ __init static void enable_instances(void) } tr = trace_array_create_systems(name, NULL, addr, size); - if (!tr) { + if (IS_ERR(tr)) { pr_warn("Tracing: Failed to create instance buffer %s\n", curr_str); continue; } From b96c312551b241bc17226c5347c6d6b38a1efd3e Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 9 Jul 2024 23:56:58 +0200 Subject: [PATCH 15/28] ring-buffer: Use vma_pages() helper function Use the vma_pages() helper function and fix the following Coccinelle/coccicheck warning reported by vma_pages.cocci: WARNING: Consider using vma_pages helper on vma Rename the local variable vma_pages accordingly. 
Signed-off-by: Thorsten Blum Link: https://lore.kernel.org/20240709215657.322071-2-thorsten.blum@toblux.com Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f3d772461a60f..d59cf0023e387 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -6920,7 +6920,7 @@ static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer, static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, struct vm_area_struct *vma) { - unsigned long nr_subbufs, nr_pages, vma_pages, pgoff = vma->vm_pgoff; + unsigned long nr_subbufs, nr_pages, nr_vma_pages, pgoff = vma->vm_pgoff; unsigned int subbuf_pages, subbuf_order; struct page **pages; int p = 0, s = 0; @@ -6946,11 +6946,11 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */ nr_pages = ((nr_subbufs) << subbuf_order) - pgoff + 1; /* + meta-page */ - vma_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; - if (!vma_pages || vma_pages > nr_pages) + nr_vma_pages = vma_pages(vma); + if (!nr_vma_pages || nr_vma_pages > nr_pages) return -EINVAL; - nr_pages = vma_pages; + nr_pages = nr_vma_pages; pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); if (!pages) From 6d02eefecc5e3887eea08f3df5e0f2af6eb35447 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 19 Jul 2024 10:13:12 -0400 Subject: [PATCH 16/28] tracing: Fix ifdef of snapshots to not prevent last_boot_info file The mapping of the ring buffer to memory allocated at boot up will also expose a "last_boot_info" to help tooling to read the raw data from the last boot. As instances that have their ring buffer mapped to fixed memory cannot perform snapshots, they can either have the "snapshot" file or the "last_boot_info" file, but not both. The code that added the "last_boot_info" file failed to notice that the "snapshot" creation was inside a "#ifdef CONFIG_TRACER_SNAPSHOT" and incorrectly placed the creation of the "last_boot_info" file within the ifdef block. Not only does it cause a warning when CONFIG_TRACER_SNAPSHOT is not enabled, it also incorrectly prevents the file from appearing. 
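The intended structure guards only the "snapshot" branch, roughly (see
the diff below for the actual change):

	if (tr->range_addr_start) {
		trace_create_file("last_boot_info", ...);
	#ifdef CONFIG_TRACER_SNAPSHOT
	} else {
		trace_create_file("snapshot", ...);
	#endif
	}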
Link: https://lore.kernel.org/all/20240719102640.718554-1-arnd@kernel.org/ Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Reported-by: Arnd Bergmann Link: https://lore.kernel.org/20240719101312.3d4ac707@rorschach.local.home Fixes: 7a1d1e4b9639 ("tracing/ring-buffer: Add last_boot_info file to boot instance") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d0af984a5337d..8e5a4ca9fd70e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9828,15 +9828,15 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) if (ftrace_create_function_files(tr, d_tracer)) MEM_FAIL(1, "Could not allocate function filter files"); -#ifdef CONFIG_TRACER_SNAPSHOT if (tr->range_addr_start) { trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer, tr, &last_boot_fops); +#ifdef CONFIG_TRACER_SNAPSHOT } else { trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer, tr, &snapshot_fops); - } #endif + } trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer, tr, &tracing_err_log_fops); From 29a02ec66556ac942d263416c32b97f5b9206f69 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 15 Aug 2024 08:28:11 -0400 Subject: [PATCH 17/28] tracing: Allow boot instances to use reserve_mem boot memory Allow boot instances to use memory reserved by the reserve_mem boot option. reserve_mem=12M:4096:trace trace_instance=boot_mapped@trace The above will allocate 12 megs with 4096 alignment and label it "trace". The second parameter will create a "boot_mapped" instance and use the memory reserved and labeled as "trace" as the memory for the ring buffer. That will create an instance called "boot_mapped": /sys/kernel/tracing/instances/boot_mapped Note, because the ring buffer is using a defined memory ranged, it will act just like a memory mapped ring buffer. It will not have a snapshot buffer, as it can't swap out the buffer. The snapshot files as well as any tracers that uses a snapshot will not be present in the boot_mapped instance. Also note that reserve_mem is not reliable in acquiring the same physical memory at each soft reboot. It is possible that KALSR could map the kernel at the previous boot memory location forcing the reserve_mem to return a different memory location. In this case, the previous ring buffer will be lost. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Linus Torvalds Cc: Ross Zwisler Cc: Vincent Donnefort Link: https://lore.kernel.org/20240815082811.669f7d8c@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- .../admin-guide/kernel-parameters.txt | 13 +++++++++++ kernel/trace/trace.c | 23 ++++++++++++------- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 35b5928233387..388653448e728 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6752,6 +6752,19 @@ memory at 0x284500000 that is 12Megs. The per CPU buffers of that instance will be split up accordingly. + Alternatively, the memory can be reserved by the reserve_mem option: + + reserve_mem=12M:4096:trace trace_instance=boot_map@trace + + This will reserve 12 megabytes at boot up with a 4096 byte alignment + and place the ring buffer in this memory. 
Note that due to KASLR, the + memory may not be the same location each time, which will not preserve + the buffer content. + + Also note that the layout of the ring buffer data may change between + kernel versions where the validator will fail and reset the ring buffer + if the layout is not the same as the previous kernel. + trace_options=[option-list] [FTRACE] Enable or disable tracer options at boot. The option-list is a comma delimited list of options diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8e5a4ca9fd70e..9bcef199ae90a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -10465,22 +10465,20 @@ __init static void enable_instances(void) str = boot_instance_info; while ((curr_str = strsep(&str, "\t"))) { - unsigned long start = 0; - unsigned long size = 0; + phys_addr_t start = 0; + phys_addr_t size = 0; unsigned long addr = 0; tok = strsep(&curr_str, ","); name = strsep(&tok, "@"); - if (tok) { + + if (tok && isdigit(*tok)) { start = memparse(tok, &tok); if (!start) { pr_warn("Tracing: Invalid boot instance address for %s\n", name); continue; } - } - - if (start) { if (*tok != ':') { pr_warn("Tracing: No size specified for instance %s\n", name); continue; @@ -10492,10 +10490,19 @@ __init static void enable_instances(void) name); continue; } + } else if (tok) { + if (!reserve_mem_find_by_name(tok, &start, &size)) { + start = 0; + pr_warn("Failed to map boot instance %s to %s\n", name, tok); + continue; + } + } + + if (start) { addr = map_pages(start, size); if (addr) { - pr_info("Tracing: mapped boot instance %s at physical memory 0x%lx of size 0x%lx\n", - name, start, size); + pr_info("Tracing: mapped boot instance %s at physical memory %pa of size 0x%lx\n", + name, &start, (unsigned long)size); } else { pr_warn("Tracing: Failed to map boot instance %s\n", name); continue; From 4c57d0be528b1255abe61d8277f3213d4d22e85f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 13 Aug 2024 13:11:06 -0400 Subject: [PATCH 18/28] tracing/fgraph: Have fgraph handle previous boot function addresses Update the function graph code to modify the function addresses for a previous boot buffer so that it matches the current kallsyms (note this does not handle module addresses, yet). After a reboot, instead of seeing: # trace-cmd show -B boot_mapped | tail -n30 swapper/0-1 [000] d..2. 56.286470: 0) 0.481 us | 0xffffffff925da5c4(); swapper/0-1 [000] d.... 56.286471: 0) 4.065 us | } swapper/0-1 [000] d.... 56.286471: 0) 4.920 us | } swapper/0-1 [000] d..1. 56.286472: 0) | 0xffffffff92536254() { swapper/0-1 [000] d..1. 56.286472: 0) + 28.974 us | 0xffffffff92534e30(); swapper/0-1 [000] d.... 56.286516: 0) + 43.881 us | } swapper/0-1 [000] d..1. 56.286517: 0) | 0xffffffff925136c4() { swapper/0-1 [000] d..1. 56.286518: 0) | 0xffffffff92514a14() { swapper/0-1 [000] d..1. 56.286518: 0) 6.003 us | 0xffffffff92514200(); swapper/0-1 [000] d.... 56.286529: 0) + 11.510 us | } swapper/0-1 [000] d.... 56.286529: 0) + 12.895 us | } swapper/0-1 [000] d.... 56.286530: 0) ! 382.884 us | } swapper/0-1 [000] d..1. 56.286530: 0) | 0xffffffff92536444() { swapper/0-1 [000] d..1. 56.286531: 0) | 0xffffffff92536254() { swapper/0-1 [000] d..1. 56.286531: 0) + 26.335 us | 0xffffffff92534e30(); swapper/0-1 [000] d.... 56.286560: 0) + 29.511 us | } swapper/0-1 [000] d.... 56.286561: 0) + 30.452 us | } swapper/0-1 [000] d..1. 56.286562: 0) | 0xffffffff9253c014() { swapper/0-1 [000] d..1. 56.286562: 0) | 0xffffffff9253bed4() { swapper/0-1 [000] d..1. 
56.286563: 0) + 13.465 us | 0xffffffff92536684(); swapper/0-1 [000] d.... 56.286577: 0) + 14.651 us | } swapper/0-1 [000] d.... 56.286577: 0) + 15.821 us | } swapper/0-1 [000] d..1. 56.286578: 0) 0.667 us | 0xffffffff92547074(); swapper/0-1 [000] d..1. 56.286579: 0) 0.453 us | 0xffffffff924f35c4(); swapper/0-1 [000] d.... 56.286580: 0) # 3906.348 us | } swapper/0-1 [000] d..1. 56.286581: 0) | 0xffffffff92531a14() { swapper/0-1 [000] d..1. 56.286581: 0) 0.518 us | 0xffffffff92505cb4(); swapper/0-1 [000] d..1. 56.286595: 0) | 0xffffffff92db83c4() { swapper/0-1 [000] d..1. 56.286596: 0) | 0xffffffff92dec2e4() { swapper/0-1 [000] d..1. 56.286597: 0) | 0xffffffff92db5304() { It now shows: # trace-cmd show -B boot_mapped | tail -n30 swapper/0-1 [000] d..2. 363.079099: 0) 0.483 us | preempt_count_sub(); swapper/0-1 [000] d.... 363.079100: 0) 4.112 us | } swapper/0-1 [000] d.... 363.079101: 0) 4.979 us | } swapper/0-1 [000] d..1. 363.079101: 0) | disable_local_APIC() { swapper/0-1 [000] d..1. 363.079102: 0) + 29.153 us | clear_local_APIC.part.0(); swapper/0-1 [000] d.... 363.079148: 0) + 46.517 us | } swapper/0-1 [000] d..1. 363.079149: 0) | mcheck_cpu_clear() { swapper/0-1 [000] d..1. 363.079149: 0) | mce_intel_feature_clear() { swapper/0-1 [000] d..1. 363.079150: 0) 5.871 us | lmce_supported(); swapper/0-1 [000] d.... 363.079161: 0) + 11.340 us | } swapper/0-1 [000] d.... 363.079161: 0) + 12.638 us | } swapper/0-1 [000] d.... 363.079162: 0) ! 383.518 us | } swapper/0-1 [000] d..1. 363.079162: 0) | lapic_shutdown() { swapper/0-1 [000] d..1. 363.079163: 0) | disable_local_APIC() { swapper/0-1 [000] d..1. 363.079163: 0) + 26.144 us | clear_local_APIC.part.0(); swapper/0-1 [000] d.... 363.079192: 0) + 29.424 us | } swapper/0-1 [000] d.... 363.079192: 0) + 30.376 us | } swapper/0-1 [000] d..1. 363.079193: 0) | restore_boot_irq_mode() { swapper/0-1 [000] d..1. 363.079194: 0) | native_restore_boot_irq_mode() { swapper/0-1 [000] d..1. 363.079194: 0) + 13.863 us | disconnect_bsp_APIC(); swapper/0-1 [000] d.... 363.079209: 0) + 14.933 us | } swapper/0-1 [000] d.... 363.079209: 0) + 16.009 us | } swapper/0-1 [000] d..1. 363.079210: 0) 0.694 us | hpet_disable(); swapper/0-1 [000] d..1. 363.079211: 0) 0.511 us | iommu_shutdown_noop(); swapper/0-1 [000] d.... 363.079212: 0) # 3980.260 us | } swapper/0-1 [000] d..1. 363.079212: 0) | native_machine_emergency_restart() { swapper/0-1 [000] d..1. 363.079213: 0) 0.495 us | tboot_shutdown(); swapper/0-1 [000] d..1. 363.079230: 0) | acpi_reboot() { swapper/0-1 [000] d..1. 363.079231: 0) | acpi_reset() { swapper/0-1 [000] d..1. 
363.079232: 0) | acpi_os_write_port() { Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Linus Torvalds Cc: Ross Zwisler Cc: Vincent Donnefort Link: https://lore.kernel.org/20240813171257.478901820@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions_graph.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 13d0387ac6a66..a569daaac4c4f 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -544,6 +544,8 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, struct trace_seq *s = &iter->seq; struct trace_entry *ent = iter->ent; + addr += iter->tr->text_delta; + if (addr < (unsigned long)__irqentry_text_start || addr >= (unsigned long)__irqentry_text_end) return; @@ -710,6 +712,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, struct ftrace_graph_ret *graph_ret; struct ftrace_graph_ent *call; unsigned long long duration; + unsigned long func; int cpu = iter->cpu; int i; @@ -717,6 +720,8 @@ print_graph_entry_leaf(struct trace_iterator *iter, call = &entry->graph_ent; duration = graph_ret->rettime - graph_ret->calltime; + func = call->func + iter->tr->text_delta; + if (data) { struct fgraph_cpu_data *cpu_data; @@ -747,10 +752,10 @@ print_graph_entry_leaf(struct trace_iterator *iter, * enabled. */ if (flags & __TRACE_GRAPH_PRINT_RETVAL) - print_graph_retval(s, graph_ret->retval, true, (void *)call->func, + print_graph_retval(s, graph_ret->retval, true, (void *)func, !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX)); else - trace_seq_printf(s, "%ps();\n", (void *)call->func); + trace_seq_printf(s, "%ps();\n", (void *)func); print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET, cpu, iter->ent->pid, flags); @@ -766,6 +771,7 @@ print_graph_entry_nested(struct trace_iterator *iter, struct ftrace_graph_ent *call = &entry->graph_ent; struct fgraph_data *data = iter->private; struct trace_array *tr = iter->tr; + unsigned long func; int i; if (data) { @@ -788,7 +794,9 @@ print_graph_entry_nested(struct trace_iterator *iter, for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) trace_seq_putc(s, ' '); - trace_seq_printf(s, "%ps() {\n", (void *)call->func); + func = call->func + iter->tr->text_delta; + + trace_seq_printf(s, "%ps() {\n", (void *)func); if (trace_seq_has_overflowed(s)) return TRACE_TYPE_PARTIAL_LINE; @@ -863,6 +871,8 @@ check_irq_entry(struct trace_iterator *iter, u32 flags, int *depth_irq; struct fgraph_data *data = iter->private; + addr += iter->tr->text_delta; + /* * If we are either displaying irqs, or we got called as * a graph event and private data does not exist, @@ -990,11 +1000,14 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, unsigned long long duration = trace->rettime - trace->calltime; struct fgraph_data *data = iter->private; struct trace_array *tr = iter->tr; + unsigned long func; pid_t pid = ent->pid; int cpu = iter->cpu; int func_match = 1; int i; + func = trace->func + iter->tr->text_delta; + if (check_irq_return(iter, flags, trace->depth)) return TRACE_TYPE_HANDLED; @@ -1033,7 +1046,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, * function-retval option is enabled. 
*/ if (flags & __TRACE_GRAPH_PRINT_RETVAL) { - print_graph_retval(s, trace->retval, false, (void *)trace->func, + print_graph_retval(s, trace->retval, false, (void *)func, !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX)); } else { /* @@ -1046,7 +1059,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) trace_seq_puts(s, "}\n"); else - trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); + trace_seq_printf(s, "} /* %ps */\n", (void *)func); } /* Overrun */ From bca704f62db2e2bb644fa845e98b08cf1e22f814 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 15 Aug 2024 11:36:29 -0400 Subject: [PATCH 19/28] ring-buffer: Don't reset persistent ring-buffer meta saved addresses The text and data address is saved in the meta data so that it can be used to know the delta of the text and data addresses of the last boot compared to the text and data addresses of the current boot. The delta is used to convert function pointer entries in the ring buffer to something that can be used by kallsyms (note this only works for built-in functions). But the saved addresses get reset on boot up. If the buffer is not used and there's another reboot, then the saved text and data addresses will be of the last boot and not that of the boot that created the content in the ring buffer. To get an idea of the issue: # trace-cmd start -B boot_mapped -p function # reboot # trace-cmd show -B boot_mapped | tail <...>-1 [000] d..1. 461.983243: native_apic_msr_write <-native_kick_ap <...>-1 [000] d..1. 461.983244: __pfx_native_apic_msr_eoi <-native_kick_ap <...>-1 [000] d..1. 461.983244: reserve_irq_vector_locked <-native_kick_ap <...>-1 [000] d..1. 461.983262: branch_emulate_op <-native_kick_ap <...>-1 [000] d..1. 461.983262: __ia32_sys_ia32_pread64 <-native_kick_ap <...>-1 [000] d..1. 461.983263: native_kick_ap <-__smpboot_create_thread <...>-1 [000] d..1. 461.983263: store_cache_disable <-native_kick_ap <...>-1 [000] d..1. 461.983279: acpi_power_off_prepare <-native_kick_ap <...>-1 [000] d..1. 461.983280: __pfx_acpi_ns_delete_node <-acpi_suspend_enter <...>-1 [000] d..1. 461.983280: __pfx_acpi_os_release_lock <-acpi_suspend_enter # reboot # trace-cmd show -B boot_mapped |tail <...>-1 [000] d..1. 461.983243: 0xffffffffa9669220 <-0xffffffffa965f3db <...>-1 [000] d..1. 461.983244: 0xffffffffa96690f0 <-0xffffffffa965f3db <...>-1 [000] d..1. 461.983244: 0xffffffffa9663fa0 <-0xffffffffa965f3db <...>-1 [000] d..1. 461.983262: 0xffffffffa9672e80 <-0xffffffffa965f3e0 <...>-1 [000] d..1. 461.983262: 0xffffffffa962b940 <-0xffffffffa965f3ec <...>-1 [000] d..1. 461.983263: 0xffffffffa965f540 <-0xffffffffa96e1362 <...>-1 [000] d..1. 461.983263: 0xffffffffa963c940 <-0xffffffffa965f55b <...>-1 [000] d..1. 461.983279: 0xffffffffa9ee30c0 <-0xffffffffa965f59b <...>-1 [000] d..1. 461.983280: 0xffffffffa9f16c10 <-0xffffffffa9ee3157 <...>-1 [000] d..1. 461.983280: 0xffffffffa9ee02e0 <-0xffffffffa9ee3157 By not updating the saved text and data addresses in the meta data at every boot up and only updating them when the buffer is reset, it allows multiple boots to see the same data. 
Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Vincent Donnefort Link: https://lore.kernel.org/20240815113629.0dc90af8@rorschach.local.home Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8e3a7123937ac..b16f301b8a933 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1817,12 +1817,19 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) /* Used to calculate data delta */ static char rb_data_ptr[] = ""; +#define THIS_TEXT_PTR ((unsigned long)rb_meta_init_text_addr) +#define THIS_DATA_PTR ((unsigned long)rb_data_ptr) + +static void rb_meta_init_text_addr(struct ring_buffer_meta *meta) +{ + meta->text_addr = THIS_TEXT_PTR; + meta->data_addr = THIS_DATA_PTR; +} + static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) { struct ring_buffer_meta *meta; unsigned long delta; - unsigned long this_text = (unsigned long)rb_range_meta_init; - unsigned long this_data = (unsigned long)rb_data_ptr; void *subbuf; int cpu; int i; @@ -1839,10 +1846,8 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) meta->first_buffer += delta; meta->head_buffer += delta; meta->commit_buffer += delta; - buffer->last_text_delta = this_text - meta->text_addr; - buffer->last_data_delta = this_data - meta->data_addr; - meta->text_addr = this_text; - meta->data_addr = this_data; + buffer->last_text_delta = THIS_TEXT_PTR - meta->text_addr; + buffer->last_data_delta = THIS_DATA_PTR - meta->data_addr; continue; } @@ -1859,8 +1864,7 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) subbuf = rb_subbufs_from_meta(meta); meta->first_buffer = (unsigned long)subbuf; - meta->text_addr = this_text; - meta->data_addr = this_data; + rb_meta_init_text_addr(meta); /* * The buffers[] array holds the order of the sub-buffers @@ -5990,6 +5994,7 @@ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_meta *meta; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; @@ -6008,6 +6013,11 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) atomic_dec(&cpu_buffer->record_disabled); atomic_dec(&cpu_buffer->resize_disabled); + /* Make sure persistent meta now uses this buffer's addresses */ + meta = rb_range_meta(buffer, 0, cpu_buffer->cpu); + if (meta) + rb_meta_init_text_addr(meta); + mutex_unlock(&buffer->mutex); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -6022,6 +6032,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_meta *meta; int cpu; /* prevent another thread from changing buffer sizes */ @@ -6049,6 +6060,11 @@ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) reset_disabled_cpu_buffer(cpu_buffer); + /* Make sure persistent meta now uses this buffer's addresses */ + meta = rb_range_meta(buffer, 0, cpu_buffer->cpu); + if (meta) + rb_meta_init_text_addr(meta); + atomic_dec(&cpu_buffer->record_disabled); atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled); } From d0f2d6e9512ecf4306c4432761f04bd35cf9e3a6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 15 Aug 2024 11:50:32 -0400 Subject: [PATCH 20/28] ring-buffer: 
Add magic and struct size to boot up meta data Add a magic number as well as save the struct size of the ring_buffer_meta structure in the meta data to also use as validation. Updating the magic number could be used to force a invalidation between kernel versions, and saving the structure size is also a good method to make sure the content is what is expected. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Vincent Donnefort Link: https://lore.kernel.org/20240815115032.0c197b32@rorschach.local.home Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b16f301b8a933..c3a5e6cbb9403 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -44,7 +44,11 @@ static void update_pages_handler(struct work_struct *work); +#define RING_BUFFER_META_MAGIC 0xBADFEED + struct ring_buffer_meta { + int magic; + int struct_size; unsigned long text_addr; unsigned long data_addr; unsigned long first_buffer; @@ -1627,6 +1631,13 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, unsigned long buffers_end; int i; + /* Check the meta magic and meta struct size */ + if (meta->magic != RING_BUFFER_META_MAGIC || + meta->struct_size != sizeof(*meta)) { + pr_info("Ring buffer boot meta[%d] mismatch of magic or struct size\n", cpu); + return false; + } + /* The subbuffer's size and number of subbuffers must match */ if (meta->subbuf_size != subbuf_size || meta->nr_subbufs != nr_pages + 1) { @@ -1858,6 +1869,9 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) memset(meta, 0, next_meta - (void *)meta); + meta->magic = RING_BUFFER_META_MAGIC; + meta->struct_size = sizeof(*meta); + meta->nr_subbufs = nr_pages + 1; meta->subbuf_size = PAGE_SIZE; From eb2dcde9f970ed8d3669444d47c8524b4bdf7d32 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Fri, 28 Jun 2024 11:46:11 +0100 Subject: [PATCH 21/28] ring-buffer: Align meta-page to sub-buffers for improved TLB usage Previously, the mapped ring-buffer layout caused misalignment between the meta-page and sub-buffers when the sub-buffer size was not a multiple of PAGE_SIZE. This prevented hardware with larger TLB entries from utilizing them effectively. Add a padding with the zero-page between the meta-page and sub-buffers. Also update the ring-buffer map_test to verify that padding. 
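With this, the user-space view of a per-CPU mapping with a sub-buffer
order greater than zero looks roughly like:

	page 0                      : meta-page
	pages 1 .. subbuf_pages - 1 : zero-page padding
	pages subbuf_pages and up   : sub-buffers, each starting on a
	                              subbuf_pages boundary

and meta->meta_page_size now reports the padded, sub-buffer sized region
rather than a single PAGE_SIZE.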
Link: https://lore.kernel.org/20240628104611.1443542-1-vdonnefort@google.com Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 33 +++++++++++-------- .../testing/selftests/ring-buffer/map_test.c | 14 ++++++++ 2 files changed, 34 insertions(+), 13 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c3a5e6cbb9403..77dc0b25140e6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -6852,10 +6852,10 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, /* install subbuf ID to kern VA translation */ cpu_buffer->subbuf_ids = subbuf_ids; - meta->meta_page_size = PAGE_SIZE; meta->meta_struct_len = sizeof(*meta); meta->nr_subbufs = nr_subbufs; meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE; + meta->meta_page_size = meta->subbuf_size; rb_update_meta_page(cpu_buffer); } @@ -6949,6 +6949,12 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, !(vma->vm_flags & VM_MAYSHARE)) return -EPERM; + subbuf_order = cpu_buffer->buffer->subbuf_order; + subbuf_pages = 1 << subbuf_order; + + if (subbuf_order && pgoff % subbuf_pages) + return -EINVAL; + /* * Make sure the mapping cannot become writable later. Also tell the VM * to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND). @@ -6958,11 +6964,8 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, lockdep_assert_held(&cpu_buffer->mapping_lock); - subbuf_order = cpu_buffer->buffer->subbuf_order; - subbuf_pages = 1 << subbuf_order; - nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */ - nr_pages = ((nr_subbufs) << subbuf_order) - pgoff + 1; /* + meta-page */ + nr_pages = ((nr_subbufs + 1) << subbuf_order) - pgoff; /* + meta-page */ nr_vma_pages = vma_pages(vma); if (!nr_vma_pages || nr_vma_pages > nr_pages) @@ -6975,20 +6978,24 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, return -ENOMEM; if (!pgoff) { + unsigned long meta_page_padding; + pages[p++] = virt_to_page(cpu_buffer->meta_page); /* - * TODO: Align sub-buffers on their size, once - * vm_insert_pages() supports the zero-page. + * Pad with the zero-page to align the meta-page with the + * sub-buffers. 
*/ - } else { - /* Skip the meta-page */ - pgoff--; + meta_page_padding = subbuf_pages - 1; + while (meta_page_padding-- && p < nr_pages) { + unsigned long __maybe_unused zero_addr = + vma->vm_start + (PAGE_SIZE * p); - if (pgoff % subbuf_pages) { - err = -EINVAL; - goto out; + pages[p++] = ZERO_PAGE(zero_addr); } + } else { + /* Skip the meta-page */ + pgoff -= subbuf_pages; s += pgoff / subbuf_pages; } diff --git a/tools/testing/selftests/ring-buffer/map_test.c b/tools/testing/selftests/ring-buffer/map_test.c index a9006fa7097ed..4bb0192e43f38 100644 --- a/tools/testing/selftests/ring-buffer/map_test.c +++ b/tools/testing/selftests/ring-buffer/map_test.c @@ -228,6 +228,20 @@ TEST_F(map, data_mmap) data = mmap(NULL, data_len, PROT_READ, MAP_SHARED, desc->cpu_fd, meta_len); ASSERT_EQ(data, MAP_FAILED); + + /* Verify meta-page padding */ + if (desc->meta->meta_page_size > getpagesize()) { + void *addr; + + data_len = desc->meta->meta_page_size; + data = mmap(NULL, data_len, + PROT_READ, MAP_SHARED, desc->cpu_fd, 0); + ASSERT_NE(data, MAP_FAILED); + + addr = (void *)((unsigned long)data + getpagesize()); + ASSERT_EQ(*((int *)addr), 0); + munmap(data, data_len); + } } FIXTURE(snapshot) { From b6fc31b68731af71e408b4762ac0fbce4e40223e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 Aug 2024 21:39:03 -0400 Subject: [PATCH 22/28] tracing: Add "traceoff" flag to boot time tracing instances Add a "flags" delimiter (^) to the "trace_instance" kernel command line parameter, and add the "traceoff" flag. The format is: trace_instance=[^[^]][@][,] The code allows for more than one flag to be added, but currently only "traceoff" is done so. The motivation for this change came from debugging with the persistent ring buffer and having trace_printk() writing to it. The trace_printk calls are always enabled, and the boot after the crash was having the unwanted trace_printks from the current boot inject into the ring buffer with the trace_printks of the crash kernel, making the output very confusing. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Joel Fernandes Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vineeth Pillai Cc: Beau Belgrave Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Petkov Cc: "Paul E. McKenney" Cc: David Howells Cc: Mike Rapoport Cc: Dave Hansen Cc: Tony Luck Cc: Guenter Roeck Cc: Ross Zwisler Cc: Kees Cook Cc: Alexander Aring Cc: "Luis Claudio R. Goncalves" Cc: Tomas Glozar Cc: John Kacur Cc: Clark Williams Cc: Linus Torvalds Cc: "Jonathan Corbet" Link: https://lore.kernel.org/20240823014019.053229958@goodmis.org Signed-off-by: Steven Rostedt (Google) --- .../admin-guide/kernel-parameters.txt | 17 ++++++++++ kernel/trace/trace.c | 31 ++++++++++++++++++- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 388653448e728..3803f2b7f065b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6743,6 +6743,15 @@ the same thing would happen if it was left off). The irq_handler_entry event, and all events under the "initcall" system. + Flags can be added to the instance to modify its behavior when it is + created. The flags are separated by '^'. Currently there's only one flag + defined, and that's "traceoff", to have the tracing instance tracing + disabled after it is created. 
+ + trace_instance=foo^traceoff,sched,irq + + The flags must come before the defined events. + If memory has been reserved (see memmap for x86), the instance can use that memory: @@ -6765,6 +6774,14 @@ kernel versions where the validator will fail and reset the ring buffer if the layout is not the same as the previous kernel. + If the ring buffer is used for persistent bootups and has events enabled, + it is recommend to disable tracing so that events from a previous boot do not + mix with events of the current boot (unless you are debugging a random crash + at boot up). + + reserve_mem=12M:4096:trace trace_instance=boot_map^traceoff@trace,sched,irq + + trace_options=[option-list] [FTRACE] Enable or disable tracer options at boot. The option-list is a comma delimited list of options diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9bcef199ae90a..a79eefe84d6b3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -10468,10 +10468,36 @@ __init static void enable_instances(void) phys_addr_t start = 0; phys_addr_t size = 0; unsigned long addr = 0; + bool traceoff = false; + char *flag_delim; + char *addr_delim; tok = strsep(&curr_str, ","); - name = strsep(&tok, "@"); + flag_delim = strchr(tok, '^'); + addr_delim = strchr(tok, '@'); + + if (addr_delim) + *addr_delim++ = '\0'; + + if (flag_delim) + *flag_delim++ = '\0'; + + name = tok; + + if (flag_delim) { + char *flag; + + while ((flag = strsep(&flag_delim, "^"))) { + if (strcmp(flag, "traceoff") == 0) + traceoff = true; + else + pr_info("Tracing: Invalid instance flag '%s' for %s\n", + flag, name); + } + } + + tok = addr_delim; if (tok && isdigit(*tok)) { start = memparse(tok, &tok); if (!start) { @@ -10519,6 +10545,9 @@ __init static void enable_instances(void) continue; } + if (traceoff) + tracer_tracing_off(tr); + /* Only allow non mapped buffers to be deleted */ if (!start) trace_array_put(tr); From ddb8ea9e5ae482c469bcfd61cc83399bef67beb8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 Aug 2024 21:39:04 -0400 Subject: [PATCH 23/28] tracing: Allow trace_printk() to go to other instance buffers Currently, trace_printk() just goes to the top level ring buffer. But there may be times that it should go to one of the instances created by the kernel command line. Add a new trace_instance flag: traceprintk (also can use "printk" or "trace_printk" as people tend to forget the actual flag name). trace_instance=foo^traceprintk Will assign the trace_printk to this buffer at boot up. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Joel Fernandes Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vineeth Pillai Cc: Beau Belgrave Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Petkov Cc: "Paul E. McKenney" Cc: David Howells Cc: Mike Rapoport Cc: Dave Hansen Cc: Tony Luck Cc: Guenter Roeck Cc: Ross Zwisler Cc: Kees Cook Cc: Alexander Aring Cc: "Luis Claudio R. 
Goncalves" Cc: Tomas Glozar Cc: John Kacur Cc: Clark Williams Cc: Linus Torvalds Cc: "Jonathan Corbet" Link: https://lore.kernel.org/20240823014019.226694946@goodmis.org Signed-off-by: Steven Rostedt (Google) --- .../admin-guide/kernel-parameters.txt | 14 ++++-- kernel/trace/trace.c | 46 ++++++++++++++----- 2 files changed, 45 insertions(+), 15 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3803f2b7f065b..a8803c0c0a89c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6744,11 +6744,17 @@ event, and all events under the "initcall" system. Flags can be added to the instance to modify its behavior when it is - created. The flags are separated by '^'. Currently there's only one flag - defined, and that's "traceoff", to have the tracing instance tracing - disabled after it is created. + created. The flags are separated by '^'. - trace_instance=foo^traceoff,sched,irq + The available flags are: + + traceoff - Have the tracing instance tracing disabled after it is created. + traceprintk - Have trace_printk() write into this trace instance + (note, "printk" and "trace_printk" can also be used) + Currently, traceprintk flag cannot be used for memory + mapped ring buffers as described below. + + trace_instance=foo^traceoff^traceprintk,sched,irq The flags must come before the defined events. diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a79eefe84d6b3..8e28f19f5316c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -500,6 +500,8 @@ static struct trace_array global_trace = { .trace_flags = TRACE_DEFAULT_FLAGS, }; +static struct trace_array *printk_trace = &global_trace; + void trace_set_ring_buffer_expanded(struct trace_array *tr) { if (!tr) @@ -1117,7 +1119,7 @@ EXPORT_SYMBOL_GPL(__trace_array_puts); */ int __trace_puts(unsigned long ip, const char *str, int size) { - return __trace_array_puts(&global_trace, ip, str, size); + return __trace_array_puts(printk_trace, ip, str, size); } EXPORT_SYMBOL_GPL(__trace_puts); @@ -1128,6 +1130,7 @@ EXPORT_SYMBOL_GPL(__trace_puts); */ int __trace_bputs(unsigned long ip, const char *str) { + struct trace_array *tr = printk_trace; struct ring_buffer_event *event; struct trace_buffer *buffer; struct bputs_entry *entry; @@ -1135,14 +1138,14 @@ int __trace_bputs(unsigned long ip, const char *str) int size = sizeof(struct bputs_entry); int ret = 0; - if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) + if (!(tr->trace_flags & TRACE_ITER_PRINTK)) return 0; if (unlikely(tracing_selftest_running || tracing_disabled)) return 0; trace_ctx = tracing_gen_ctx(); - buffer = global_trace.array_buffer.buffer; + buffer = tr->array_buffer.buffer; ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, @@ -1155,7 +1158,7 @@ int __trace_bputs(unsigned long ip, const char *str) entry->str = str; __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL); + ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); ret = 1; out: @@ -3025,7 +3028,7 @@ void trace_dump_stack(int skip) /* Skip 1 to skip this function. 
*/ skip++; #endif - __ftrace_trace_stack(global_trace.array_buffer.buffer, + __ftrace_trace_stack(printk_trace->array_buffer.buffer, tracing_gen_ctx(), skip, NULL); } EXPORT_SYMBOL_GPL(trace_dump_stack); @@ -3244,7 +3247,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) struct trace_event_call *call = &event_bprint; struct ring_buffer_event *event; struct trace_buffer *buffer; - struct trace_array *tr = &global_trace; + struct trace_array *tr = printk_trace; struct bprint_entry *entry; unsigned int trace_ctx; char *tbuffer; @@ -3342,7 +3345,7 @@ __trace_array_vprintk(struct trace_buffer *buffer, memcpy(&entry->buf, tbuffer, len + 1); if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(&global_trace, buffer, trace_ctx, 6, NULL); + ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL); } out: @@ -3438,7 +3441,7 @@ int trace_array_printk_buf(struct trace_buffer *buffer, int ret; va_list ap; - if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) + if (!(printk_trace->trace_flags & TRACE_ITER_PRINTK)) return 0; va_start(ap, fmt); @@ -3450,7 +3453,7 @@ int trace_array_printk_buf(struct trace_buffer *buffer, __printf(2, 0) int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { - return trace_array_vprintk(&global_trace, ip, fmt, args); + return trace_array_vprintk(printk_trace, ip, fmt, args); } EXPORT_SYMBOL_GPL(trace_vprintk); @@ -9666,6 +9669,9 @@ static int __remove_instance(struct trace_array *tr) set_tracer_flag(tr, 1 << i, 0); } + if (printk_trace == tr) + printk_trace = &global_trace; + tracing_set_nop(tr); clear_ftrace_function_probes(tr); event_trace_del_tracer(tr); @@ -10468,6 +10474,7 @@ __init static void enable_instances(void) phys_addr_t start = 0; phys_addr_t size = 0; unsigned long addr = 0; + bool traceprintk = false; bool traceoff = false; char *flag_delim; char *addr_delim; @@ -10489,11 +10496,16 @@ __init static void enable_instances(void) char *flag; while ((flag = strsep(&flag_delim, "^"))) { - if (strcmp(flag, "traceoff") == 0) + if (strcmp(flag, "traceoff") == 0) { traceoff = true; - else + } else if ((strcmp(flag, "printk") == 0) || + (strcmp(flag, "traceprintk") == 0) || + (strcmp(flag, "trace_printk") == 0)) { + traceprintk = true; + } else { pr_info("Tracing: Invalid instance flag '%s' for %s\n", flag, name); + } } } @@ -10548,6 +10560,18 @@ __init static void enable_instances(void) if (traceoff) tracer_tracing_off(tr); + if (traceprintk) { + /* + * The binary format of traceprintk can cause a crash if used + * by a buffer from another boot. Do not allow it for the + * memory mapped ring buffers. + */ + if (start) + pr_warn("Tracing: WARNING: memory mapped ring buffers cannot be used for trace_printk\n"); + else + printk_trace = tr; + } + /* Only allow non mapped buffers to be deleted */ if (!start) trace_array_put(tr); From 9b7bdf6f6ece6ea888cc7d2f02c00b403b66a119 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 Aug 2024 21:39:05 -0400 Subject: [PATCH 24/28] tracing: Have trace_printk not use binary prints if boot buffer If the persistent boot mapped ring buffer is used for trace_printk(), force it to not use the binary versions. trace_printk() by default uses bin_printf() that only saves the pointer to the format and not the format itself inside the ring buffer. But for a persistent buffer that is read after reboot, the pointers to the format strings may not be the same, or worse, not even exist! 
Instead, just force the more robust, but slower, version that does the formatting before saving into the ring buffer. The boot mapped buffer can now be used for trace_printk and friends! Using the trace_printk() and the persistent buffer was used to debug the issue with the osnoise tracer: Link: https://lore.kernel.org/all/20240822103443.6a6ae051@gandalf.local.home/ Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Joel Fernandes Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vineeth Pillai Cc: Beau Belgrave Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Petkov Cc: "Paul E. McKenney" Cc: David Howells Cc: Mike Rapoport Cc: Dave Hansen Cc: Tony Luck Cc: Guenter Roeck Cc: Ross Zwisler Cc: Kees Cook Cc: Alexander Aring Cc: "Luis Claudio R. Goncalves" Cc: Tomas Glozar Cc: John Kacur Cc: Clark Williams Cc: Linus Torvalds Cc: "Jonathan Corbet" Link: https://lore.kernel.org/20240823014019.386925800@goodmis.org Signed-off-by: Steven Rostedt (Google) --- .../admin-guide/kernel-parameters.txt | 4 +- kernel/trace/trace.c | 44 ++++++++++++------- kernel/trace/trace.h | 3 +- kernel/trace/trace_output.c | 5 ++- 4 files changed, 36 insertions(+), 20 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a8803c0c0a89c..9e507e6cb4c8a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6751,8 +6751,6 @@ traceoff - Have the tracing instance tracing disabled after it is created. traceprintk - Have trace_printk() write into this trace instance (note, "printk" and "trace_printk" can also be used) - Currently, traceprintk flag cannot be used for memory - mapped ring buffers as described below. trace_instance=foo^traceoff^traceprintk,sched,irq @@ -6785,7 +6783,7 @@ mix with events of the current boot (unless you are debugging a random crash at boot up). - reserve_mem=12M:4096:trace trace_instance=boot_map^traceoff@trace,sched,irq + reserve_mem=12M:4096:trace trace_instance=boot_map^traceoff^traceprintk@trace,sched,irq trace_options=[option-list] diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8e28f19f5316c..35b37c9aa26c5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -502,6 +502,17 @@ static struct trace_array global_trace = { static struct trace_array *printk_trace = &global_trace; +static __always_inline bool printk_binsafe(struct trace_array *tr) +{ + /* + * The binary format of traceprintk can cause a crash if used + * by a buffer from another boot. Force the use of the + * non binary version of trace_printk if the trace_printk + * buffer is a boot mapped ring buffer. 
+ */ + return !(tr->flags & TRACE_ARRAY_FL_BOOT); +} + void trace_set_ring_buffer_expanded(struct trace_array *tr) { if (!tr) @@ -1130,7 +1141,7 @@ EXPORT_SYMBOL_GPL(__trace_puts); */ int __trace_bputs(unsigned long ip, const char *str) { - struct trace_array *tr = printk_trace; + struct trace_array *tr = READ_ONCE(printk_trace); struct ring_buffer_event *event; struct trace_buffer *buffer; struct bputs_entry *entry; @@ -1138,6 +1149,9 @@ int __trace_bputs(unsigned long ip, const char *str) int size = sizeof(struct bputs_entry); int ret = 0; + if (!printk_binsafe(tr)) + return __trace_puts(ip, str, strlen(str)); + if (!(tr->trace_flags & TRACE_ITER_PRINTK)) return 0; @@ -3247,12 +3261,15 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) struct trace_event_call *call = &event_bprint; struct ring_buffer_event *event; struct trace_buffer *buffer; - struct trace_array *tr = printk_trace; + struct trace_array *tr = READ_ONCE(printk_trace); struct bprint_entry *entry; unsigned int trace_ctx; char *tbuffer; int len = 0, size; + if (!printk_binsafe(tr)) + return trace_vprintk(ip, fmt, args); + if (unlikely(tracing_selftest_running || tracing_disabled)) return 0; @@ -10560,20 +10577,17 @@ __init static void enable_instances(void) if (traceoff) tracer_tracing_off(tr); - if (traceprintk) { - /* - * The binary format of traceprintk can cause a crash if used - * by a buffer from another boot. Do not allow it for the - * memory mapped ring buffers. - */ - if (start) - pr_warn("Tracing: WARNING: memory mapped ring buffers cannot be used for trace_printk\n"); - else - printk_trace = tr; - } + if (traceprintk) + printk_trace = tr; - /* Only allow non mapped buffers to be deleted */ - if (!start) + /* + * If start is set, then this is a mapped buffer, and + * cannot be deleted by user space, so keep the reference + * to it. + */ + if (start) + tr->flags |= TRACE_ARRAY_FL_BOOT; + else trace_array_put(tr); while ((tok = strsep(&curr_str, ","))) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4f448ab2d1e7c..07b2d2af9b339 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -429,7 +429,8 @@ struct trace_array { }; enum { - TRACE_ARRAY_FL_GLOBAL = (1 << 0) + TRACE_ARRAY_FL_GLOBAL = BIT(0), + TRACE_ARRAY_FL_BOOT = BIT(1), }; extern struct list_head ftrace_trace_arrays; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 48de935988972..868f2f912f280 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1591,10 +1591,13 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter, { struct print_entry *field; struct trace_seq *s = &iter->seq; + unsigned long ip; trace_assign_type(field, iter->ent); - seq_print_ip_sym(s, field->ip, flags); + ip = field->ip + iter->tr->text_delta; + + seq_print_ip_sym(s, ip, flags); trace_seq_printf(s, ": %s", field->buf); return trace_handle_return(s); From ef2bd81d0c95616fab718738be48d7cc9b23e33d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 Aug 2024 21:39:06 -0400 Subject: [PATCH 25/28] tracing: Add option to set an instance to be the trace_printk destination Add a option "trace_printk_dest" that will make the tracing instance the location that trace_printk() will go to. This is useful if the trace_printk or one of the top level tracers is too noisy and there's a need to separate the two. Then an instance can be created, the trace_printk can be set to go there instead, where it will not be lost in the noise of the top level tracer. 
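For example, a debugging session might set this up at run time (a sketch, not part of this patch; the instance name "printk_dest" is arbitrary and tracefs is assumed to be mounted at /sys/kernel/tracing):

  # Create a dedicated instance and route trace_printk() output to it
  mkdir /sys/kernel/tracing/instances/printk_dest
  echo 1 > /sys/kernel/tracing/instances/printk_dest/options/trace_printk_dest

  # trace_printk() writes now land here instead of in the top level buffer
  cat /sys/kernel/tracing/instances/printk_dest/trace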
Note, only one instance can be the destination of trace_printk at a time. If an instance sets this flag, the instance that had it set will have it cleared. There is always one instance that has this set. By default, that is the top instance. This flag cannot be cleared from the top instance. Doing so will result in an -EINVAL. The only way this flag can be cleared from the top instance is by another instance setting it. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Joel Fernandes Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vineeth Pillai Cc: Beau Belgrave Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Petkov Cc: "Paul E. McKenney" Cc: David Howells Cc: Mike Rapoport Cc: Dave Hansen Cc: Tony Luck Cc: Guenter Roeck Cc: Ross Zwisler Cc: Kees Cook Cc: Alexander Aring Cc: "Luis Claudio R. Goncalves" Cc: Tomas Glozar Cc: John Kacur Cc: Clark Williams Cc: Linus Torvalds Cc: "Jonathan Corbet" Link: https://lore.kernel.org/20240823014019.545459018@goodmis.org Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/ftrace.rst | 12 ++++++++++ kernel/trace/trace.c | 40 +++++++++++++++++++++++++++++----- kernel/trace/trace.h | 1 + 3 files changed, 48 insertions(+), 5 deletions(-) diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index 5aba74872ba77..4073ca48af4ad 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -1186,6 +1186,18 @@ Here are the available options: trace_printk Can disable trace_printk() from writing into the buffer. + trace_printk_dest + Set to have trace_printk() and similar internal tracing functions + write into this instance. Note, only one trace instance can have + this set. By setting this flag, it clears the trace_printk_dest flag + of the instance that had it set previously. By default, the top + level trace has this set, and will get it set again if another + instance has it set then clears it. + + This flag cannot be cleared by the top level instance, as it is the + default instance. The only way the top level instance has this flag + cleared, is by it being set in another instance. 
+ annotate It is sometimes confusing when the CPU buffers are full and one CPU buffer had a lot of events recently, thus diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 35b37c9aa26c5..658b40b483a34 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -482,7 +482,7 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export); TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \ TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \ TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \ - TRACE_ITER_HASH_PTR) + TRACE_ITER_HASH_PTR | TRACE_ITER_TRACE_PRINTK) /* trace_options that are only supported by global_trace */ #define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \ @@ -490,7 +490,7 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export); /* trace_flags that are default zero for instances */ #define ZEROED_TRACE_FLAGS \ - (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK) + (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK | TRACE_ITER_TRACE_PRINTK) /* * The global_trace is the descriptor that holds the top-level tracing @@ -513,6 +513,16 @@ static __always_inline bool printk_binsafe(struct trace_array *tr) return !(tr->flags & TRACE_ARRAY_FL_BOOT); } +static void update_printk_trace(struct trace_array *tr) +{ + if (printk_trace == tr) + return; + + printk_trace->trace_flags &= ~TRACE_ITER_TRACE_PRINTK; + printk_trace = tr; + tr->trace_flags |= TRACE_ITER_TRACE_PRINTK; +} + void trace_set_ring_buffer_expanded(struct trace_array *tr) { if (!tr) @@ -5300,7 +5310,8 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) { if ((mask == TRACE_ITER_RECORD_TGID) || - (mask == TRACE_ITER_RECORD_CMD)) + (mask == TRACE_ITER_RECORD_CMD) || + (mask == TRACE_ITER_TRACE_PRINTK)) lockdep_assert_held(&event_mutex); /* do nothing if flag is already set */ @@ -5312,6 +5323,25 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) if (tr->current_trace->flag_changed(tr, mask, !!enabled)) return -EINVAL; + if (mask == TRACE_ITER_TRACE_PRINTK) { + if (enabled) { + update_printk_trace(tr); + } else { + /* + * The global_trace cannot clear this. + * It's flag only gets cleared if another instance sets it. + */ + if (printk_trace == &global_trace) + return -EINVAL; + /* + * An instance must always have it set. + * by default, that's the global_trace instane. 
+ */ + if (printk_trace == tr) + update_printk_trace(&global_trace); + } + } + if (enabled) tr->trace_flags |= mask; else @@ -9687,7 +9717,7 @@ static int __remove_instance(struct trace_array *tr) } if (printk_trace == tr) - printk_trace = &global_trace; + update_printk_trace(&global_trace); tracing_set_nop(tr); clear_ftrace_function_probes(tr); @@ -10578,7 +10608,7 @@ __init static void enable_instances(void) tracer_tracing_off(tr); if (traceprintk) - printk_trace = tr; + update_printk_trace(tr); /* * If start is set, then this is a mapped buffer, and diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 07b2d2af9b339..c866991b9c78b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1321,6 +1321,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(IRQ_INFO, "irq-info"), \ C(MARKERS, "markers"), \ C(EVENT_FORK, "event-fork"), \ + C(TRACE_PRINTK, "trace_printk_dest"), \ C(PAUSE_ON_TRACE, "pause-on-trace"), \ C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \ FUNCTION_FLAGS \ From 2fcd5aff92aab479a9a89cfce2dbc9c6a9455b4f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 Aug 2024 21:39:07 -0400 Subject: [PATCH 26/28] tracing/Documentation: Start a document on how to debug with tracing Add a new document Documentation/trace/debugging.rst that will hold various ways to debug tracing. This initial version mentions trace_printk and how to create persistent buffers that can last across bootups. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Joel Fernandes Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vineeth Pillai Cc: Beau Belgrave Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Petkov Cc: "Paul E. McKenney" Cc: David Howells Cc: Mike Rapoport Cc: Dave Hansen Cc: Tony Luck Cc: Guenter Roeck Cc: Ross Zwisler Cc: Kees Cook Cc: Alexander Aring Cc: "Luis Claudio R. Goncalves" Cc: Tomas Glozar Cc: John Kacur Cc: Clark Williams Cc: Linus Torvalds Cc: "Jonathan Corbet" Link: https://lore.kernel.org/20240823014019.702433486@goodmis.org Signed-off-by: Steven Rostedt (Google) --- .../admin-guide/kernel-parameters.txt | 2 + Documentation/trace/debugging.rst | 159 ++++++++++++++++++ 2 files changed, 161 insertions(+) create mode 100644 Documentation/trace/debugging.rst diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9e507e6cb4c8a..9bb50dc783382 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6785,6 +6785,8 @@ reserve_mem=12M:4096:trace trace_instance=boot_map^traceoff^traceprintk@trace,sched,irq + See also Documentation/trace/debugging.rst + trace_options=[option-list] [FTRACE] Enable or disable tracer options at boot. diff --git a/Documentation/trace/debugging.rst b/Documentation/trace/debugging.rst new file mode 100644 index 0000000000000..54fb16239d703 --- /dev/null +++ b/Documentation/trace/debugging.rst @@ -0,0 +1,159 @@ +============================== +Using the tracer for debugging +============================== + +Copyright 2024 Google LLC. + +:Author: Steven Rostedt +:License: The GNU Free Documentation License, Version 1.2 + (dual licensed under the GPL v2) + +- Written for: 6.12 + +Introduction +------------ +The tracing infrastructure can be very useful for debugging the Linux +kernel. This document is a place to add various methods of using the tracer +for debugging. 
+ +First, make sure that the tracefs file system is mounted:: + + $ sudo mount -t tracefs tracefs /sys/kernel/tracing + + +Using trace_printk() +-------------------- + +trace_printk() is a very lightweight utility that can be used in any context +inside the kernel, with the exception of "noinstr" sections. It can be used +in normal, softirq, interrupt and even NMI context. The trace data is +written to the tracing ring buffer in a lockless way. To make it even +lighter weight, when possible, it will only record the pointer to the format +string, and save the raw arguments into the buffer. The format and the +arguments will be post processed when the ring buffer is read. This way the +trace_printk() format conversions are not done during the hot path, where +the trace is being recorded. + +trace_printk() is meant only for debugging, and should never be added into +a subsystem of the kernel. If you need debugging traces, add trace events +instead. If a trace_printk() is found in the kernel, the following will +appear in the dmesg:: + + ********************************************************** + ** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE ** + ** ** + ** trace_printk() being used. Allocating extra memory. ** + ** ** + ** This means that this is a DEBUG kernel and it is ** + ** unsafe for production use. ** + ** ** + ** If you see this message and you are not debugging ** + ** the kernel, report this immediately to your vendor! ** + ** ** + ** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE ** + ********************************************************** + +Debugging kernel crashes +------------------------ +There is various methods of acquiring the state of the system when a kernel +crash occurs. This could be from the oops message in printk, or one could +use kexec/kdump. But these just show what happened at the time of the crash. +It can be very useful in knowing what happened up to the point of the crash. +The tracing ring buffer, by default, is a circular buffer than will +overwrite older events with newer ones. When a crash happens, the content of +the ring buffer will be all the events that lead up to the crash. + +There are several kernel command line parameters that can be used to help in +this. The first is "ftrace_dump_on_oops". This will dump the tracing ring +buffer when a oops occurs to the console. This can be useful if the console +is being logged somewhere. If a serial console is used, it may be prudent to +make sure the ring buffer is relatively small, otherwise the dumping of the +ring buffer may take several minutes to hours to finish. Here's an example +of the kernel command line:: + + ftrace_dump_on_oops trace_buf_size=50K + +Note, the tracing buffer is made up of per CPU buffers where each of these +buffers is broken up into sub-buffers that are by default PAGE_SIZE. The +above trace_buf_size option above sets each of the per CPU buffers to 50K, +so, on a machine with 8 CPUs, that's actually 400K total. + +Persistent buffers across boots +------------------------------- +If the system memory allows it, the tracing ring buffer can be specified at +a specific location in memory. If the location is the same across boots and +the memory is not modified, the tracing buffer can be retrieved from the +following boot. There's two ways to reserve memory for the use of the ring +buffer. + +The more reliable way (on x86) is to reserve memory with the "memmap" kernel +command line option and then use that memory for the trace_instance. 
This +requires a bit of knowledge of the physical memory layout of the system. The +advantage of using this method, is that the memory for the ring buffer will +always be the same:: + + memmap==12M$0x284500000 trace_instance=boot_map@0x284500000:12M + +The memmap above reserves 12 megabytes of memory at the physical memory +location 0x284500000. Then the trace_instance option will create a trace +instance "boot_map" at that same location with the same amount of memory +reserved. As the ring buffer is broke up into per CPU buffers, the 12 +megabytes will be broken up evenly between those CPUs. If you have 8 CPUs, +each per CPU ring buffer will be 1.5 megabytes in size. Note, that also +includes meta data, so the amount of memory actually used by the ring buffer +will be slightly smaller. + +Another more generic but less robust way to allocate a ring buffer mapping +at boot is with the "reserve_mem" option:: + + reserve_mem=12M:4096:trace trace_instance=boot_map@trace + +The reserve_mem option above will find 12 megabytes that are available at +boot up, and align it by 4096 bytes. It will label this memory as "trace" +that can be used by later command line options. + +The trace_instance option creates a "boot_map" instance and will use the +memory reserved by reserve_mem that was labeled as "trace". This method is +more generic but may not be as reliable. Due to KASLR, the memory reserved +by reserve_mem may not be located at the same location. If this happens, +then the ring buffer will not be from the previous boot and will be reset. + +Sometimes, by using a larger alignment, it can keep KASLR from moving things +around in such a way that it will move the location of the reserve_mem. By +using a larger alignment, you may find better that the buffer is more +consistent to where it is placed:: + + reserve_mem=12M:0x2000000:trace trace_instance=boot_map@trace + +On boot up, the memory reserved for the ring buffer is validated. It will go +through a series of tests to make sure that the ring buffer contains valid +data. If it is, it will then set it up to be available to read from the +instance. If it fails any of the tests, it will clear the entire ring buffer +and initialize it as new. + +The layout of this mapped memory may not be consistent from kernel to +kernel, so only the same kernel is guaranteed to work if the mapping is +preserved. Switching to a different kernel version may find a different +layout and mark the buffer as invalid. + +Using trace_printk() in the boot instance +----------------------------------------- +By default, the content of trace_printk() goes into the top level tracing +instance. But this instance is never preserved across boots. To have the +trace_printk() content, and some other internal tracing go to the preserved +buffer (like dump stacks), either set the instance to be the trace_printk() +destination from the kernel command line, or set it after boot up via the +trace_printk_dest option. + +After boot up:: + + echo 1 > /sys/kernel/tracing/instances/boot_map/options/trace_printk_dest + +From the kernel command line:: + + reserve_mem=12M:4096:trace trace_instance=boot_map^traceprintk^traceoff@trace + +If setting it from the kernel command line, it is recommended to also +disable tracing with the "traceoff" flag, and enable tracing after boot up. +Otherwise the trace from the most recent boot will be mixed with the trace +from the previous boot, and may make it confusing to read. 
From 21ff365b5c88c0bf8447989aadb5d8fe401c9cfc Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Tue, 10 Sep 2024 17:23:34 +0100 Subject: [PATCH 27/28] selftests/ring-buffer: Verify the entire meta-page padding Improve the ring-buffer meta-page test coverage by checking for the entire padding region to be 0 instead of just looking at the first 4 bytes. Cc: linux-kselftest@vger.kernel.org Link: https://lore.kernel.org/20240910162335.2993310-2-vdonnefort@google.com Acked-by: Shuah Khan Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- tools/testing/selftests/ring-buffer/map_test.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/ring-buffer/map_test.c b/tools/testing/selftests/ring-buffer/map_test.c index 4bb0192e43f38..ba12fd31de87e 100644 --- a/tools/testing/selftests/ring-buffer/map_test.c +++ b/tools/testing/selftests/ring-buffer/map_test.c @@ -231,15 +231,15 @@ TEST_F(map, data_mmap) /* Verify meta-page padding */ if (desc->meta->meta_page_size > getpagesize()) { - void *addr; - data_len = desc->meta->meta_page_size; data = mmap(NULL, data_len, PROT_READ, MAP_SHARED, desc->cpu_fd, 0); ASSERT_NE(data, MAP_FAILED); - addr = (void *)((unsigned long)data + getpagesize()); - ASSERT_EQ(*((int *)addr), 0); + for (int i = desc->meta->meta_struct_len; + i < desc->meta->meta_page_size; i += sizeof(int)) + ASSERT_EQ(*(int *)(data + i), 0); + munmap(data, data_len); } } From 75d7ff9aa0ae1a8d1b3f9c8c87dde3a4fbe9a2cf Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Tue, 10 Sep 2024 17:23:35 +0100 Subject: [PATCH 28/28] selftests/ring-buffer: Handle meta-page bigger than the system Handle the case where the meta-page content is bigger than the system page-size. This prepares the ground for extending features covered by the meta-page. Cc: Shuah Khan Cc: linux-kselftest@vger.kernel.org Link: https://lore.kernel.org/20240910162335.2993310-3-vdonnefort@google.com Acked-by: Shuah Khan Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- tools/testing/selftests/ring-buffer/map_test.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/testing/selftests/ring-buffer/map_test.c b/tools/testing/selftests/ring-buffer/map_test.c index ba12fd31de87e..d10a847130fbc 100644 --- a/tools/testing/selftests/ring-buffer/map_test.c +++ b/tools/testing/selftests/ring-buffer/map_test.c @@ -92,12 +92,22 @@ int tracefs_cpu_map(struct tracefs_cpu_map_desc *desc, int cpu) if (desc->cpu_fd < 0) return -ENODEV; +again: map = mmap(NULL, page_size, PROT_READ, MAP_SHARED, desc->cpu_fd, 0); if (map == MAP_FAILED) return -errno; desc->meta = (struct trace_buffer_meta *)map; + /* the meta-page is bigger than the original mapping */ + if (page_size < desc->meta->meta_struct_len) { + int meta_page_size = desc->meta->meta_page_size; + + munmap(desc->meta, page_size); + page_size = meta_page_size; + goto again; + } + return 0; }
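For completeness, the updated map_test can be exercised through the normal
kselftest harness; a sketch, assuming a kernel built with tracefs and the
ring-buffer user-space mapping support:

  # From the top of the kernel source tree
  make -C tools/testing/selftests TARGETS=ring-buffer run_tests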