From 418c59e49ddc77fcb7054f2c8d52c9d47403b43e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Dec 2012 13:15:08 -0500 Subject: [PATCH 01/25] tracing: Fix sparse warning with is_signed_type() macro Sparse complains when is_signed_type() is used on a pointer. This macro is needed for the format output used for ftrace and perf, to know if a binary field is a signed type or not. The is_signed_type() macro is used against all fields that are recorded by events to automate the operation. The problem sparse has is with the current way is_signed_type() works: ((type)-1 < 0) If "type" is a poiner, than sparse does not like it being compared to an integer (zero). The simple fix is to just give zero the same type. The runtime result stays the same. Reported-by: Robert Jarzmik Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index a3d489531d83a..43ef8b6cce7fb 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -272,7 +272,7 @@ extern int trace_define_field(struct ftrace_event_call *call, const char *type, extern int trace_add_event_call(struct ftrace_event_call *call); extern void trace_remove_event_call(struct ftrace_event_call *call); -#define is_signed_type(type) (((type)(-1)) < 0) +#define is_signed_type(type) (((type)(-1)) < (type)0) int trace_set_clr_event(const char *system, const char *event, int set); From 771e03842a9e98a1c2013ca1ed8bb2793488f3e5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 30 Nov 2012 10:41:57 -0500 Subject: [PATCH 02/25] ring-buffer: Remove unnecessary recusive call in rb_advance_iter() The original ring-buffer code had special checks at the start of rb_advance_iter() and instead of repeating them again at the end of the function if a certain condition existed, I just did a recursive call to rb_advance_iter() because the special condition would cause rb_advance_iter() to return early (after the checks). But as things have changed, the special checks no longer exist and the only thing done for the special_condition is to call rb_inc_iter() and return. Instead of doing a confusing recursive call, just call rb_inc_iter instead. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ce8514feedcdf..6ff9cc4658ed7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3425,7 +3425,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) /* check for end of page padding */ if ((iter->head >= rb_page_size(iter->head_page)) && (iter->head_page != cpu_buffer->commit_page)) - rb_advance_iter(iter); + rb_inc_iter(iter); } static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) From d8a0349c0cea477322c66ea9362f10c62fad5f62 Mon Sep 17 00:00:00 2001 From: Shan Wei Date: Tue, 13 Nov 2012 09:53:04 +0800 Subject: [PATCH 03/25] tracing: Use this_cpu_ptr per-cpu helper typeof(&buffer) is a pointer to array of 1024 char, or char (*)[1024]. But, typeof(&buffer[0]) is a pointer to char which match the return type of get_trace_buf(). As well-known, the value of &buffer is equal to &buffer[0]. so return this_cpu_ptr(&percpu_buffer->buffer[0]) can avoid type cast. Link: http://lkml.kernel.org/r/50A1A800.3020102@gmail.com Reviewed-by: Christoph Lameter Signed-off-by: Shan Wei Signed-off-by: Steven Rostedt --- kernel/trace/blktrace.c | 2 +- kernel/trace/trace.c | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c0bd0308741ca..71259e2b6b616 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -147,7 +147,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) return; local_irq_save(flags); - buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); + buf = this_cpu_ptr(bt->msg_data); va_start(args, fmt); n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3c13e46d7d24e..f8b7c626f3fd6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1517,7 +1517,6 @@ static struct trace_buffer_struct *trace_percpu_nmi_buffer; static char *get_trace_buf(void) { struct trace_buffer_struct *percpu_buffer; - struct trace_buffer_struct *buffer; /* * If we have allocated per cpu buffers, then we do not @@ -1535,9 +1534,7 @@ static char *get_trace_buf(void) if (!percpu_buffer) return NULL; - buffer = per_cpu_ptr(percpu_buffer, smp_processor_id()); - - return buffer->buffer; + return this_cpu_ptr(&percpu_buffer->buffer[0]); } static int alloc_percpu_trace_buffer(void) From d24d7dbf3cc49b00a152e55e24f0eeb173c7a971 Mon Sep 17 00:00:00 2001 From: Jovi Zhang Date: Wed, 18 Jul 2012 18:16:44 +0800 Subject: [PATCH 04/25] tracing: Verify target file before registering a uprobe event Without this patch, we can register a uprobe event for a directory. Enabling such a uprobe event would anyway fail. Example: $ echo 'p /bin:0x4245c0' > /sys/kernel/debug/tracing/uprobe_events However dirctories cannot be valid targets for uprobe. Hence verify if the target is a regular file during the probe registration. Link: http://lkml.kernel.org/r/20130103004212.690763002@goodmis.org Cc: Namhyung Kim Signed-off-by: Jovi Zhang Acked-by: Srikar Dronamraju [ cleaned up whitespace and removed redundant IS_DIR() check ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_uprobe.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c86e6d4f67fbf..87b6db4ccbc5a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -258,6 +258,10 @@ static int create_trace_uprobe(int argc, char **argv) goto fail_address_parse; inode = igrab(path.dentry->d_inode); + if (!S_ISREG(inode->i_mode)) { + ret = -EINVAL; + goto fail_address_parse; + } argc -= 2; argv += 2; @@ -356,7 +360,7 @@ static int create_trace_uprobe(int argc, char **argv) if (inode) iput(inode); - pr_info("Failed to parse address.\n"); + pr_info("Failed to parse address or file.\n"); return ret; } From 6aea49cb5f3001a8275bf9c9f586ec3eb39af194 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Wed, 21 Nov 2012 15:13:47 +0800 Subject: [PATCH 05/25] tracing/syscalls: Make local functions static Some functions in the syscall tracing is used only locally to the file, but they are labeled global. Convert them to static functions. Signed-off-by: Fengguang Wu Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 7609dd6714c2a..5329e13e74a13 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -77,7 +77,7 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr) return syscalls_metadata[nr]; } -enum print_line_t +static enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_event *event) { @@ -130,7 +130,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags, return TRACE_TYPE_HANDLED; } -enum print_line_t +static enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags, struct trace_event *event) { @@ -270,7 +270,7 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call) return ret; } -void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) +static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) { struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; @@ -305,7 +305,7 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) +static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) { struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; @@ -337,7 +337,7 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -int reg_event_syscall_enter(struct ftrace_event_call *call) +static int reg_event_syscall_enter(struct ftrace_event_call *call) { int ret = 0; int num; @@ -356,7 +356,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call) return ret; } -void unreg_event_syscall_enter(struct ftrace_event_call *call) +static void unreg_event_syscall_enter(struct ftrace_event_call *call) { int num; @@ -371,7 +371,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call) mutex_unlock(&syscall_trace_lock); } -int reg_event_syscall_exit(struct ftrace_event_call *call) +static int reg_event_syscall_exit(struct ftrace_event_call *call) { int ret = 0; int num; @@ -390,7 +390,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call) return ret; } -void unreg_event_syscall_exit(struct ftrace_event_call *call) +static void unreg_event_syscall_exit(struct ftrace_event_call *call) { int num; @@ -459,7 +459,7 @@ unsigned long __init __weak arch_syscall_addr(int nr) return (unsigned long)sys_call_table[nr]; } -int __init init_ftrace_syscalls(void) +static int __init init_ftrace_syscalls(void) { struct syscall_metadata *meta; unsigned long addr; From a54164114b96b4693b42cdb553260eec41ea4393 Mon Sep 17 00:00:00 2001 From: Hiraku Toyooka Date: Wed, 19 Dec 2012 16:02:34 +0900 Subject: [PATCH 06/25] tracing: Add checks if tr->buffer is NULL in tracing_reset{_online_cpus} max_tr->buffer could be NULL in the tracing_reset{_online_cpus}. In this case, a NULL pointer dereference happens, so we should return immediately from these functions. Note, the current code does not call tracing_reset*() with max_tr when its buffer is NULL, but future code will. This patch is needed to prevent the future code from crashing. Link: http://lkml.kernel.org/r/20121219070234.31200.93863.stgit@liselsia Signed-off-by: Hiraku Toyooka Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f8b7c626f3fd6..72b171b90e558 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -922,6 +922,9 @@ void tracing_reset(struct trace_array *tr, int cpu) { struct ring_buffer *buffer = tr->buffer; + if (!buffer) + return; + ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ @@ -936,6 +939,9 @@ void tracing_reset_online_cpus(struct trace_array *tr) struct ring_buffer *buffer = tr->buffer; int cpu; + if (!buffer) + return; + ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ From 84c6cf0db6a00601eb43cfc08244a398ffb0894c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 20 Dec 2012 21:43:52 -0500 Subject: [PATCH 07/25] tracing: Remove unneeded check of max_tr->buffer before tracing_reset There's now a check in tracing_reset_online_cpus() if the buffer is allocated or NULL. No need to do a check before calling it with max_tr. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 72b171b90e558..d62248dfda760 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4040,8 +4040,7 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, * Reset the buffer so that it doesn't have incomparable timestamps. */ tracing_reset_online_cpus(&global_trace); - if (max_tr.buffer) - tracing_reset_online_cpus(&max_tr); + tracing_reset_online_cpus(&max_tr); mutex_unlock(&trace_types_lock); From 0f1ac8fd254b6c3e77950a1c4ee67be5dc88f7e0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 15 Jan 2013 22:11:19 -0500 Subject: [PATCH 08/25] tracing/lockdep: Disable lockdep first in entering NMI When function tracing with either debug locks enabled or tracing preempt disabled, the add_preempt_count() is traced. This is an issue with lockdep and function tracing. As function tracing can disable interrupts, and lockdep records that change, lockdep may not be able to handle this recursion if it happens from an NMI context. The first thing that an NMI does is: #define nmi_enter() \ do { \ ftrace_nmi_enter(); \ BUG_ON(in_nmi()); \ add_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ lockdep_off(); \ rcu_nmi_enter(); \ trace_hardirq_enter(); \ } while (0) When the add_preempt_count() is traced, and the tracing callback disables interrupts, it will jump into the lockdep code. There's some places in lockdep that can't handle this re-entrance, and causes lockdep to fail. As the lockdep_off() (and lockdep_on) is a simple: void lockdep_off(void) { current->lockdep_recursion++; } and is never traced, it can be called first in nmi_enter() and lockdep_on() last in nmi_exit(). Cc: Peter Zijlstra Signed-off-by: Steven Rostedt --- include/linux/hardirq.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 624ef3f45c8ef..57bfdce8fb907 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -180,10 +180,10 @@ extern void irq_exit(void); #define nmi_enter() \ do { \ + lockdep_off(); \ ftrace_nmi_enter(); \ BUG_ON(in_nmi()); \ add_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ - lockdep_off(); \ rcu_nmi_enter(); \ trace_hardirq_enter(); \ } while (0) @@ -192,10 +192,10 @@ extern void irq_exit(void); do { \ trace_hardirq_exit(); \ rcu_nmi_exit(); \ - lockdep_on(); \ BUG_ON(!in_nmi()); \ sub_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ ftrace_nmi_exit(); \ + lockdep_on(); \ } while (0) #endif /* LINUX_HARDIRQ_H */ From 8741db532e86da2e54f05be751bfe1922ca63d57 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 16 Jan 2013 10:49:37 -0500 Subject: [PATCH 09/25] tracing/fgraph: Add max_graph_depth to limit function_graph depth Add the file max_graph_depth to the debug tracing directory that lets the user define the depth of the function graph. A very useful operation is to set the depth to 1. Then it traces only the first function that is called when entering the kernel. This can be used to determine what system operations interrupt a process. For example, to work on NOHZ processes (single tasks running without a timer tick), if any interrupt goes off and preempts that task, this code will show it happening. # cd /sys/kernel/debug/tracing # echo 1 > max_graph_depth # echo function_graph > current_tracer # cat per_cpu/cpu//trace Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 60 +++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4edb4b74eb7e7..7008d2e13cf2c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -47,6 +47,8 @@ struct fgraph_data { #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 #define TRACE_GRAPH_PRINT_IRQS 0x40 +static unsigned int max_depth; + static struct tracer_opt trace_opts[] = { /* Display overruns? (for self-debug purpose) */ { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, @@ -250,8 +252,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) return 0; /* trace it when it is-nested-in or is a function enabled. */ - if (!(trace->depth || ftrace_graph_addr(trace->func)) || - ftrace_graph_ignore_irqs()) + if ((!(trace->depth || ftrace_graph_addr(trace->func)) || + ftrace_graph_ignore_irqs()) || + (max_depth && trace->depth >= max_depth)) return 0; local_irq_save(flags); @@ -1457,6 +1460,59 @@ static struct tracer graph_trace __read_mostly = { #endif }; + +static ssize_t +graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + max_depth = val; + + *ppos += cnt; + + return cnt; +} + +static ssize_t +graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/ + int n; + + n = sprintf(buf, "%d\n", max_depth); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); +} + +static const struct file_operations graph_depth_fops = { + .open = tracing_open_generic, + .write = graph_depth_write, + .read = graph_depth_read, + .llseek = generic_file_llseek, +}; + +static __init int init_graph_debugfs(void) +{ + struct dentry *d_tracer; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + trace_create_file("max_graph_depth", 0644, d_tracer, + NULL, &graph_depth_fops); + + return 0; +} +fs_initcall(init_graph_debugfs); + static __init int init_graph_trace(void) { max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); From 06aeaaeabf69da4a3e86df532425640f51b01cef Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 28 Sep 2012 17:15:17 +0900 Subject: [PATCH 10/25] ftrace: Move ARCH_SUPPORTS_FTRACE_SAVE_REGS in Kconfig Move SAVE_REGS support flag into Kconfig and rename it to CONFIG_DYNAMIC_FTRACE_WITH_REGS. This also introduces CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS which indicates the architecture depending part of ftrace has a code that saves full registers. On the other hand, CONFIG_DYNAMIC_FTRACE_WITH_REGS indicates the code is enabled. Link: http://lkml.kernel.org/r/20120928081516.3560.72534.stgit@ltc138.sdl.hitachi.co.jp Cc: Ingo Molnar Cc: Ananth N Mavinakayanahalli Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Frederic Weisbecker Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- arch/x86/Kconfig | 1 + arch/x86/include/asm/ftrace.h | 1 - include/linux/ftrace.h | 6 +++--- kernel/trace/Kconfig | 8 ++++++++ kernel/trace/ftrace.c | 6 +++--- kernel/trace/trace_selftest.c | 2 +- 6 files changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 79795af598105..996ccecc694ce 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -44,6 +44,7 @@ config X86 select HAVE_FENTRY if X86_64 select HAVE_C_RECORDMCOUNT select HAVE_DYNAMIC_FTRACE + select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_GRAPH_FP_TEST diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 9a25b522d3779..86cb51e1ca96a 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -44,7 +44,6 @@ #ifdef CONFIG_DYNAMIC_FTRACE #define ARCH_SUPPORTS_FTRACE_OPS 1 -#define ARCH_SUPPORTS_FTRACE_SAVE_REGS #endif #ifndef __ASSEMBLY__ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 92691d85c3206..e5ca8ef50e9bf 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -74,7 +74,7 @@ typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, * SAVE_REGS - The ftrace_ops wants regs saved at each function called * and passed to the callback. If this flag is set, but the * architecture does not support passing regs - * (ARCH_SUPPORTS_FTRACE_SAVE_REGS is not defined), then the + * (CONFIG_DYNAMIC_FTRACE_WITH_REGS is not defined), then the * ftrace_ops will fail to register, unless the next flag * is set. * SAVE_REGS_IF_SUPPORTED - This is the same as SAVE_REGS, but if the @@ -418,7 +418,7 @@ void ftrace_modify_all_code(int command); #endif #ifndef FTRACE_REGS_ADDR -#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS # define FTRACE_REGS_ADDR ((unsigned long)ftrace_regs_caller) #else # define FTRACE_REGS_ADDR FTRACE_ADDR @@ -480,7 +480,7 @@ extern int ftrace_make_nop(struct module *mod, */ extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); -#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS /** * ftrace_modify_call - convert from one addr to another (no nop) * @rec: the mcount call site record diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5d89335a485f7..cdc9d284d24e6 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -39,6 +39,9 @@ config HAVE_DYNAMIC_FTRACE help See Documentation/trace/ftrace-design.txt +config HAVE_DYNAMIC_FTRACE_WITH_REGS + bool + config HAVE_FTRACE_MCOUNT_RECORD bool help @@ -434,6 +437,11 @@ config DYNAMIC_FTRACE were made. If so, it runs stop_machine (stops all CPUS) and modifies the code to jump over the call to ftrace. +config DYNAMIC_FTRACE_WITH_REGS + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_DYNAMIC_FTRACE_WITH_REGS + config FUNCTION_PROFILER bool "Kernel function profiler" depends on FUNCTION_TRACER diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 41473b4ad7a47..6e34dc162fe1f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -337,7 +337,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) return -EINVAL; -#ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS +#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS /* * If the ftrace_ops specifies SAVE_REGS, then it only can be used * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. @@ -4143,8 +4143,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, * Archs are to support both the regs and ftrace_ops at the same time. * If they support ftrace_ops, it is assumed they support regs. * If call backs want to use regs, they must either check for regs - * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS. - * Note, ARCH_SUPPORT_SAVE_REGS expects a full regs to be saved. + * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS. + * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved. * An architecture can pass partial regs with ftrace_ops and still * set the ARCH_SUPPORT_FTARCE_OPS. */ diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 47623169a815d..6c62d58d8e875 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -568,7 +568,7 @@ trace_selftest_function_regs(void) int ret; int supported = 0; -#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS supported = 1; #endif From e7dbfe349d12eabb7783b117e0c115f6f3d9ef9e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 28 Sep 2012 17:15:20 +0900 Subject: [PATCH 11/25] kprobes/x86: Move ftrace-based kprobe code into kprobes-ftrace.c Split ftrace-based kprobes code from kprobes, and introduce CONFIG_(HAVE_)KPROBES_ON_FTRACE Kconfig flags. For the cleanup reason, this also moves kprobe_ftrace check into skip_singlestep. Link: http://lkml.kernel.org/r/20120928081520.3560.25624.stgit@ltc138.sdl.hitachi.co.jp Cc: Ingo Molnar Cc: Ananth N Mavinakayanahalli Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Frederic Weisbecker Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- arch/Kconfig | 12 +++++ arch/x86/Kconfig | 1 + arch/x86/kernel/Makefile | 1 + arch/x86/kernel/kprobes-common.h | 11 ++++ arch/x86/kernel/kprobes-ftrace.c | 93 ++++++++++++++++++++++++++++++++ arch/x86/kernel/kprobes.c | 70 +----------------------- include/linux/kprobes.h | 12 +---- kernel/kprobes.c | 8 +-- 8 files changed, 125 insertions(+), 83 deletions(-) create mode 100644 arch/x86/kernel/kprobes-ftrace.c diff --git a/arch/Kconfig b/arch/Kconfig index 7f8f281f2585e..97fb7d0365d15 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -76,6 +76,15 @@ config OPTPROBES depends on KPROBES && HAVE_OPTPROBES depends on !PREEMPT +config KPROBES_ON_FTRACE + def_bool y + depends on KPROBES && HAVE_KPROBES_ON_FTRACE + depends on DYNAMIC_FTRACE_WITH_REGS + help + If function tracer is enabled and the arch supports full + passing of pt_regs to function tracing, then kprobes can + optimize on top of function tracing. + config UPROBES bool "Transparent user-space probes (EXPERIMENTAL)" depends on UPROBE_EVENT && PERF_EVENTS @@ -158,6 +167,9 @@ config HAVE_KRETPROBES config HAVE_OPTPROBES bool +config HAVE_KPROBES_ON_FTRACE + bool + config HAVE_NMI_WATCHDOG bool # diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 996ccecc694ce..be8b2b3ab9797 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -40,6 +40,7 @@ config X86 select HAVE_DMA_CONTIGUOUS if !SWIOTLB select HAVE_KRETPROBES select HAVE_OPTPROBES + select HAVE_KPROBES_ON_FTRACE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FENTRY if X86_64 select HAVE_C_RECORDMCOUNT diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 34e923a537628..cc5d31f8830ca 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -67,6 +67,7 @@ obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_OPTPROBES) += kprobes-opt.o +obj-$(CONFIG_KPROBES_ON_FTRACE) += kprobes-ftrace.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o diff --git a/arch/x86/kernel/kprobes-common.h b/arch/x86/kernel/kprobes-common.h index 3230b68ef29ae..2e9d4b5af036a 100644 --- a/arch/x86/kernel/kprobes-common.h +++ b/arch/x86/kernel/kprobes-common.h @@ -99,4 +99,15 @@ static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsig return addr; } #endif + +#ifdef CONFIG_KPROBES_ON_FTRACE +extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb); +#else +static inline int skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + return 0; +} +#endif #endif diff --git a/arch/x86/kernel/kprobes-ftrace.c b/arch/x86/kernel/kprobes-ftrace.c new file mode 100644 index 0000000000000..70a81c7aa0a70 --- /dev/null +++ b/arch/x86/kernel/kprobes-ftrace.c @@ -0,0 +1,93 @@ +/* + * Dynamic Ftrace based Kprobes Optimization + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) Hitachi Ltd., 2012 + */ +#include +#include +#include +#include +#include + +#include "kprobes-common.h" + +static int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + /* + * Emulate singlestep (and also recover regs->ip) + * as if there is a 5byte nop + */ + regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; + if (unlikely(p->post_handler)) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + p->post_handler(p, regs, 0); + } + __this_cpu_write(current_kprobe, NULL); + return 1; +} + +int __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + if (kprobe_ftrace(p)) + return __skip_singlestep(p, regs, kcb); + else + return 0; +} + +/* Ftrace callback handler for kprobes */ +void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct pt_regs *regs) +{ + struct kprobe *p; + struct kprobe_ctlblk *kcb; + unsigned long flags; + + /* Disable irq for emulating a breakpoint and avoiding preempt */ + local_irq_save(flags); + + p = get_kprobe((kprobe_opcode_t *)ip); + if (unlikely(!p) || kprobe_disabled(p)) + goto end; + + kcb = get_kprobe_ctlblk(); + if (kprobe_running()) { + kprobes_inc_nmissed_count(p); + } else { + /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ + regs->ip = ip + sizeof(kprobe_opcode_t); + + __this_cpu_write(current_kprobe, p); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + if (!p->pre_handler || !p->pre_handler(p, regs)) + __skip_singlestep(p, regs, kcb); + /* + * If pre_handler returns !0, it sets regs->ip and + * resets current kprobe. + */ + } +end: + local_irq_restore(flags); +} + +int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p) +{ + p->ainsn.insn = NULL; + p->ainsn.boostable = -1; + return 0; +} diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 57916c0d3cf6c..18114bfb10f34 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -541,23 +541,6 @@ reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb return 1; } -#ifdef KPROBES_CAN_USE_FTRACE -static void __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) -{ - /* - * Emulate singlestep (and also recover regs->ip) - * as if there is a 5byte nop - */ - regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; - if (unlikely(p->post_handler)) { - kcb->kprobe_status = KPROBE_HIT_SSDONE; - p->post_handler(p, regs, 0); - } - __this_cpu_write(current_kprobe, NULL); -} -#endif - /* * Interrupts are disabled on entry as trap3 is an interrupt gate and they * remain disabled throughout this function. @@ -616,13 +599,8 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) } else if (kprobe_running()) { p = __this_cpu_read(current_kprobe); if (p->break_handler && p->break_handler(p, regs)) { -#ifdef KPROBES_CAN_USE_FTRACE - if (kprobe_ftrace(p)) { - skip_singlestep(p, regs, kcb); - return 1; - } -#endif - setup_singlestep(p, regs, kcb, 0); + if (!skip_singlestep(p, regs, kcb)) + setup_singlestep(p, regs, kcb, 0); return 1; } } /* else: not a kprobe fault; let the kernel handle it */ @@ -1075,50 +1053,6 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) return 0; } -#ifdef KPROBES_CAN_USE_FTRACE -/* Ftrace callback handler for kprobes */ -void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) -{ - struct kprobe *p; - struct kprobe_ctlblk *kcb; - unsigned long flags; - - /* Disable irq for emulating a breakpoint and avoiding preempt */ - local_irq_save(flags); - - p = get_kprobe((kprobe_opcode_t *)ip); - if (unlikely(!p) || kprobe_disabled(p)) - goto end; - - kcb = get_kprobe_ctlblk(); - if (kprobe_running()) { - kprobes_inc_nmissed_count(p); - } else { - /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ - regs->ip = ip + sizeof(kprobe_opcode_t); - - __this_cpu_write(current_kprobe, p); - kcb->kprobe_status = KPROBE_HIT_ACTIVE; - if (!p->pre_handler || !p->pre_handler(p, regs)) - skip_singlestep(p, regs, kcb); - /* - * If pre_handler returns !0, it sets regs->ip and - * resets current kprobe. - */ - } -end: - local_irq_restore(flags); -} - -int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p) -{ - p->ainsn.insn = NULL; - p->ainsn.boostable = -1; - return 0; -} -#endif - int __init arch_init_kprobes(void) { return arch_init_optprobes(); diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 23755ba42abc4..4b6ef4d33cc26 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -49,16 +49,6 @@ #define KPROBE_REENTER 0x00000004 #define KPROBE_HIT_SSDONE 0x00000008 -/* - * If function tracer is enabled and the arch supports full - * passing of pt_regs to function tracing, then kprobes can - * optimize on top of function tracing. - */ -#if defined(CONFIG_FUNCTION_TRACER) && defined(ARCH_SUPPORTS_FTRACE_SAVE_REGS) \ - && defined(ARCH_SUPPORTS_KPROBES_ON_FTRACE) -# define KPROBES_CAN_USE_FTRACE -#endif - /* Attach to insert probes on any functions which should be ignored*/ #define __kprobes __attribute__((__section__(".kprobes.text"))) @@ -316,7 +306,7 @@ extern int proc_kprobes_optimization_handler(struct ctl_table *table, #endif #endif /* CONFIG_OPTPROBES */ -#ifdef KPROBES_CAN_USE_FTRACE +#ifdef CONFIG_KPROBES_ON_FTRACE extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *regs); extern int arch_prepare_kprobe_ftrace(struct kprobe *p); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 098f396aa4098..f423c3ef4a829 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -919,7 +919,7 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) } #endif /* CONFIG_OPTPROBES */ -#ifdef KPROBES_CAN_USE_FTRACE +#ifdef CONFIG_KPROBES_ON_FTRACE static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { .func = kprobe_ftrace_handler, .flags = FTRACE_OPS_FL_SAVE_REGS, @@ -964,7 +964,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p) (unsigned long)p->addr, 1, 0); WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); } -#else /* !KPROBES_CAN_USE_FTRACE */ +#else /* !CONFIG_KPROBES_ON_FTRACE */ #define prepare_kprobe(p) arch_prepare_kprobe(p) #define arm_kprobe_ftrace(p) do {} while (0) #define disarm_kprobe_ftrace(p) do {} while (0) @@ -1414,12 +1414,12 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p, */ ftrace_addr = ftrace_location((unsigned long)p->addr); if (ftrace_addr) { -#ifdef KPROBES_CAN_USE_FTRACE +#ifdef CONFIG_KPROBES_ON_FTRACE /* Given address is not on the instruction boundary */ if ((unsigned long)p->addr != ftrace_addr) return -EILSEQ; p->flags |= KPROBE_FLAG_FTRACE; -#else /* !KPROBES_CAN_USE_FTRACE */ +#else /* !CONFIG_KPROBES_ON_FTRACE */ return -EINVAL; #endif } From f684199f5de805ac50ea5bdec2b082882586a777 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 28 Sep 2012 17:15:22 +0900 Subject: [PATCH 12/25] kprobes/x86: Move kprobes stuff under arch/x86/kernel/kprobes/ Move arch-dep kprobes stuff under arch/x86/kernel/kprobes. Link: http://lkml.kernel.org/r/20120928081522.3560.75469.stgit@ltc138.sdl.hitachi.co.jp Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Ananth N Mavinakayanahalli Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Masami Hiramatsu [ fixed whitespace and s/__attribute__((packed))/__packed/ ] Signed-off-by: Steven Rostedt --- arch/x86/kernel/Makefile | 4 +--- arch/x86/kernel/kprobes/Makefile | 7 +++++++ arch/x86/kernel/{kprobes-common.h => kprobes/common.h} | 0 arch/x86/kernel/{kprobes.c => kprobes/core.c} | 6 +++--- arch/x86/kernel/{kprobes-ftrace.c => kprobes/ftrace.c} | 2 +- arch/x86/kernel/{kprobes-opt.c => kprobes/opt.c} | 2 +- 6 files changed, 13 insertions(+), 8 deletions(-) create mode 100644 arch/x86/kernel/kprobes/Makefile rename arch/x86/kernel/{kprobes-common.h => kprobes/common.h} (100%) rename arch/x86/kernel/{kprobes.c => kprobes/core.c} (99%) rename arch/x86/kernel/{kprobes-ftrace.c => kprobes/ftrace.c} (98%) rename arch/x86/kernel/{kprobes-opt.c => kprobes/opt.c} (99%) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index cc5d31f8830ca..ac3b3d0028330 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -65,9 +65,7 @@ obj-$(CONFIG_X86_TSC) += trace_clock.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o -obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_OPTPROBES) += kprobes-opt.o -obj-$(CONFIG_KPROBES_ON_FTRACE) += kprobes-ftrace.o +obj-y += kprobes/ obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o diff --git a/arch/x86/kernel/kprobes/Makefile b/arch/x86/kernel/kprobes/Makefile new file mode 100644 index 0000000000000..0d33169cc1a27 --- /dev/null +++ b/arch/x86/kernel/kprobes/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for kernel probes +# + +obj-$(CONFIG_KPROBES) += core.o +obj-$(CONFIG_OPTPROBES) += opt.o +obj-$(CONFIG_KPROBES_ON_FTRACE) += ftrace.o diff --git a/arch/x86/kernel/kprobes-common.h b/arch/x86/kernel/kprobes/common.h similarity index 100% rename from arch/x86/kernel/kprobes-common.h rename to arch/x86/kernel/kprobes/common.h diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes/core.c similarity index 99% rename from arch/x86/kernel/kprobes.c rename to arch/x86/kernel/kprobes/core.c index 18114bfb10f34..e124554598eee 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes/core.c @@ -58,7 +58,7 @@ #include #include -#include "kprobes-common.h" +#include "common.h" void jprobe_return_end(void); @@ -78,7 +78,7 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); * Groups, and some special opcodes can not boost. * This is non-const and volatile to keep gcc from statically * optimizing it out, as variable_test_bit makes gcc think only - * *(unsigned long*) is used. + * *(unsigned long*) is used. */ static volatile u32 twobyte_is_boostable[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ @@ -117,7 +117,7 @@ static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) struct __arch_relative_insn { u8 op; s32 raddr; - } __attribute__((packed)) *insn; + } __packed *insn; insn = (struct __arch_relative_insn *)from; insn->raddr = (s32)((long)(to) - ((long)(from) + 5)); diff --git a/arch/x86/kernel/kprobes-ftrace.c b/arch/x86/kernel/kprobes/ftrace.c similarity index 98% rename from arch/x86/kernel/kprobes-ftrace.c rename to arch/x86/kernel/kprobes/ftrace.c index 70a81c7aa0a70..23ef5c556f061 100644 --- a/arch/x86/kernel/kprobes-ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -23,7 +23,7 @@ #include #include -#include "kprobes-common.h" +#include "common.h" static int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) diff --git a/arch/x86/kernel/kprobes-opt.c b/arch/x86/kernel/kprobes/opt.c similarity index 99% rename from arch/x86/kernel/kprobes-opt.c rename to arch/x86/kernel/kprobes/opt.c index c5e410eed4039..76dc6f0957246 100644 --- a/arch/x86/kernel/kprobes-opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -37,7 +37,7 @@ #include #include -#include "kprobes-common.h" +#include "common.h" unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr) { From b000c8065a92b0fe0e1694f41b2c8d8ba7b7b1ec Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 18 Jan 2013 10:31:20 -0500 Subject: [PATCH 13/25] tracing: Remove the extra 4 bytes of padding in events Due to a userspace issue with PowerTop v2beta, which hardcoded the offset of event fields that it was using, it broke when we removed the Big Kernel Lock counter from the event header. (commit e6e1e2593 "tracing: Remove lock_depth from event entry") Because this broke userspace, it was determined that we must keep those 4 bytes around. (commit a3a4a5acd "Regression: partial revert "tracing: Remove lock_depth from event entry"") This unfortunately wastes space in the ring buffer. 4 bytes per event, where a lot of events are just 24 bytes. That's 16% of the buffer wasted. A million events will add 4 megs of white space into the buffer. It was later noticed that PowerTop v2beta could not work on systems where the kernel was 64 bit but the userspace was 32 bits. The reason was because the offsets are different between the two and the hard coded offset of one would not work with the other. With PowerTop v2 final, it implemented the same interface that both perf and trace-cmd use. That is, it reads the format file of the event to find the offsets of the fields it needs. This fixes the problem with running powertop on a 32 bit userspace running on a 64 bit kernel. It also no longer requires the 4 byte padding. As PowerTop v2 has been out for a while, and is included in all major distributions, it is time that we can safely remove the 4 bytes of padding. Users of PowerTop v2beta should upgrade to PowerTop v2 final. Cc: Linus Torvalds Acked-by: Arjan van de Ven Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 - kernel/trace/trace.c | 1 - kernel/trace/trace_events.c | 1 - 3 files changed, 3 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 43ef8b6cce7fb..6f8d0b77006b2 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -49,7 +49,6 @@ struct trace_entry { unsigned char flags; unsigned char preempt_count; int pid; - int padding; }; #define FTRACE_MAX_EVENT \ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d62248dfda760..a387bd271e711 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1173,7 +1173,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; - entry->padding = 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 880073d0b946f..57e9b284250c4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -116,7 +116,6 @@ static int trace_define_common_fields(void) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); - __common_field(int, padding); return ret; } From 0a71e4c6d749d06f52e75a406fc9046924fcfcc1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 22 Jan 2013 12:06:56 -0500 Subject: [PATCH 14/25] tracing: Remove trace.h header from trace_clock.c As trace_clock is used by other things besides tracing, and it does not require anything from trace.h, it is best not to include the header file in trace_clock.c. Signed-off-by: Steven Rostedt --- kernel/trace/trace_clock.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 394783531cbb7..22b638b28e487 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -21,8 +21,6 @@ #include #include -#include "trace.h" - /* * trace_clock_local(): the simplest and least coherent tracing clock. * From 34600f0e9c33c9cd48ae87448205f51332b7d5a0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 22 Jan 2013 13:35:11 -0500 Subject: [PATCH 15/25] tracing: Fix race with max_tr and changing tracers There's a race condition between the setting of a new tracer and the update of the max trace buffers (the swap). When a new tracer is added, it sets current_trace to nop_trace before disabling the old tracer. At this moment, if the old tracer uses update_max_tr(), the update may trigger the warning against !current_trace->use_max-tr, as nop_trace doesn't have that set. As update_max_tr() requires that interrupts be disabled, we can add a check to see if current_trace == nop_trace and bail if it does. Then when disabling the current_trace, set it to nop_trace and run synchronize_sched(). This will make sure all calls to update_max_tr() have completed (it was called with interrupts disabled). As a clean up, this commit also removes shrinking and recreating the max_tr buffer if the old and new tracers both have use_max_tr set. The old way use to always shrink the buffer, and then expand it for the next tracer. This is a waste of time. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a387bd271e711..d2a658349ca16 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -709,10 +709,14 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); - if (!current_trace->use_max_tr) { - WARN_ON_ONCE(1); + + /* If we disabled the tracer, stop now */ + if (current_trace == &nop_trace) return; - } + + if (WARN_ON_ONCE(!current_trace->use_max_tr)) + return; + arch_spin_lock(&ftrace_max_lock); tr->buffer = max_tr.buffer; @@ -3185,6 +3189,7 @@ static int tracing_set_tracer(const char *buf) static struct trace_option_dentry *topts; struct trace_array *tr = &global_trace; struct tracer *t; + bool had_max_tr; int ret = 0; mutex_lock(&trace_types_lock); @@ -3211,7 +3216,19 @@ static int tracing_set_tracer(const char *buf) trace_branch_disable(); if (current_trace && current_trace->reset) current_trace->reset(tr); - if (current_trace && current_trace->use_max_tr) { + + had_max_tr = current_trace && current_trace->use_max_tr; + current_trace = &nop_trace; + + if (had_max_tr && !t->use_max_tr) { + /* + * We need to make sure that the update_max_tr sees that + * current_trace changed to nop_trace to keep it from + * swapping the buffers after we resize it. + * The update_max_tr is called from interrupts disabled + * so a synchronized_sched() is sufficient. + */ + synchronize_sched(); /* * We don't free the ring buffer. instead, resize it because * The max_tr ring buffer has some state (e.g. ring->clock) and @@ -3222,10 +3239,8 @@ static int tracing_set_tracer(const char *buf) } destroy_trace_option_files(topts); - current_trace = &nop_trace; - topts = create_trace_option_files(t); - if (t->use_max_tr) { + if (t->use_max_tr && !had_max_tr) { /* we need to make per cpu buffer sizes equivalent */ ret = resize_buffer_duplicate_size(&max_tr, &global_trace, RING_BUFFER_ALL_CPUS); From 05cbbf643b8eea1be21082c53cdb856d1dc6d765 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 22 Jan 2013 23:35:11 -0500 Subject: [PATCH 16/25] tracing: Fix selftest function recursion accounting The test that checks function recursion does things differently if the arch does not support all ftrace features. But that really doesn't make a difference with how the test runs, and either way the count variable should be 2 at the end. Currently the test wrongly fails for archs that don't support all the ftrace features. Signed-off-by: Steven Rostedt --- kernel/trace/trace_selftest.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 6c62d58d8e875..adb008a0136f8 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -452,7 +452,6 @@ trace_selftest_function_recursion(void) char *func_name; int len; int ret; - int cnt; /* The previous test PASSED */ pr_cont("PASSED\n"); @@ -510,19 +509,10 @@ trace_selftest_function_recursion(void) unregister_ftrace_function(&test_recsafe_probe); - /* - * If arch supports all ftrace features, and no other task - * was on the list, we should be fine. - */ - if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC) - cnt = 2; /* Should have recursed */ - else - cnt = 1; - ret = -1; - if (trace_selftest_recursion_cnt != cnt) { - pr_cont("*callback not called expected %d times (%d)* ", - cnt, trace_selftest_recursion_cnt); + if (trace_selftest_recursion_cnt != 2) { + pr_cont("*callback not called expected 2 times (%d)* ", + trace_selftest_recursion_cnt); goto out; } From 6350379452ccaeaa71734adf57dec2ebc9207849 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 16:58:56 -0400 Subject: [PATCH 17/25] ftrace: Fix global function tracers that are not recursion safe If one of the function tracers set by the global ops is not recursion safe, it can still be called directly without the added recursion supplied by the ftrace infrastructure. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6e34dc162fe1f..789cbec24e813 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -221,10 +221,24 @@ static void update_global_ops(void) * registered callers. */ if (ftrace_global_list == &ftrace_list_end || - ftrace_global_list->next == &ftrace_list_end) + ftrace_global_list->next == &ftrace_list_end) { func = ftrace_global_list->func; - else + /* + * As we are calling the function directly. + * If it does not have recursion protection, + * the function_trace_op needs to be updated + * accordingly. + */ + if (ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) + global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; + else + global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE; + } else { func = ftrace_global_list_func; + /* The list has its own recursion protection. */ + global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; + } + /* If we filter on pids, update to use the pid function */ if (!list_empty(&ftrace_pids)) { From 9640388b63556b4cfecbb5aaf91a5c99d272f429 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 17:01:20 -0400 Subject: [PATCH 18/25] ftrace: Fix function tracing recursion self test The function tracing recursion self test should not crash the machine if the resursion test fails. If it detects that the function tracing is recursing when it should not be, then bail, don't go into an infinite recursive loop. Signed-off-by: Steven Rostedt --- kernel/trace/trace_selftest.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index adb008a0136f8..51c819c12c291 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -415,7 +415,8 @@ static void trace_selftest_test_recursion_func(unsigned long ip, * The ftrace infrastructure should provide the recursion * protection. If not, this will crash the kernel! */ - trace_selftest_recursion_cnt++; + if (trace_selftest_recursion_cnt++ > 10) + return; DYN_FTRACE_TEST_NAME(); } From 0a016409e42f273415f8225ddf2c58eb2df88034 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 17:03:03 -0400 Subject: [PATCH 19/25] ftrace: Optimize the function tracer list loop There is lots of places that perform: op = rcu_dereference_raw(ftrace_control_list); while (op != &ftrace_list_end) { Add a helper macro to do this, and also optimize for a single entity. That is, gcc will optimize a loop for either no iterations or more than one iteration. But usually only a single callback is registered to the function tracer, thus the optimized case should be a single pass. to do this we now do: op = rcu_dereference_raw(list); do { [...] } while (likely(op = rcu_dereference_raw((op)->next)) && unlikely((op) != &ftrace_list_end)); An op is always registered (ftrace_list_end when no callbacks is registered), thus when a single callback is registered, the link list looks like: top => callback => ftrace_list_end => NULL. The likely(op = op->next) still must be performed due to the race of removing the callback, where the first op assignment could equal ftrace_list_end. In that case, the op->next would be NULL. But this is unlikely (only happens in a race condition when removing the callback). But it is very likely that the next op would be ftrace_list_end, unless more than one callback has been registered. This tells gcc what the most common case is and makes the fast path with the least amount of branches. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 48 +++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 789cbec24e813..1330969d84477 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -111,6 +111,26 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); #define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) #endif +/* + * Traverse the ftrace_global_list, invoking all entries. The reason that we + * can use rcu_dereference_raw() is that elements removed from this list + * are simply leaked, so there is no need to interact with a grace-period + * mechanism. The rcu_dereference_raw() calls are needed to handle + * concurrent insertions into the ftrace_global_list. + * + * Silly Alpha and silly pointer-speculation compiler optimizations! + */ +#define do_for_each_ftrace_op(op, list) \ + op = rcu_dereference_raw(list); \ + do + +/* + * Optimized for just a single item in the list (as that is the normal case). + */ +#define while_for_each_ftrace_op(op) \ + while (likely(op = rcu_dereference_raw((op)->next)) && \ + unlikely((op) != &ftrace_list_end)) + /** * ftrace_nr_registered_ops - return number of ops registered * @@ -132,15 +152,6 @@ int ftrace_nr_registered_ops(void) return cnt; } -/* - * Traverse the ftrace_global_list, invoking all entries. The reason that we - * can use rcu_dereference_raw() is that elements removed from this list - * are simply leaked, so there is no need to interact with a grace-period - * mechanism. The rcu_dereference_raw() calls are needed to handle - * concurrent insertions into the ftrace_global_list. - * - * Silly Alpha and silly pointer-speculation compiler optimizations! - */ static void ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) @@ -149,11 +160,9 @@ ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, return; trace_recursion_set(TRACE_GLOBAL_BIT); - op = rcu_dereference_raw(ftrace_global_list); /*see above*/ - while (op != &ftrace_list_end) { + do_for_each_ftrace_op(op, ftrace_global_list) { op->func(ip, parent_ip, op, regs); - op = rcu_dereference_raw(op->next); /*see above*/ - }; + } while_for_each_ftrace_op(op); trace_recursion_clear(TRACE_GLOBAL_BIT); } @@ -4104,14 +4113,11 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, */ preempt_disable_notrace(); trace_recursion_set(TRACE_CONTROL_BIT); - op = rcu_dereference_raw(ftrace_control_list); - while (op != &ftrace_list_end) { + do_for_each_ftrace_op(op, ftrace_control_list) { if (!ftrace_function_local_disabled(op) && ftrace_ops_test(op, ip)) op->func(ip, parent_ip, op, regs); - - op = rcu_dereference_raw(op->next); - }; + } while_for_each_ftrace_op(op); trace_recursion_clear(TRACE_CONTROL_BIT); preempt_enable_notrace(); } @@ -4139,12 +4145,10 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, * they must be freed after a synchronize_sched(). */ preempt_disable_notrace(); - op = rcu_dereference_raw(ftrace_ops_list); - while (op != &ftrace_list_end) { + do_for_each_ftrace_op(op, ftrace_ops_list) { if (ftrace_ops_test(op, ip)) op->func(ip, parent_ip, op, regs); - op = rcu_dereference_raw(op->next); - }; + } while_for_each_ftrace_op(op); preempt_enable_notrace(); trace_recursion_clear(TRACE_INTERNAL_BIT); } From c29f122cd7fc178b72b1335b1fce0dff2e5c0f5d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 17:17:59 -0400 Subject: [PATCH 20/25] ftrace: Add context level recursion bit checking Currently for recursion checking in the function tracer, ftrace tests a task_struct bit to determine if the function tracer had recursed or not. If it has, then it will will return without going further. But this leads to races. If an interrupt came in after the bit was set, the functions being traced would see that bit set and think that the function tracer recursed on itself, and would return. Instead add a bit for each context (normal, softirq, irq and nmi). A check of which context the task is in is made before testing the associated bit. Now if an interrupt preempts the function tracer after the previous context has been set, the interrupt functions can still be traced. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 40 +++++++++++++++++++++++++++++++++------- kernel/trace/trace.h | 12 +++++++++--- 2 files changed, 42 insertions(+), 10 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1330969d84477..639b6ab1f04c5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -156,14 +156,27 @@ static void ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { - if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) + int bit; + + if (in_interrupt()) { + if (in_nmi()) + bit = TRACE_GLOBAL_NMI_BIT; + + else if (in_irq()) + bit = TRACE_GLOBAL_IRQ_BIT; + else + bit = TRACE_GLOBAL_SIRQ_BIT; + } else + bit = TRACE_GLOBAL_BIT; + + if (unlikely(trace_recursion_test(bit))) return; - trace_recursion_set(TRACE_GLOBAL_BIT); + trace_recursion_set(bit); do_for_each_ftrace_op(op, ftrace_global_list) { op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); - trace_recursion_clear(TRACE_GLOBAL_BIT); + trace_recursion_clear(bit); } static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, @@ -4132,14 +4145,27 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ignored, struct pt_regs *regs) { struct ftrace_ops *op; + unsigned int bit; if (function_trace_stop) return; - if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) - return; + if (in_interrupt()) { + if (in_nmi()) + bit = TRACE_INTERNAL_NMI_BIT; + + else if (in_irq()) + bit = TRACE_INTERNAL_IRQ_BIT; + else + bit = TRACE_INTERNAL_SIRQ_BIT; + } else + bit = TRACE_INTERNAL_BIT; + + if (unlikely(trace_recursion_test(bit))) + return; + + trace_recursion_set(bit); - trace_recursion_set(TRACE_INTERNAL_BIT); /* * Some of the ops may be dynamically allocated, * they must be freed after a synchronize_sched(). @@ -4150,7 +4176,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); preempt_enable_notrace(); - trace_recursion_clear(TRACE_INTERNAL_BIT); + trace_recursion_clear(bit); } /* diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c75d7988902ca..fe6ccff9cc705 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -299,8 +299,14 @@ struct tracer { /* for function tracing recursion */ #define TRACE_INTERNAL_BIT (1<<11) -#define TRACE_GLOBAL_BIT (1<<12) -#define TRACE_CONTROL_BIT (1<<13) +#define TRACE_INTERNAL_NMI_BIT (1<<12) +#define TRACE_INTERNAL_IRQ_BIT (1<<13) +#define TRACE_INTERNAL_SIRQ_BIT (1<<14) +#define TRACE_GLOBAL_BIT (1<<15) +#define TRACE_GLOBAL_NMI_BIT (1<<16) +#define TRACE_GLOBAL_IRQ_BIT (1<<17) +#define TRACE_GLOBAL_SIRQ_BIT (1<<18) +#define TRACE_CONTROL_BIT (1<<19) /* * Abuse of the trace_recursion. @@ -309,7 +315,7 @@ struct tracer { * was called in irq context but we have irq tracing off. Since this * can only be modified by current, we can reuse trace_recursion. */ -#define TRACE_IRQ_BIT (1<<13) +#define TRACE_IRQ_BIT (1<<20) #define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) #define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) From e46cbf75c621725964fe1f6e7013e8bcd86a0e3d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 17:32:25 -0400 Subject: [PATCH 21/25] tracing: Make the trace recursion bits into enums Convert the bits into enums which makes the code a little easier to maintain. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fe6ccff9cc705..5a095d6f088dc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -298,15 +298,18 @@ struct tracer { #define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) /* for function tracing recursion */ -#define TRACE_INTERNAL_BIT (1<<11) -#define TRACE_INTERNAL_NMI_BIT (1<<12) -#define TRACE_INTERNAL_IRQ_BIT (1<<13) -#define TRACE_INTERNAL_SIRQ_BIT (1<<14) -#define TRACE_GLOBAL_BIT (1<<15) -#define TRACE_GLOBAL_NMI_BIT (1<<16) -#define TRACE_GLOBAL_IRQ_BIT (1<<17) -#define TRACE_GLOBAL_SIRQ_BIT (1<<18) -#define TRACE_CONTROL_BIT (1<<19) +enum { + TRACE_INTERNAL_BIT = 11, + TRACE_INTERNAL_NMI_BIT, + TRACE_INTERNAL_IRQ_BIT, + TRACE_INTERNAL_SIRQ_BIT, + + TRACE_GLOBAL_BIT, + TRACE_GLOBAL_NMI_BIT, + TRACE_GLOBAL_IRQ_BIT, + TRACE_GLOBAL_SIRQ_BIT, + + TRACE_CONTROL_BIT, /* * Abuse of the trace_recursion. @@ -315,11 +318,12 @@ struct tracer { * was called in irq context but we have irq tracing off. Since this * can only be modified by current, we can reuse trace_recursion. */ -#define TRACE_IRQ_BIT (1<<20) + TRACE_IRQ_BIT, +}; -#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) -#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) -#define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) +#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) +#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) +#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) #define TRACE_PIPE_ALL_CPU -1 From edc15cafcbfa3d73f819cae99885a2e35e4cbce5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 17:47:21 -0400 Subject: [PATCH 22/25] tracing: Avoid unnecessary multiple recursion checks When function tracing occurs, the following steps are made: If arch does not support a ftrace feature: call internal function (uses INTERNAL bits) which calls... If callback is registered to the "global" list, the list function is called and recursion checks the GLOBAL bits. then this function calls... The function callback, which can use the FTRACE bits to check for recursion. Now if the arch does not suppport a feature, and it calls the global list function which calls the ftrace callback all three of these steps will do a recursion protection. There's no reason to do one if the previous caller already did. The recursion that we are protecting against will go through the same steps again. To prevent the multiple recursion checks, if a recursion bit is set that is higher than the MAX bit of the current check, then we know that the check was made by the previous caller, and we can skip the current check. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 40 ++++------------ kernel/trace/trace.h | 106 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 110 insertions(+), 36 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 639b6ab1f04c5..ce8c3d68292f7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -158,25 +158,15 @@ ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, { int bit; - if (in_interrupt()) { - if (in_nmi()) - bit = TRACE_GLOBAL_NMI_BIT; - - else if (in_irq()) - bit = TRACE_GLOBAL_IRQ_BIT; - else - bit = TRACE_GLOBAL_SIRQ_BIT; - } else - bit = TRACE_GLOBAL_BIT; - - if (unlikely(trace_recursion_test(bit))) + bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX); + if (bit < 0) return; - trace_recursion_set(bit); do_for_each_ftrace_op(op, ftrace_global_list) { op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); - trace_recursion_clear(bit); + + trace_clear_recursion(bit); } static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, @@ -4145,26 +4135,14 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ignored, struct pt_regs *regs) { struct ftrace_ops *op; - unsigned int bit; + int bit; if (function_trace_stop) return; - if (in_interrupt()) { - if (in_nmi()) - bit = TRACE_INTERNAL_NMI_BIT; - - else if (in_irq()) - bit = TRACE_INTERNAL_IRQ_BIT; - else - bit = TRACE_INTERNAL_SIRQ_BIT; - } else - bit = TRACE_INTERNAL_BIT; - - if (unlikely(trace_recursion_test(bit))) - return; - - trace_recursion_set(bit); + bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); + if (bit < 0) + return; /* * Some of the ops may be dynamically allocated, @@ -4176,7 +4154,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); preempt_enable_notrace(); - trace_recursion_clear(bit); + trace_clear_recursion(bit); } /* diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5a095d6f088dc..c203a51dd4121 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -297,18 +297,49 @@ struct tracer { /* Ring buffer has the 10 LSB bits to count */ #define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) -/* for function tracing recursion */ +/* + * For function tracing recursion: + * The order of these bits are important. + * + * When function tracing occurs, the following steps are made: + * If arch does not support a ftrace feature: + * call internal function (uses INTERNAL bits) which calls... + * If callback is registered to the "global" list, the list + * function is called and recursion checks the GLOBAL bits. + * then this function calls... + * The function callback, which can use the FTRACE bits to + * check for recursion. + * + * Now if the arch does not suppport a feature, and it calls + * the global list function which calls the ftrace callback + * all three of these steps will do a recursion protection. + * There's no reason to do one if the previous caller already + * did. The recursion that we are protecting against will + * go through the same steps again. + * + * To prevent the multiple recursion checks, if a recursion + * bit is set that is higher than the MAX bit of the current + * check, then we know that the check was made by the previous + * caller, and we can skip the current check. + */ enum { - TRACE_INTERNAL_BIT = 11, - TRACE_INTERNAL_NMI_BIT, - TRACE_INTERNAL_IRQ_BIT, - TRACE_INTERNAL_SIRQ_BIT, + TRACE_FTRACE_BIT = 11, + TRACE_FTRACE_NMI_BIT, + TRACE_FTRACE_IRQ_BIT, + TRACE_FTRACE_SIRQ_BIT, + /* GLOBAL_BITs must be greater than FTRACE_BITs */ TRACE_GLOBAL_BIT, TRACE_GLOBAL_NMI_BIT, TRACE_GLOBAL_IRQ_BIT, TRACE_GLOBAL_SIRQ_BIT, + /* INTERNAL_BITs must be greater than GLOBAL_BITs */ + TRACE_INTERNAL_BIT, + TRACE_INTERNAL_NMI_BIT, + TRACE_INTERNAL_IRQ_BIT, + TRACE_INTERNAL_SIRQ_BIT, + TRACE_CONTROL_BIT, /* @@ -325,6 +356,71 @@ enum { #define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) #define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) +#define TRACE_CONTEXT_BITS 4 + +#define TRACE_FTRACE_START TRACE_FTRACE_BIT +#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_GLOBAL_START TRACE_GLOBAL_BIT +#define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_LIST_START TRACE_INTERNAL_BIT +#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_CONTEXT_MASK TRACE_LIST_MAX + +static __always_inline int trace_get_context_bit(void) +{ + int bit; + + if (in_interrupt()) { + if (in_nmi()) + bit = 0; + + else if (in_irq()) + bit = 1; + else + bit = 2; + } else + bit = 3; + + return bit; +} + +static __always_inline int trace_test_and_set_recursion(int start, int max) +{ + unsigned int val = current->trace_recursion; + int bit; + + /* A previous recursion check was made */ + if ((val & TRACE_CONTEXT_MASK) > max) + return 0; + + bit = trace_get_context_bit() + start; + if (unlikely(val & (1 << bit))) + return -1; + + val |= 1 << bit; + current->trace_recursion = val; + barrier(); + + return bit; +} + +static __always_inline void trace_clear_recursion(int bit) +{ + unsigned int val = current->trace_recursion; + + if (!bit) + return; + + bit = 1 << bit; + val &= ~bit; + + barrier(); + current->trace_recursion = val; +} + #define TRACE_PIPE_ALL_CPU -1 static inline struct ring_buffer_iter * From 897f68a48b1f8fb6cb7493e1ee37e3ed7f879937 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 17:52:35 -0400 Subject: [PATCH 23/25] ftrace: Use only the preempt version of function tracing The function tracer had two different versions of function tracing. The disabling of irqs version and the preempt disable version. As function tracing in very intrusive and can cause nasty recursion issues, it has its own recursion protection. But the old method to do this was a flat layer. If it detected that a recursion was happening then it would just return without recording. This made the preempt version (much faster than the irq disabling one) not very useful, because if an interrupt were to occur after the recursion flag was set, the interrupt would not be traced at all, because every function that was traced would think it recursed on itself (due to the context it preempted setting the recursive flag). Now that we have a recursion flag for every context level, we no longer need to worry about that. We can disable preemption, set the current context recursion check bit, and go on. If an interrupt were to come along, it would check its own context bit and happily continue to trace. As the preempt version is faster than the irq disable version, there's no more reason to keep the preempt version around. And the irq disable version still had an issue with missing out on tracing NMI code. Remove the irq disable function tracer version and have the preempt disable version be the default (and only version). Before this patch we had from running: # echo function > /debug/tracing/current_tracer # for i in `seq 10`; do ./hackbench 50; done Time: 12.028 Time: 11.945 Time: 11.925 Time: 11.964 Time: 12.002 Time: 11.910 Time: 11.944 Time: 11.929 Time: 11.941 Time: 11.924 (average: 11.9512) Now we have: # echo function > /debug/tracing/current_tracer # for i in `seq 10`; do ./hackbench 50; done Time: 10.285 Time: 10.407 Time: 10.243 Time: 10.372 Time: 10.380 Time: 10.198 Time: 10.272 Time: 10.354 Time: 10.248 Time: 10.253 (average: 10.3012) a 13.8% savings! Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions.c | 61 ++++++++-------------------------- 1 file changed, 14 insertions(+), 47 deletions(-) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 8e3ad8082ab70..1c327ef13a9aa 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -47,34 +47,6 @@ static void function_trace_start(struct trace_array *tr) tracing_reset_online_cpus(tr); } -static void -function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) -{ - struct trace_array *tr = func_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - int pc; - - if (unlikely(!ftrace_function_enabled)) - return; - - pc = preempt_count(); - preempt_disable_notrace(); - local_save_flags(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) - trace_function(tr, ip, parent_ip, flags, pc); - - atomic_dec(&data->disabled); - preempt_enable_notrace(); -} - /* Our option */ enum { TRACE_FUNC_OPT_STACK = 0x1, @@ -85,34 +57,34 @@ static struct tracer_flags func_flags; static void function_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) - { struct trace_array *tr = func_trace; struct trace_array_cpu *data; unsigned long flags; - long disabled; + unsigned int bit; int cpu; int pc; if (unlikely(!ftrace_function_enabled)) return; - /* - * Need to use raw, since this must be called before the - * recursive protection is performed. - */ - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); + pc = preempt_count(); + preempt_disable_notrace(); - if (likely(disabled == 1)) { - pc = preempt_count(); + bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX); + if (bit < 0) + goto out; + + cpu = smp_processor_id(); + data = tr->data[cpu]; + if (!atomic_read(&data->disabled)) { + local_save_flags(flags); trace_function(tr, ip, parent_ip, flags, pc); } + trace_clear_recursion(bit); - atomic_dec(&data->disabled); - local_irq_restore(flags); + out: + preempt_enable_notrace(); } static void @@ -185,11 +157,6 @@ static void tracing_start_function_trace(void) { ftrace_function_enabled = 0; - if (trace_flags & TRACE_ITER_PREEMPTONLY) - trace_ops.func = function_trace_call_preempt_only; - else - trace_ops.func = function_trace_call; - if (func_flags.val & TRACE_FUNC_OPT_STACK) register_ftrace_function(&trace_stack_ops); else From 567cd4da54ff45513d2ca1f0e3cb9ba45b66d6cf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 18:33:05 -0400 Subject: [PATCH 24/25] ring-buffer: User context bit recursion checking Using context bit recursion checking, we can help increase the performance of the ring buffer. Before this patch: # echo function > /debug/tracing/current_tracer # for i in `seq 10`; do ./hackbench 50; done Time: 10.285 Time: 10.407 Time: 10.243 Time: 10.372 Time: 10.380 Time: 10.198 Time: 10.272 Time: 10.354 Time: 10.248 Time: 10.253 (average: 10.3012) Now we have: # echo function > /debug/tracing/current_tracer # for i in `seq 10`; do ./hackbench 50; done Time: 9.712 Time: 9.824 Time: 9.861 Time: 9.827 Time: 9.962 Time: 9.905 Time: 9.886 Time: 10.088 Time: 9.861 Time: 9.834 (average: 9.876) a 4% savings! Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 85 +++++++++++++++++++++++++++----------- kernel/trace/trace.h | 13 +++--- 2 files changed, 67 insertions(+), 31 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6ff9cc4658ed7..481e262692810 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2432,41 +2432,76 @@ rb_reserve_next_event(struct ring_buffer *buffer, #ifdef CONFIG_TRACING -#define TRACE_RECURSIVE_DEPTH 16 +/* + * The lock and unlock are done within a preempt disable section. + * The current_context per_cpu variable can only be modified + * by the current task between lock and unlock. But it can + * be modified more than once via an interrupt. To pass this + * information from the lock to the unlock without having to + * access the 'in_interrupt()' functions again (which do show + * a bit of overhead in something as critical as function tracing, + * we use a bitmask trick. + * + * bit 0 = NMI context + * bit 1 = IRQ context + * bit 2 = SoftIRQ context + * bit 3 = normal context. + * + * This works because this is the order of contexts that can + * preempt other contexts. A SoftIRQ never preempts an IRQ + * context. + * + * When the context is determined, the corresponding bit is + * checked and set (if it was set, then a recursion of that context + * happened). + * + * On unlock, we need to clear this bit. To do so, just subtract + * 1 from the current_context and AND it to itself. + * + * (binary) + * 101 - 1 = 100 + * 101 & 100 = 100 (clearing bit zero) + * + * 1010 - 1 = 1001 + * 1010 & 1001 = 1000 (clearing bit 1) + * + * The least significant bit can be cleared this way, and it + * just so happens that it is the same bit corresponding to + * the current context. + */ +static DEFINE_PER_CPU(unsigned int, current_context); -/* Keep this code out of the fast path cache */ -static noinline void trace_recursive_fail(void) +static __always_inline int trace_recursive_lock(void) { - /* Disable all tracing before we do anything else */ - tracing_off_permanent(); - - printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" - "HC[%lu]:SC[%lu]:NMI[%lu]\n", - trace_recursion_buffer(), - hardirq_count() >> HARDIRQ_SHIFT, - softirq_count() >> SOFTIRQ_SHIFT, - in_nmi()); + unsigned int val = this_cpu_read(current_context); + int bit; - WARN_ON_ONCE(1); -} - -static inline int trace_recursive_lock(void) -{ - trace_recursion_inc(); + if (in_interrupt()) { + if (in_nmi()) + bit = 0; + else if (in_irq()) + bit = 1; + else + bit = 2; + } else + bit = 3; - if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH)) - return 0; + if (unlikely(val & (1 << bit))) + return 1; - trace_recursive_fail(); + val |= (1 << bit); + this_cpu_write(current_context, val); - return -1; + return 0; } -static inline void trace_recursive_unlock(void) +static __always_inline void trace_recursive_unlock(void) { - WARN_ON_ONCE(!trace_recursion_buffer()); + unsigned int val = this_cpu_read(current_context); - trace_recursion_dec(); + val--; + val &= this_cpu_read(current_context); + this_cpu_write(current_context, val); } #else diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c203a51dd4121..04a2c7ab1735b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -291,11 +291,6 @@ struct tracer { /* Only current can touch trace_recursion */ -#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0) -#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0) - -/* Ring buffer has the 10 LSB bits to count */ -#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) /* * For function tracing recursion: @@ -323,7 +318,13 @@ struct tracer { * caller, and we can skip the current check. */ enum { - TRACE_FTRACE_BIT = 11, + TRACE_BUFFER_BIT, + TRACE_BUFFER_NMI_BIT, + TRACE_BUFFER_IRQ_BIT, + TRACE_BUFFER_SIRQ_BIT, + + /* Start of function recursion bits */ + TRACE_FTRACE_BIT, TRACE_FTRACE_NMI_BIT, TRACE_FTRACE_IRQ_BIT, TRACE_FTRACE_SIRQ_BIT, From 0b07436d95b5404134da4d661fd183eac863513e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 22 Jan 2013 16:58:30 -0500 Subject: [PATCH 25/25] ring-buffer: Remove trace.h from ring_buffer.c ring_buffer.c use to require declarations from trace.h, but these have moved to the generic header files. There's nothing in trace.h that ring_buffer.c requires. There's some headers that trace.h included that ring_buffer.c needs, but it's best that it includes them directly, and not include trace.h. Also, some things may use ring_buffer.c without having tracing configured. This removes the dependency that may come in the future. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 481e262692810..13950d9027cb9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3,8 +3,10 @@ * * Copyright (C) 2008 Steven Rostedt */ +#include #include #include +#include #include #include #include @@ -21,7 +23,6 @@ #include #include -#include "trace.h" static void update_pages_handler(struct work_struct *work);