From e404b321dbb2d6e438522b7dce9c1d0c6a8c5275 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Sun, 19 Feb 2012 14:16:07 +0300 Subject: [PATCH 1/9] tracing: Don't print an extra separator of flags If __print_flags() is used after another __print_*() function, the temp seq_file buffer will not be empty on entry, and the delimiter will be printed even though there's just one field. We get something like: |S instead of just: S This is because the length of the temp seq buffer is used to determine if the delimiter is printed or not. But this algorithm fails when the seq buffer is not empty on entry, and the delimiter will be printed because it thinks that a previous field was already printed. Link: http://lkml.kernel.org/r/1329650167-480655-1-git-send-email-avagin@openvz.org Signed-off-by: Andrew Vagin Cc: Ingo Molnar Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0d6ff3555942..3efd718984fb 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -300,7 +300,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long mask; const char *str; const char *ret = p->buffer + p->len; - int i; + int i, first = 1; for (i = 0; flag_array[i].name && flags; i++) { @@ -310,8 +310,10 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, str = flag_array[i].name; flags &= ~mask; - if (p->len && delim) + if (!first && delim) trace_seq_puts(p, delim); + else + first = 0; trace_seq_puts(p, str); } From 5b34926114e39e12005031269613d2b13194aeba Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Feb 2012 20:37:32 -0500 Subject: [PATCH 2/9] tracing: Don't use p->len field to determine output in __print_*() functions If more than one __print_*() function is used in a tracepoint (__print_flags(), __print_symbols(), etc), then the temp seq buffer will not be zero on entry. Using the temp seq buffer's length to know if data has been printed or not in the current function is incorrect and may produce incorrect results. Currently, no in-tree tracepoint causes this bug, but new ones may be created. Cc: Andrew Vagin Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 3efd718984fb..c5a01873567d 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -319,7 +319,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, /* check for left over flags */ if (flags) { - if (p->len && delim) + if (!first && delim) trace_seq_puts(p, delim); trace_seq_printf(p, "0x%lx", flags); } @@ -346,7 +346,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, break; } - if (!p->len) + if (ret == (const char *)(p->buffer + p->len)) trace_seq_printf(p, "0x%lx", val); trace_seq_putc(p, 0); @@ -372,7 +372,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, break; } - if (!p->len) + if (ret == (const char *)(p->buffer + p->len)) trace_seq_printf(p, "0x%llx", val); trace_seq_putc(p, 0); From e248491ac283b516958ca9ab62c8e74b6718bca8 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:48 +0100 Subject: [PATCH 3/9] ftrace: Add enable/disable ftrace_ops control interface Adding a way to temporarily enable/disable ftrace_ops. The change follows the same way as 'global' ftrace_ops are done. Introducing 2 global ftrace_ops - control_ops and ftrace_control_list which take over all ftrace_ops registered with FTRACE_OPS_FL_CONTROL flag. In addition new per cpu flag called 'disabled' is also added to ftrace_ops to provide the control information for each cpu. When ftrace_ops with FTRACE_OPS_FL_CONTROL is registered, it is set as disabled for all cpus. The ftrace_control_list contains all the registered 'control' ftrace_ops. The control_ops provides function which iterates ftrace_control_list and does the check for 'disabled' flag on current cpu. Adding 3 inline functions: ftrace_function_local_disable/ftrace_function_local_enable - enable/disable the ftrace_ops on current cpu ftrace_function_local_disabled - get disabled ftrace_ops::disabled value for current cpu Link: http://lkml.kernel.org/r/1329317514-8131-2-git-send-email-jolsa@redhat.com Acked-by: Frederic Weisbecker Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 66 ++++++++++++++++++++++++ kernel/trace/ftrace.c | 111 ++++++++++++++++++++++++++++++++++++++--- kernel/trace/trace.h | 2 + 3 files changed, 172 insertions(+), 7 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f33fb3b041c6..64a309d2fbd6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -31,16 +31,33 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip); +/* + * FTRACE_OPS_FL_* bits denote the state of ftrace_ops struct and are + * set in the flags member. + * + * ENABLED - set/unset when ftrace_ops is registered/unregistered + * GLOBAL - set manualy by ftrace_ops user to denote the ftrace_ops + * is part of the global tracers sharing the same filter + * via set_ftrace_* debugfs files. + * DYNAMIC - set when ftrace_ops is registered to denote dynamically + * allocated ftrace_ops which need special care + * CONTROL - set manualy by ftrace_ops user to denote the ftrace_ops + * could be controled by following calls: + * ftrace_function_local_enable + * ftrace_function_local_disable + */ enum { FTRACE_OPS_FL_ENABLED = 1 << 0, FTRACE_OPS_FL_GLOBAL = 1 << 1, FTRACE_OPS_FL_DYNAMIC = 1 << 2, + FTRACE_OPS_FL_CONTROL = 1 << 3, }; struct ftrace_ops { ftrace_func_t func; struct ftrace_ops *next; unsigned long flags; + int __percpu *disabled; #ifdef CONFIG_DYNAMIC_FTRACE struct ftrace_hash *notrace_hash; struct ftrace_hash *filter_hash; @@ -97,6 +114,55 @@ int register_ftrace_function(struct ftrace_ops *ops); int unregister_ftrace_function(struct ftrace_ops *ops); void clear_ftrace_function(void); +/** + * ftrace_function_local_enable - enable controlled ftrace_ops on current cpu + * + * This function enables tracing on current cpu by decreasing + * the per cpu control variable. + * It must be called with preemption disabled and only on ftrace_ops + * registered with FTRACE_OPS_FL_CONTROL. If called without preemption + * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled. + */ +static inline void ftrace_function_local_enable(struct ftrace_ops *ops) +{ + if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_CONTROL))) + return; + + (*this_cpu_ptr(ops->disabled))--; +} + +/** + * ftrace_function_local_disable - enable controlled ftrace_ops on current cpu + * + * This function enables tracing on current cpu by decreasing + * the per cpu control variable. + * It must be called with preemption disabled and only on ftrace_ops + * registered with FTRACE_OPS_FL_CONTROL. If called without preemption + * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled. + */ +static inline void ftrace_function_local_disable(struct ftrace_ops *ops) +{ + if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_CONTROL))) + return; + + (*this_cpu_ptr(ops->disabled))++; +} + +/** + * ftrace_function_local_disabled - returns ftrace_ops disabled value + * on current cpu + * + * This function returns value of ftrace_ops::disabled on current cpu. + * It must be called with preemption disabled and only on ftrace_ops + * registered with FTRACE_OPS_FL_CONTROL. If called without preemption + * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled. + */ +static inline int ftrace_function_local_disabled(struct ftrace_ops *ops) +{ + WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_CONTROL)); + return *this_cpu_ptr(ops->disabled); +} + extern void ftrace_stub(unsigned long a0, unsigned long a1); #else /* !CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d1499e910fe8..f615f974d90e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -62,6 +62,8 @@ #define FTRACE_HASH_DEFAULT_BITS 10 #define FTRACE_HASH_MAX_BITS 12 +#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) + /* ftrace_enabled is a method to turn ftrace on or off */ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; @@ -89,12 +91,14 @@ static struct ftrace_ops ftrace_list_end __read_mostly = { }; static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; +static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; +static struct ftrace_ops control_ops; static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); @@ -168,6 +172,32 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) } #endif +static void control_ops_disable_all(struct ftrace_ops *ops) +{ + int cpu; + + for_each_possible_cpu(cpu) + *per_cpu_ptr(ops->disabled, cpu) = 1; +} + +static int control_ops_alloc(struct ftrace_ops *ops) +{ + int __percpu *disabled; + + disabled = alloc_percpu(int); + if (!disabled) + return -ENOMEM; + + ops->disabled = disabled; + control_ops_disable_all(ops); + return 0; +} + +static void control_ops_free(struct ftrace_ops *ops) +{ + free_percpu(ops->disabled); +} + static void update_global_ops(void) { ftrace_func_t func; @@ -259,6 +289,26 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) return 0; } +static void add_ftrace_list_ops(struct ftrace_ops **list, + struct ftrace_ops *main_ops, + struct ftrace_ops *ops) +{ + int first = *list == &ftrace_list_end; + add_ftrace_ops(list, ops); + if (first) + add_ftrace_ops(&ftrace_ops_list, main_ops); +} + +static int remove_ftrace_list_ops(struct ftrace_ops **list, + struct ftrace_ops *main_ops, + struct ftrace_ops *ops) +{ + int ret = remove_ftrace_ops(list, ops); + if (!ret && *list == &ftrace_list_end) + ret = remove_ftrace_ops(&ftrace_ops_list, main_ops); + return ret; +} + static int __register_ftrace_function(struct ftrace_ops *ops) { if (ftrace_disabled) @@ -270,15 +320,20 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) return -EBUSY; + /* We don't support both control and global flags set. */ + if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) + return -EINVAL; + if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - int first = ftrace_global_list == &ftrace_list_end; - add_ftrace_ops(&ftrace_global_list, ops); + add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops); ops->flags |= FTRACE_OPS_FL_ENABLED; - if (first) - add_ftrace_ops(&ftrace_ops_list, &global_ops); + } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { + if (control_ops_alloc(ops)) + return -ENOMEM; + add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); } else add_ftrace_ops(&ftrace_ops_list, ops); @@ -302,11 +357,23 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) return -EINVAL; if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ret = remove_ftrace_ops(&ftrace_global_list, ops); - if (!ret && ftrace_global_list == &ftrace_list_end) - ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops); + ret = remove_ftrace_list_ops(&ftrace_global_list, + &global_ops, ops); if (!ret) ops->flags &= ~FTRACE_OPS_FL_ENABLED; + } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { + ret = remove_ftrace_list_ops(&ftrace_control_list, + &control_ops, ops); + if (!ret) { + /* + * The ftrace_ops is now removed from the list, + * so there'll be no new users. We must ensure + * all current users are done before we free + * the control data. + */ + synchronize_sched(); + control_ops_free(ops); + } } else ret = remove_ftrace_ops(&ftrace_ops_list, ops); @@ -3873,6 +3940,36 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) #endif /* CONFIG_DYNAMIC_FTRACE */ +static void +ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_ops *op; + + if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) + return; + + /* + * Some of the ops may be dynamically allocated, + * they must be freed after a synchronize_sched(). + */ + preempt_disable_notrace(); + trace_recursion_set(TRACE_CONTROL_BIT); + op = rcu_dereference_raw(ftrace_control_list); + while (op != &ftrace_list_end) { + if (!ftrace_function_local_disabled(op) && + ftrace_ops_test(op, ip)) + op->func(ip, parent_ip); + + op = rcu_dereference_raw(op->next); + }; + trace_recursion_clear(TRACE_CONTROL_BIT); + preempt_enable_notrace(); +} + +static struct ftrace_ops control_ops = { + .func = ftrace_ops_control_func, +}; + static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b93ecbadad6d..55c6ea00f28a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -288,6 +288,8 @@ struct tracer { /* for function tracing recursion */ #define TRACE_INTERNAL_BIT (1<<11) #define TRACE_GLOBAL_BIT (1<<12) +#define TRACE_CONTROL_BIT (1<<13) + /* * Abuse of the trace_recursion. * As we need a way to maintain state if we are tracing the function From ceec0b6fc7cd43b38a40c2d40223f9cd0616f0cd Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:49 +0100 Subject: [PATCH 4/9] ftrace, perf: Add open/close tracepoint perf registration actions Adding TRACE_REG_PERF_OPEN and TRACE_REG_PERF_CLOSE to differentiate register/unregister from open/close actions. The register/unregister actions are invoked for the first/last tracepoint user when opening/closing the event. The open/close actions are invoked for each tracepoint user when opening/closing the event. Link: http://lkml.kernel.org/r/1329317514-8131-3-git-send-email-jolsa@redhat.com Acked-by: Frederic Weisbecker Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 6 +- kernel/trace/trace_event_perf.c | 116 +++++++++++++++++++++----------- kernel/trace/trace_events.c | 10 ++- kernel/trace/trace_kprobe.c | 6 +- kernel/trace/trace_syscalls.c | 14 ++-- 5 files changed, 101 insertions(+), 51 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index c3da42dd22ba..195e3606ddd7 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -146,6 +146,8 @@ enum trace_reg { TRACE_REG_UNREGISTER, TRACE_REG_PERF_REGISTER, TRACE_REG_PERF_UNREGISTER, + TRACE_REG_PERF_OPEN, + TRACE_REG_PERF_CLOSE, }; struct ftrace_event_call; @@ -157,7 +159,7 @@ struct ftrace_event_class { void *perf_probe; #endif int (*reg)(struct ftrace_event_call *event, - enum trace_reg type); + enum trace_reg type, void *data); int (*define_fields)(struct ftrace_event_call *); struct list_head *(*get_fields)(struct ftrace_event_call *); struct list_head fields; @@ -165,7 +167,7 @@ struct ftrace_event_class { }; extern int ftrace_event_reg(struct ftrace_event_call *event, - enum trace_reg type); + enum trace_reg type, void *data); enum { TRACE_EVENT_FL_ENABLED_BIT, diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 19a359d5e6d5..0cfcc37f63de 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -44,23 +44,17 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event, return 0; } -static int perf_trace_event_init(struct ftrace_event_call *tp_event, - struct perf_event *p_event) +static int perf_trace_event_reg(struct ftrace_event_call *tp_event, + struct perf_event *p_event) { struct hlist_head __percpu *list; - int ret; + int ret = -ENOMEM; int cpu; - ret = perf_trace_event_perm(tp_event, p_event); - if (ret) - return ret; - p_event->tp_event = tp_event; if (tp_event->perf_refcount++ > 0) return 0; - ret = -ENOMEM; - list = alloc_percpu(struct hlist_head); if (!list) goto fail; @@ -83,7 +77,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, } } - ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); + ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL); if (ret) goto fail; @@ -108,6 +102,69 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, return ret; } +static void perf_trace_event_unreg(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + int i; + + if (--tp_event->perf_refcount > 0) + goto out; + + tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL); + + /* + * Ensure our callback won't be called anymore. The buffers + * will be freed after that. + */ + tracepoint_synchronize_unregister(); + + free_percpu(tp_event->perf_events); + tp_event->perf_events = NULL; + + if (!--total_ref_count) { + for (i = 0; i < PERF_NR_CONTEXTS; i++) { + free_percpu(perf_trace_buf[i]); + perf_trace_buf[i] = NULL; + } + } +out: + module_put(tp_event->mod); +} + +static int perf_trace_event_open(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event); +} + +static void perf_trace_event_close(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event); +} + +static int perf_trace_event_init(struct ftrace_event_call *tp_event, + struct perf_event *p_event) +{ + int ret; + + ret = perf_trace_event_perm(tp_event, p_event); + if (ret) + return ret; + + ret = perf_trace_event_reg(tp_event, p_event); + if (ret) + return ret; + + ret = perf_trace_event_open(p_event); + if (ret) { + perf_trace_event_unreg(p_event); + return ret; + } + + return 0; +} + int perf_trace_init(struct perf_event *p_event) { struct ftrace_event_call *tp_event; @@ -130,6 +187,14 @@ int perf_trace_init(struct perf_event *p_event) return ret; } +void perf_trace_destroy(struct perf_event *p_event) +{ + mutex_lock(&event_mutex); + perf_trace_event_close(p_event); + perf_trace_event_unreg(p_event); + mutex_unlock(&event_mutex); +} + int perf_trace_add(struct perf_event *p_event, int flags) { struct ftrace_event_call *tp_event = p_event->tp_event; @@ -154,37 +219,6 @@ void perf_trace_del(struct perf_event *p_event, int flags) hlist_del_rcu(&p_event->hlist_entry); } -void perf_trace_destroy(struct perf_event *p_event) -{ - struct ftrace_event_call *tp_event = p_event->tp_event; - int i; - - mutex_lock(&event_mutex); - if (--tp_event->perf_refcount > 0) - goto out; - - tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); - - /* - * Ensure our callback won't be called anymore. The buffers - * will be freed after that. - */ - tracepoint_synchronize_unregister(); - - free_percpu(tp_event->perf_events); - tp_event->perf_events = NULL; - - if (!--total_ref_count) { - for (i = 0; i < PERF_NR_CONTEXTS; i++) { - free_percpu(perf_trace_buf[i]); - perf_trace_buf[i] = NULL; - } - } -out: - module_put(tp_event->mod); - mutex_unlock(&event_mutex); -} - __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, struct pt_regs *regs, int *rctxp) { diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c212a7f934ec..5138fea37908 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -147,7 +147,8 @@ int trace_event_raw_init(struct ftrace_event_call *call) } EXPORT_SYMBOL_GPL(trace_event_raw_init); -int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) +int ftrace_event_reg(struct ftrace_event_call *call, + enum trace_reg type, void *data) { switch (type) { case TRACE_REG_REGISTER: @@ -170,6 +171,9 @@ int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) call->class->perf_probe, call); return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + return 0; #endif } return 0; @@ -209,7 +213,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, tracing_stop_cmdline_record(); call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; } - call->class->reg(call, TRACE_REG_UNREGISTER); + call->class->reg(call, TRACE_REG_UNREGISTER, NULL); } break; case 1: @@ -218,7 +222,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, tracing_start_cmdline_record(); call->flags |= TRACE_EVENT_FL_RECORDED_CMD; } - ret = call->class->reg(call, TRACE_REG_REGISTER); + ret = call->class->reg(call, TRACE_REG_REGISTER, NULL); if (ret) { tracing_stop_cmdline_record(); pr_info("event trace: Could not enable event " diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 00d527c945a4..5667f8958cc9 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1892,7 +1892,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, #endif /* CONFIG_PERF_EVENTS */ static __kprobes -int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) +int kprobe_register(struct ftrace_event_call *event, + enum trace_reg type, void *data) { struct trace_probe *tp = (struct trace_probe *)event->data; @@ -1909,6 +1910,9 @@ int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) case TRACE_REG_PERF_UNREGISTER: disable_trace_probe(tp, TP_FLAG_PROFILE); return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + return 0; #endif } return 0; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 43500153dd1e..e23515f51ed4 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -17,9 +17,9 @@ static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); static int syscall_enter_register(struct ftrace_event_call *event, - enum trace_reg type); + enum trace_reg type, void *data); static int syscall_exit_register(struct ftrace_event_call *event, - enum trace_reg type); + enum trace_reg type, void *data); static int syscall_enter_define_fields(struct ftrace_event_call *call); static int syscall_exit_define_fields(struct ftrace_event_call *call); @@ -649,7 +649,7 @@ void perf_sysexit_disable(struct ftrace_event_call *call) #endif /* CONFIG_PERF_EVENTS */ static int syscall_enter_register(struct ftrace_event_call *event, - enum trace_reg type) + enum trace_reg type, void *data) { switch (type) { case TRACE_REG_REGISTER: @@ -664,13 +664,16 @@ static int syscall_enter_register(struct ftrace_event_call *event, case TRACE_REG_PERF_UNREGISTER: perf_sysenter_disable(event); return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + return 0; #endif } return 0; } static int syscall_exit_register(struct ftrace_event_call *event, - enum trace_reg type) + enum trace_reg type, void *data) { switch (type) { case TRACE_REG_REGISTER: @@ -685,6 +688,9 @@ static int syscall_exit_register(struct ftrace_event_call *event, case TRACE_REG_PERF_UNREGISTER: perf_sysexit_disable(event); return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + return 0; #endif } return 0; From 489c75c3b333dfda4c8d2b7ad1b00e5da024bfa7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:50 +0100 Subject: [PATCH 5/9] ftrace, perf: Add add/del tracepoint perf registration actions Adding TRACE_REG_PERF_ADD and TRACE_REG_PERF_DEL to handle perf event schedule in/out actions. The add action is invoked for when the perf event is scheduled in, while the del action is invoked when the event is scheduled out. Link: http://lkml.kernel.org/r/1329317514-8131-4-git-send-email-jolsa@redhat.com Acked-by: Frederic Weisbecker Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 2 ++ kernel/trace/trace_event_perf.c | 4 +++- kernel/trace/trace_events.c | 2 ++ kernel/trace/trace_kprobe.c | 2 ++ kernel/trace/trace_syscalls.c | 4 ++++ 5 files changed, 13 insertions(+), 1 deletion(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 195e3606ddd7..2bf677cb2d6f 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -148,6 +148,8 @@ enum trace_reg { TRACE_REG_PERF_UNREGISTER, TRACE_REG_PERF_OPEN, TRACE_REG_PERF_CLOSE, + TRACE_REG_PERF_ADD, + TRACE_REG_PERF_DEL, }; struct ftrace_event_call; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 0cfcc37f63de..d72af0b03822 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -211,12 +211,14 @@ int perf_trace_add(struct perf_event *p_event, int flags) list = this_cpu_ptr(pcpu_list); hlist_add_head_rcu(&p_event->hlist_entry, list); - return 0; + return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event); } void perf_trace_del(struct perf_event *p_event, int flags) { + struct ftrace_event_call *tp_event = p_event->tp_event; hlist_del_rcu(&p_event->hlist_entry); + tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); } __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5138fea37908..079a93ae8a9d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -173,6 +173,8 @@ int ftrace_event_reg(struct ftrace_event_call *call, return 0; case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: return 0; #endif } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5667f8958cc9..580a05ec926b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1912,6 +1912,8 @@ int kprobe_register(struct ftrace_event_call *event, return 0; case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: return 0; #endif } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index e23515f51ed4..96fc73369099 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -666,6 +666,8 @@ static int syscall_enter_register(struct ftrace_event_call *event, return 0; case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: return 0; #endif } @@ -690,6 +692,8 @@ static int syscall_exit_register(struct ftrace_event_call *event, return 0; case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: return 0; #endif } From e59a0bff3ecf389951e3c9378ddfd00f6448bfaa Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:51 +0100 Subject: [PATCH 6/9] ftrace: Add FTRACE_ENTRY_REG macro to allow event registration Adding FTRACE_ENTRY_REG macro so particular ftrace entries could specify registration function and thus become accesible via perf. This will be used in upcomming patch for function trace. Link: http://lkml.kernel.org/r/1329317514-8131-5-git-send-email-jolsa@redhat.com Acked-by: Frederic Weisbecker Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 4 ++++ kernel/trace/trace_export.c | 18 ++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 55c6ea00f28a..638476af8d7b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -68,6 +68,10 @@ enum trace_type { #undef FTRACE_ENTRY_DUP #define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) + #include "trace_entries.h" /* diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index bbeec31e0ae3..f74de861cde9 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -18,6 +18,14 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM ftrace +/* + * The FTRACE_ENTRY_REG macro allows ftrace entry to define register + * function and thus become accesible via perf. + */ +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) + /* not needed for this file */ #undef __field_struct #define __field_struct(type, item) @@ -152,13 +160,14 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #undef F_printk #define F_printk(fmt, args...) #fmt ", " __stringify(args) -#undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, regfn)\ \ struct ftrace_event_class event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .define_fields = ftrace_define_fields_##call, \ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ + .reg = regfn, \ }; \ \ struct ftrace_event_call __used event_##call = { \ @@ -170,4 +179,9 @@ struct ftrace_event_call __used event_##call = { \ struct ftrace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ + FTRACE_ENTRY_REG(call, struct_name, etype, \ + PARAMS(tstruct), PARAMS(print), NULL) + #include "trace_entries.h" From ced39002f5ea736b716ae233fb68b26d59783912 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:52 +0100 Subject: [PATCH 7/9] ftrace, perf: Add support to use function tracepoint in perf Adding perf registration support for the ftrace function event, so it is now possible to register it via perf interface. The perf_event struct statically contains ftrace_ops as a handle for function tracer. The function tracer is registered/unregistered in open/close actions. To be efficient, we enable/disable ftrace_ops each time the traced process is scheduled in/out (via TRACE_REG_PERF_(ADD|DELL) handlers). This way tracing is enabled only when the process is running. Intentionally using this way instead of the event's hw state PERF_HES_STOPPED, which would not disable the ftrace_ops. It is now possible to use function trace within perf commands like: perf record -e ftrace:function ls perf stat -e ftrace:function ls Allowed only for root. Link: http://lkml.kernel.org/r/1329317514-8131-6-git-send-email-jolsa@redhat.com Acked-by: Frederic Weisbecker Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/perf_event.h | 3 ++ kernel/trace/trace.h | 11 +++++ kernel/trace/trace_entries.h | 6 ++- kernel/trace/trace_event_perf.c | 86 +++++++++++++++++++++++++++++++++ kernel/trace/trace_export.c | 5 ++ 5 files changed, 109 insertions(+), 2 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 412b790f5da6..92a056f6d18d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -859,6 +859,9 @@ struct perf_event { #ifdef CONFIG_EVENT_TRACING struct ftrace_event_call *tp_event; struct event_filter *filter; +#ifdef CONFIG_FUNCTION_TRACER + struct ftrace_ops ftrace_ops; +#endif #endif #ifdef CONFIG_CGROUP_PERF diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 638476af8d7b..76a1c5094bbf 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -595,6 +595,8 @@ static inline int ftrace_trace_task(struct task_struct *task) static inline int ftrace_is_dead(void) { return 0; } #endif +int ftrace_event_is_function(struct ftrace_event_call *call); + /* * struct trace_parser - servers for reading the user input separated by spaces * @cont: set if the input is not complete - no final space char was found @@ -832,4 +834,13 @@ extern const char *__stop___trace_bprintk_fmt[]; FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) #include "trace_entries.h" +#ifdef CONFIG_PERF_EVENTS +#ifdef CONFIG_FUNCTION_TRACER +int perf_ftrace_event_register(struct ftrace_event_call *call, + enum trace_reg type, void *data); +#else +#define perf_ftrace_event_register NULL +#endif /* CONFIG_FUNCTION_TRACER */ +#endif /* CONFIG_PERF_EVENTS */ + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 93365907f219..47db7eda0531 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -55,7 +55,7 @@ /* * Function trace entry - function address and parent function address: */ -FTRACE_ENTRY(function, ftrace_entry, +FTRACE_ENTRY_REG(function, ftrace_entry, TRACE_FN, @@ -64,7 +64,9 @@ FTRACE_ENTRY(function, ftrace_entry, __field( unsigned long, parent_ip ) ), - F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip) + F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip), + + perf_ftrace_event_register ); /* Function call entry */ diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index d72af0b03822..fdeeb5c49627 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -24,6 +24,11 @@ static int total_ref_count; static int perf_trace_event_perm(struct ftrace_event_call *tp_event, struct perf_event *p_event) { + /* The ftrace function trace is allowed only for root. */ + if (ftrace_event_is_function(tp_event) && + perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EPERM; + /* No tracing, just counting, so no obvious leak */ if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) return 0; @@ -250,3 +255,84 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, return raw_data; } EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); + +#ifdef CONFIG_FUNCTION_TRACER +static void +perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_entry *entry; + struct hlist_head *head; + struct pt_regs regs; + int rctx; + +#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ + sizeof(u64)) - sizeof(u32)) + + BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE); + + perf_fetch_caller_regs(®s); + + entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx); + if (!entry) + return; + + entry->ip = ip; + entry->parent_ip = parent_ip; + + head = this_cpu_ptr(event_function.perf_events); + perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, + 1, ®s, head); + +#undef ENTRY_SIZE +} + +static int perf_ftrace_function_register(struct perf_event *event) +{ + struct ftrace_ops *ops = &event->ftrace_ops; + + ops->flags |= FTRACE_OPS_FL_CONTROL; + ops->func = perf_ftrace_function_call; + return register_ftrace_function(ops); +} + +static int perf_ftrace_function_unregister(struct perf_event *event) +{ + struct ftrace_ops *ops = &event->ftrace_ops; + return unregister_ftrace_function(ops); +} + +static void perf_ftrace_function_enable(struct perf_event *event) +{ + ftrace_function_local_enable(&event->ftrace_ops); +} + +static void perf_ftrace_function_disable(struct perf_event *event) +{ + ftrace_function_local_disable(&event->ftrace_ops); +} + +int perf_ftrace_event_register(struct ftrace_event_call *call, + enum trace_reg type, void *data) +{ + switch (type) { + case TRACE_REG_REGISTER: + case TRACE_REG_UNREGISTER: + break; + case TRACE_REG_PERF_REGISTER: + case TRACE_REG_PERF_UNREGISTER: + return 0; + case TRACE_REG_PERF_OPEN: + return perf_ftrace_function_register(data); + case TRACE_REG_PERF_CLOSE: + return perf_ftrace_function_unregister(data); + case TRACE_REG_PERF_ADD: + perf_ftrace_function_enable(data); + return 0; + case TRACE_REG_PERF_DEL: + perf_ftrace_function_disable(data); + return 0; + } + + return -EINVAL; +} +#endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index f74de861cde9..a3dbee6d04cd 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -184,4 +184,9 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; FTRACE_ENTRY_REG(call, struct_name, etype, \ PARAMS(tstruct), PARAMS(print), NULL) +int ftrace_event_is_function(struct ftrace_event_call *call) +{ + return call == &event_function; +} + #include "trace_entries.h" From 02aa3162edaa166a01d193f80ccde890be8b55da Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:53 +0100 Subject: [PATCH 8/9] ftrace: Allow to specify filter field type for ftrace events Adding FILTER_TRACE_FN event field type for function tracepoint event, so it can be properly recognized within filtering code. Currently all fields of ftrace subsystem events share the common field type FILTER_OTHER. Since the function trace fields need special care within the filtering code we need to recognize it properly, hence adding the FILTER_TRACE_FN event type. Adding filter parameter to the FTRACE_ENTRY macro, to specify the filter field type for the event. Link: http://lkml.kernel.org/r/1329317514-8131-7-git-send-email-jolsa@redhat.com Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 + kernel/trace/trace.h | 23 ++++++++------ kernel/trace/trace_entries.h | 48 ++++++++++++++++++++-------- kernel/trace/trace_events_filter.c | 7 +++- kernel/trace/trace_export.c | 51 ++++++++++++++++-------------- 5 files changed, 83 insertions(+), 47 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 2bf677cb2d6f..dd478fc8f9f5 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -245,6 +245,7 @@ enum { FILTER_STATIC_STRING, FILTER_DYN_STRING, FILTER_PTR_STRING, + FILTER_TRACE_FN, }; #define EVENT_STORAGE_SIZE 128 diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 76a1c5094bbf..29f93cd434a5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -56,21 +56,23 @@ enum trace_type { #define F_STRUCT(args...) args #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ - struct struct_name { \ - struct trace_entry ent; \ - tstruct \ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ + struct struct_name { \ + struct trace_entry ent; \ + tstruct \ } #undef TP_ARGS #define TP_ARGS(args...) args #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) +#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter) #undef FTRACE_ENTRY_REG -#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \ + filter, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) #include "trace_entries.h" @@ -826,12 +828,13 @@ extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; #undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ +#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ extern struct ftrace_event_call \ __attribute__((__aligned__(4))) event_##call; #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ - FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) #include "trace_entries.h" #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 47db7eda0531..d91eb0541b3a 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -66,6 +66,8 @@ FTRACE_ENTRY_REG(function, ftrace_entry, F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip), + FILTER_TRACE_FN, + perf_ftrace_event_register ); @@ -80,7 +82,9 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, __field_desc( int, graph_ent, depth ) ), - F_printk("--> %lx (%d)", __entry->func, __entry->depth) + F_printk("--> %lx (%d)", __entry->func, __entry->depth), + + FILTER_OTHER ); /* Function return entry */ @@ -100,7 +104,9 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", __entry->func, __entry->depth, __entry->calltime, __entry->rettime, - __entry->depth) + __entry->depth), + + FILTER_OTHER ); /* @@ -129,8 +135,9 @@ FTRACE_ENTRY(context_switch, ctx_switch_entry, F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", __entry->prev_pid, __entry->prev_prio, __entry->prev_state, __entry->next_pid, __entry->next_prio, __entry->next_state, - __entry->next_cpu - ) + __entry->next_cpu), + + FILTER_OTHER ); /* @@ -148,8 +155,9 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", __entry->prev_pid, __entry->prev_prio, __entry->prev_state, __entry->next_pid, __entry->next_prio, __entry->next_state, - __entry->next_cpu - ) + __entry->next_cpu), + + FILTER_OTHER ); /* @@ -171,7 +179,9 @@ FTRACE_ENTRY(kernel_stack, stack_entry, "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", __entry->caller[0], __entry->caller[1], __entry->caller[2], __entry->caller[3], __entry->caller[4], __entry->caller[5], - __entry->caller[6], __entry->caller[7]) + __entry->caller[6], __entry->caller[7]), + + FILTER_OTHER ); FTRACE_ENTRY(user_stack, userstack_entry, @@ -187,7 +197,9 @@ FTRACE_ENTRY(user_stack, userstack_entry, "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", __entry->caller[0], __entry->caller[1], __entry->caller[2], __entry->caller[3], __entry->caller[4], __entry->caller[5], - __entry->caller[6], __entry->caller[7]) + __entry->caller[6], __entry->caller[7]), + + FILTER_OTHER ); /* @@ -204,7 +216,9 @@ FTRACE_ENTRY(bprint, bprint_entry, ), F_printk("%08lx fmt:%p", - __entry->ip, __entry->fmt) + __entry->ip, __entry->fmt), + + FILTER_OTHER ); FTRACE_ENTRY(print, print_entry, @@ -217,7 +231,9 @@ FTRACE_ENTRY(print, print_entry, ), F_printk("%08lx %s", - __entry->ip, __entry->buf) + __entry->ip, __entry->buf), + + FILTER_OTHER ); FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, @@ -236,7 +252,9 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, F_printk("%lx %lx %lx %d %x %x", (unsigned long)__entry->phys, __entry->value, __entry->pc, - __entry->map_id, __entry->opcode, __entry->width) + __entry->map_id, __entry->opcode, __entry->width), + + FILTER_OTHER ); FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, @@ -254,7 +272,9 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, F_printk("%lx %lx %lx %d %x", (unsigned long)__entry->phys, __entry->virt, __entry->len, - __entry->map_id, __entry->opcode) + __entry->map_id, __entry->opcode), + + FILTER_OTHER ); @@ -274,6 +294,8 @@ FTRACE_ENTRY(branch, trace_branch, F_printk("%u:%s:%s (%u)", __entry->line, - __entry->func, __entry->file, __entry->correct) + __entry->func, __entry->file, __entry->correct), + + FILTER_OTHER ); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 76afaee99dbc..3da3d0ec3584 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -899,6 +899,11 @@ int filter_assign_type(const char *type) return FILTER_OTHER; } +static bool is_function_field(struct ftrace_event_field *field) +{ + return field->filter_type == FILTER_TRACE_FN; +} + static bool is_string_field(struct ftrace_event_field *field) { return field->filter_type == FILTER_DYN_STRING || @@ -986,7 +991,7 @@ static int init_pred(struct filter_parse_state *ps, fn = filter_pred_strloc; else fn = filter_pred_pchar; - } else { + } else if (!is_function_field(field)) { if (field->is_signed) ret = strict_strtoll(pred->regex.pattern, 0, &val); else diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index a3dbee6d04cd..7b46c9bd22ae 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -23,8 +23,10 @@ * function and thus become accesible via perf. */ #undef FTRACE_ENTRY_REG -#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \ + filter, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) /* not needed for this file */ #undef __field_struct @@ -52,21 +54,22 @@ #define F_printk(fmt, args...) fmt, args #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ -struct ____ftrace_##name { \ - tstruct \ -}; \ -static void __always_unused ____ftrace_check_##name(void) \ -{ \ - struct ____ftrace_##name *__entry = NULL; \ - \ - /* force compile-time check on F_printk() */ \ - printk(print); \ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ +struct ____ftrace_##name { \ + tstruct \ +}; \ +static void __always_unused ____ftrace_check_##name(void) \ +{ \ + struct ____ftrace_##name *__entry = NULL; \ + \ + /* force compile-time check on F_printk() */ \ + printk(print); \ } #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print, filter) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) #include "trace_entries.h" @@ -75,7 +78,7 @@ static void __always_unused ____ftrace_check_##name(void) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), filter_type); \ if (ret) \ return ret; @@ -85,7 +88,7 @@ static void __always_unused ____ftrace_check_##name(void) \ offsetof(typeof(field), \ container.item), \ sizeof(field.container.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), filter_type); \ if (ret) \ return ret; @@ -99,7 +102,7 @@ static void __always_unused ____ftrace_check_##name(void) \ ret = trace_define_field(event_call, event_storage, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), filter_type); \ mutex_unlock(&event_storage_mutex); \ if (ret) \ return ret; \ @@ -112,7 +115,7 @@ static void __always_unused ____ftrace_check_##name(void) \ offsetof(typeof(field), \ container.item), \ sizeof(field.container.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), filter_type); \ if (ret) \ return ret; @@ -120,17 +123,18 @@ static void __always_unused ____ftrace_check_##name(void) \ #define __dynamic_array(type, item) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ - 0, is_signed_type(type), FILTER_OTHER);\ + 0, is_signed_type(type), filter_type);\ if (ret) \ return ret; #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ int \ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ { \ struct struct_name field; \ int ret; \ + int filter_type = filter; \ \ tstruct; \ \ @@ -161,7 +165,8 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #define F_printk(fmt, args...) #fmt ", " __stringify(args) #undef FTRACE_ENTRY_REG -#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, regfn)\ +#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ + regfn) \ \ struct ftrace_event_class event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ @@ -180,9 +185,9 @@ struct ftrace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; #undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ +#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print, filter) \ FTRACE_ENTRY_REG(call, struct_name, etype, \ - PARAMS(tstruct), PARAMS(print), NULL) + PARAMS(tstruct), PARAMS(print), filter, NULL) int ftrace_event_is_function(struct ftrace_event_call *call) { From 5500fa51199aee770ce53718853732600543619e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:54 +0100 Subject: [PATCH 9/9] ftrace, perf: Add filter support for function trace event Adding support to filter function trace event via perf interface. It is now possible to use filter interface in the perf tool like: perf record -e ftrace:function --filter="(ip == mm_*)" ls The filter syntax is restricted to the the 'ip' field only, and following operators are accepted '==' '!=' '||', ending up with the filter strings like: ip == f1[, ]f2 ... || ip != f3[, ]f4 ... with comma ',' or space ' ' as a function separator. If the space ' ' is used as a separator, the right side of the assignment needs to be enclosed in double quotes '"', e.g.: perf record -e ftrace:function --filter '(ip == do_execve,sys_*,ext*)' ls perf record -e ftrace:function --filter '(ip == "do_execve,sys_*,ext*")' ls perf record -e ftrace:function --filter '(ip == "do_execve sys_* ext*")' ls The '==' operator adds trace filter with same effect as would be added via set_ftrace_filter file. The '!=' operator adds trace filter with same effect as would be added via set_ftrace_notrace file. The right side of the '!=', '==' operators is list of functions or regexp. to be added to filter separated by space. The '||' operator is used for connecting multiple filter definitions together. It is possible to have more than one '==' and '!=' operators within one filter string. Link: http://lkml.kernel.org/r/1329317514-8131-8-git-send-email-jolsa@redhat.com Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 7 +- kernel/trace/ftrace.c | 6 ++ kernel/trace/trace.h | 2 - kernel/trace/trace_event_perf.c | 4 +- kernel/trace/trace_events_filter.c | 165 +++++++++++++++++++++++++++-- 5 files changed, 172 insertions(+), 12 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 64a309d2fbd6..72a6cabb4d5b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -250,6 +250,7 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, int len, int reset); void ftrace_set_global_filter(unsigned char *buf, int len, int reset); void ftrace_set_global_notrace(unsigned char *buf, int len, int reset); +void ftrace_free_filter(struct ftrace_ops *ops); int register_ftrace_command(struct ftrace_func_command *cmd); int unregister_ftrace_command(struct ftrace_func_command *cmd); @@ -380,9 +381,6 @@ extern void ftrace_enable_daemon(void); #else static inline int skip_trace(unsigned long ip) { return 0; } static inline int ftrace_force_update(void) { return 0; } -static inline void ftrace_set_filter(unsigned char *buf, int len, int reset) -{ -} static inline void ftrace_disable_daemon(void) { } static inline void ftrace_enable_daemon(void) { } static inline void ftrace_release_mod(struct module *mod) {} @@ -406,6 +404,9 @@ static inline int ftrace_text_reserved(void *start, void *end) */ #define ftrace_regex_open(ops, flag, inod, file) ({ -ENODEV; }) #define ftrace_set_early_filter(ops, buf, enable) do { } while (0) +#define ftrace_set_filter(ops, buf, len, reset) ({ -ENODEV; }) +#define ftrace_set_notrace(ops, buf, len, reset) ({ -ENODEV; }) +#define ftrace_free_filter(ops) do { } while (0) static inline ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { return -ENODEV; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f615f974d90e..867bd1dd2dd0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1186,6 +1186,12 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); } +void ftrace_free_filter(struct ftrace_ops *ops) +{ + free_ftrace_hash(ops->filter_hash); + free_ftrace_hash(ops->notrace_hash); +} + static struct ftrace_hash *alloc_ftrace_hash(int size_bits) { struct ftrace_hash *hash; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 29f93cd434a5..54faec790bc1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -776,9 +776,7 @@ struct filter_pred { u64 val; struct regex regex; unsigned short *ops; -#ifdef CONFIG_FTRACE_STARTUP_TEST struct ftrace_event_field *field; -#endif int offset; int not; int op; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index fdeeb5c49627..fee3752ae8f6 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -298,7 +298,9 @@ static int perf_ftrace_function_register(struct perf_event *event) static int perf_ftrace_function_unregister(struct perf_event *event) { struct ftrace_ops *ops = &event->ftrace_ops; - return unregister_ftrace_function(ops); + int ret = unregister_ftrace_function(ops); + ftrace_free_filter(ops); + return ret; } static void perf_ftrace_function_enable(struct perf_event *event) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 3da3d0ec3584..431dba8b7542 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -81,6 +81,7 @@ enum { FILT_ERR_TOO_MANY_PREDS, FILT_ERR_MISSING_FIELD, FILT_ERR_INVALID_FILTER, + FILT_ERR_IP_FIELD_ONLY, }; static char *err_text[] = { @@ -96,6 +97,7 @@ static char *err_text[] = { "Too many terms in predicate expression", "Missing field name and/or value", "Meaningless filter expression", + "Only 'ip' field is supported for function trace", }; struct opstack_op { @@ -991,7 +993,12 @@ static int init_pred(struct filter_parse_state *ps, fn = filter_pred_strloc; else fn = filter_pred_pchar; - } else if (!is_function_field(field)) { + } else if (is_function_field(field)) { + if (strcmp(field->name, "ip")) { + parse_error(ps, FILT_ERR_IP_FIELD_ONLY, 0); + return -EINVAL; + } + } else { if (field->is_signed) ret = strict_strtoll(pred->regex.pattern, 0, &val); else @@ -1338,10 +1345,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps, strcpy(pred.regex.pattern, operand2); pred.regex.len = strlen(pred.regex.pattern); - -#ifdef CONFIG_FTRACE_STARTUP_TEST pred.field = field; -#endif return init_pred(ps, field, &pred) ? NULL : &pred; } @@ -1954,6 +1958,148 @@ void ftrace_profile_free_filter(struct perf_event *event) __free_filter(filter); } +struct function_filter_data { + struct ftrace_ops *ops; + int first_filter; + int first_notrace; +}; + +#ifdef CONFIG_FUNCTION_TRACER +static char ** +ftrace_function_filter_re(char *buf, int len, int *count) +{ + char *str, *sep, **re; + + str = kstrndup(buf, len, GFP_KERNEL); + if (!str) + return NULL; + + /* + * The argv_split function takes white space + * as a separator, so convert ',' into spaces. + */ + while ((sep = strchr(str, ','))) + *sep = ' '; + + re = argv_split(GFP_KERNEL, str, count); + kfree(str); + return re; +} + +static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter, + int reset, char *re, int len) +{ + int ret; + + if (filter) + ret = ftrace_set_filter(ops, re, len, reset); + else + ret = ftrace_set_notrace(ops, re, len, reset); + + return ret; +} + +static int __ftrace_function_set_filter(int filter, char *buf, int len, + struct function_filter_data *data) +{ + int i, re_cnt, ret; + int *reset; + char **re; + + reset = filter ? &data->first_filter : &data->first_notrace; + + /* + * The 'ip' field could have multiple filters set, separated + * either by space or comma. We first cut the filter and apply + * all pieces separatelly. + */ + re = ftrace_function_filter_re(buf, len, &re_cnt); + if (!re) + return -EINVAL; + + for (i = 0; i < re_cnt; i++) { + ret = ftrace_function_set_regexp(data->ops, filter, *reset, + re[i], strlen(re[i])); + if (ret) + break; + + if (*reset) + *reset = 0; + } + + argv_free(re); + return ret; +} + +static int ftrace_function_check_pred(struct filter_pred *pred, int leaf) +{ + struct ftrace_event_field *field = pred->field; + + if (leaf) { + /* + * Check the leaf predicate for function trace, verify: + * - only '==' and '!=' is used + * - the 'ip' field is used + */ + if ((pred->op != OP_EQ) && (pred->op != OP_NE)) + return -EINVAL; + + if (strcmp(field->name, "ip")) + return -EINVAL; + } else { + /* + * Check the non leaf predicate for function trace, verify: + * - only '||' is used + */ + if (pred->op != OP_OR) + return -EINVAL; + } + + return 0; +} + +static int ftrace_function_set_filter_cb(enum move_type move, + struct filter_pred *pred, + int *err, void *data) +{ + /* Checking the node is valid for function trace. */ + if ((move != MOVE_DOWN) || + (pred->left != FILTER_PRED_INVALID)) { + *err = ftrace_function_check_pred(pred, 0); + } else { + *err = ftrace_function_check_pred(pred, 1); + if (*err) + return WALK_PRED_ABORT; + + *err = __ftrace_function_set_filter(pred->op == OP_EQ, + pred->regex.pattern, + pred->regex.len, + data); + } + + return (*err) ? WALK_PRED_ABORT : WALK_PRED_DEFAULT; +} + +static int ftrace_function_set_filter(struct perf_event *event, + struct event_filter *filter) +{ + struct function_filter_data data = { + .first_filter = 1, + .first_notrace = 1, + .ops = &event->ftrace_ops, + }; + + return walk_pred_tree(filter->preds, filter->root, + ftrace_function_set_filter_cb, &data); +} +#else +static int ftrace_function_set_filter(struct perf_event *event, + struct event_filter *filter) +{ + return -ENODEV; +} +#endif /* CONFIG_FUNCTION_TRACER */ + int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str) { @@ -1974,9 +2120,16 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, goto out_unlock; err = create_filter(call, filter_str, false, &filter); - if (!err) - event->filter = filter; + if (err) + goto free_filter; + + if (ftrace_event_is_function(call)) + err = ftrace_function_set_filter(event, filter); else + event->filter = filter; + +free_filter: + if (err || ftrace_event_is_function(call)) __free_filter(filter); out_unlock: