From ec5e099d6e941668d121ea9ca7057f4fa00830b0 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:22 -0700
Subject: [PATCH 01/10] perf: optimize perf_fetch_caller_regs

avoid memset in perf_fetch_caller_regs, since it's the critical path of all tracepoints.
It's called from perf_sw_event_sched, perf_event_task_sched_in and all of perf_trace_##call
with this_cpu_ptr(&__perf_regs[..]) which are zero initialized by perpcu init logic and
subsequent call to perf_arch_fetch_caller_regs initializes the same fields on all archs,
so we can safely drop memset from all of the above cases and move it into
perf_ftrace_function_call that calls it with stack allocated pt_regs.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/perf_event.h      | 2 --
 kernel/trace/trace_event_perf.c | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f291275ffd717..e89f7199c2239 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -882,8 +882,6 @@ static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned lo
  */
 static inline void perf_fetch_caller_regs(struct pt_regs *regs)
 {
-	memset(regs, 0, sizeof(*regs));
-
 	perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
 }
 
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 00df25fd86ef4..7a68afca8249e 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -316,6 +316,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
 
 	BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);
 
+	memset(&regs, 0, sizeof(regs));
 	perf_fetch_caller_regs(&regs);
 
 	entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);

From e93735be6a1898dd9f8de8f55254cc76309777ce Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:23 -0700
Subject: [PATCH 02/10] perf: remove unused __addr variable

now all calls to perf_trace_buf_submit() pass 0 as 4th
argument which will be repurposed in the next patch which will
change the meaning of 1st arg of perf_tp_event() to event_type

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/perf.h         | 7 ++-----
 include/trace/trace_events.h | 3 ---
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/include/trace/perf.h b/include/trace/perf.h
index 26486fcd74ce4..6f7e378690656 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -20,9 +20,6 @@
 #undef __get_bitmask
 #define __get_bitmask(field) (char *)__get_dynamic_array(field)
 
-#undef __perf_addr
-#define __perf_addr(a)	(__addr = (a))
-
 #undef __perf_count
 #define __perf_count(c)	(__count = (c))
 
@@ -38,7 +35,7 @@ perf_trace_##call(void *__data, proto)					\
 	struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
 	struct trace_event_raw_##call *entry;				\
 	struct pt_regs *__regs;						\
-	u64 __addr = 0, __count = 1;					\
+	u64 __count = 1;						\
 	struct task_struct *__task = NULL;				\
 	struct hlist_head *head;					\
 	int __entry_size;						\
@@ -67,7 +64,7 @@ perf_trace_##call(void *__data, proto)					\
 									\
 	{ assign; }							\
 									\
-	perf_trace_buf_submit(entry, __entry_size, rctx, __addr,	\
+	perf_trace_buf_submit(entry, __entry_size, rctx, 0,		\
 		__count, __regs, head, __task);				\
 }
 
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index 170c93bbdbb75..80679a9fae65e 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -652,9 +652,6 @@ static inline notrace int trace_event_get_offsets_##call(		\
 #undef TP_fast_assign
 #define TP_fast_assign(args...) args
 
-#undef __perf_addr
-#define __perf_addr(a)	(a)
-
 #undef __perf_count
 #define __perf_count(c)	(c)
 

From 1e1dcd93b468901e114f279c94a0b356adc5e7cd Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:24 -0700
Subject: [PATCH 03/10] perf: split perf_trace_buf_prepare into alloc and
 update parts

split allows to move expensive update of 'struct trace_entry' to later phase.
Repurpose unused 1st argument of perf_tp_event() to indicate event type.

While splitting use temp variable 'rctx' instead of '*rctx' to avoid
unnecessary loads done by the compiler due to -fno-strict-aliasing

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/perf_event.h      |  2 +-
 include/linux/trace_events.h    |  8 +++----
 include/trace/perf.h            |  8 +++----
 kernel/events/core.c            |  6 +++--
 kernel/trace/trace_event_perf.c | 39 +++++++++++++++++----------------
 kernel/trace/trace_kprobe.c     | 10 +++++----
 kernel/trace/trace_syscalls.c   | 13 ++++++-----
 kernel/trace/trace_uprobe.c     |  5 +++--
 8 files changed, 49 insertions(+), 42 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e89f7199c2239..eb41b535ef385 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1016,7 +1016,7 @@ static inline bool perf_paranoid_kernel(void)
 }
 
 extern void perf_event_init(void);
-extern void perf_tp_event(u64 addr, u64 count, void *record,
+extern void perf_tp_event(u16 event_type, u64 count, void *record,
 			  int entry_size, struct pt_regs *regs,
 			  struct hlist_head *head, int rctx,
 			  struct task_struct *task);
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 0810f81b6db2d..56f795e6a0930 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -605,15 +605,15 @@ extern void perf_trace_del(struct perf_event *event, int flags);
 extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
 				     char *filter_str);
 extern void ftrace_profile_free_filter(struct perf_event *event);
-extern void *perf_trace_buf_prepare(int size, unsigned short type,
-				    struct pt_regs **regs, int *rctxp);
+void perf_trace_buf_update(void *record, u16 type);
+void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);
 
 static inline void
-perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
+perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type,
 		       u64 count, struct pt_regs *regs, void *head,
 		       struct task_struct *task)
 {
-	perf_tp_event(addr, count, raw_data, size, regs, head, rctx, task);
+	perf_tp_event(type, count, raw_data, size, regs, head, rctx, task);
 }
 #endif
 
diff --git a/include/trace/perf.h b/include/trace/perf.h
index 6f7e378690656..77cd9043b7e47 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -53,8 +53,7 @@ perf_trace_##call(void *__data, proto)					\
 			     sizeof(u64));				\
 	__entry_size -= sizeof(u32);					\
 									\
-	entry = perf_trace_buf_prepare(__entry_size,			\
-			event_call->event.type, &__regs, &rctx);	\
+	entry = perf_trace_buf_alloc(__entry_size, &__regs, &rctx);	\
 	if (!entry)							\
 		return;							\
 									\
@@ -64,8 +63,9 @@ perf_trace_##call(void *__data, proto)					\
 									\
 	{ assign; }							\
 									\
-	perf_trace_buf_submit(entry, __entry_size, rctx, 0,		\
-		__count, __regs, head, __task);				\
+	perf_trace_buf_submit(entry, __entry_size, rctx,		\
+			      event_call->event.type, __count, __regs,	\
+			      head, __task);				\
 }
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index de24fbce52774..d8512883c0a08 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6987,7 +6987,7 @@ static int perf_tp_event_match(struct perf_event *event,
 	return 1;
 }
 
-void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
+void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 		   struct pt_regs *regs, struct hlist_head *head, int rctx,
 		   struct task_struct *task)
 {
@@ -6999,9 +6999,11 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
 		.data = record,
 	};
 
-	perf_sample_data_init(&data, addr, 0);
+	perf_sample_data_init(&data, 0, 0);
 	data.raw = &raw;
 
+	perf_trace_buf_update(record, event_type);
+
 	hlist_for_each_entry_rcu(event, head, hlist_entry) {
 		if (perf_tp_event_match(event, &data, regs))
 			perf_swevent_event(event, count, &data, regs);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 7a68afca8249e..5a927075977fc 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -260,42 +260,43 @@ void perf_trace_del(struct perf_event *p_event, int flags)
 	tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
 }
 
-void *perf_trace_buf_prepare(int size, unsigned short type,
-			     struct pt_regs **regs, int *rctxp)
+void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
 {
-	struct trace_entry *entry;
-	unsigned long flags;
 	char *raw_data;
-	int pc;
+	int rctx;
 
 	BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
 
 	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
-			"perf buffer not large enough"))
+		      "perf buffer not large enough"))
 		return NULL;
 
-	pc = preempt_count();
-
-	*rctxp = perf_swevent_get_recursion_context();
-	if (*rctxp < 0)
+	*rctxp = rctx = perf_swevent_get_recursion_context();
+	if (rctx < 0)
 		return NULL;
 
 	if (regs)
-		*regs = this_cpu_ptr(&__perf_regs[*rctxp]);
-	raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
+		*regs = this_cpu_ptr(&__perf_regs[rctx]);
+	raw_data = this_cpu_ptr(perf_trace_buf[rctx]);
 
 	/* zero the dead bytes from align to not leak stack to user */
 	memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
+	return raw_data;
+}
+EXPORT_SYMBOL_GPL(perf_trace_buf_alloc);
+NOKPROBE_SYMBOL(perf_trace_buf_alloc);
+
+void perf_trace_buf_update(void *record, u16 type)
+{
+	struct trace_entry *entry = record;
+	int pc = preempt_count();
+	unsigned long flags;
 
-	entry = (struct trace_entry *)raw_data;
 	local_save_flags(flags);
 	tracing_generic_entry_update(entry, flags, pc);
 	entry->type = type;
-
-	return raw_data;
 }
-EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
-NOKPROBE_SYMBOL(perf_trace_buf_prepare);
+NOKPROBE_SYMBOL(perf_trace_buf_update);
 
 #ifdef CONFIG_FUNCTION_TRACER
 static void
@@ -319,13 +320,13 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
 	memset(&regs, 0, sizeof(regs));
 	perf_fetch_caller_regs(&regs);
 
-	entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
+	entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx);
 	if (!entry)
 		return;
 
 	entry->ip = ip;
 	entry->parent_ip = parent_ip;
-	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
+	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
 			      1, &regs, head, NULL);
 
 #undef ENTRY_SIZE
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 919e0ddd8fcc3..5546eec0505f7 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1149,14 +1149,15 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 	size = ALIGN(__size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
+	entry = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!entry)
 		return;
 
 	entry->ip = (unsigned long)tk->rp.kp.addr;
 	memset(&entry[1], 0, dsize);
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
-	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
+	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
+			      head, NULL);
 }
 NOKPROBE_SYMBOL(kprobe_perf_func);
 
@@ -1184,14 +1185,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 	size = ALIGN(__size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
+	entry = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!entry)
 		return;
 
 	entry->func = (unsigned long)tk->rp.kp.addr;
 	entry->ret_ip = (unsigned long)ri->ret_addr;
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
-	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
+	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
+			      head, NULL);
 }
 NOKPROBE_SYMBOL(kretprobe_perf_func);
 #endif	/* CONFIG_PERF_EVENTS */
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index e78f364cc192f..b2b6efc083a43 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -587,15 +587,16 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	size = ALIGN(size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
-				sys_data->enter_event->event.type, NULL, &rctx);
+	rec = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!rec)
 		return;
 
 	rec->nr = syscall_nr;
 	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
 			       (unsigned long *)&rec->args);
-	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
+	perf_trace_buf_submit(rec, size, rctx,
+			      sys_data->enter_event->event.type, 1, regs,
+			      head, NULL);
 }
 
 static int perf_sysenter_enable(struct trace_event_call *call)
@@ -660,14 +661,14 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
-				sys_data->exit_event->event.type, NULL, &rctx);
+	rec = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!rec)
 		return;
 
 	rec->nr = syscall_nr;
 	rec->ret = syscall_get_return_value(current, regs);
-	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
+	perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
+			      1, regs, head, NULL);
 }
 
 static int perf_sysexit_enable(struct trace_event_call *call)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 7915142c89e4a..c53485441c88a 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1131,7 +1131,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
 	if (hlist_empty(head))
 		goto out;
 
-	entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
+	entry = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!entry)
 		goto out;
 
@@ -1152,7 +1152,8 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
 		memset(data + len, 0, size - esize - len);
 	}
 
-	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
+	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
+			      head, NULL);
  out:
 	preempt_enable();
 }

From 98b5c2c65c2951772a8fc661f50d675e450e8bce Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:25 -0700
Subject: [PATCH 04/10] perf, bpf: allow bpf programs attach to tracepoints

introduce BPF_PROG_TYPE_TRACEPOINT program type and allow it to be attached
to the perf tracepoint handler, which will copy the arguments into
the per-cpu buffer and pass it to the bpf program as its first argument.
The layout of the fields can be discovered by doing
'cat /sys/kernel/debug/tracing/events/sched/sched_switch/format'
prior to the compilation of the program with exception that first 8 bytes
are reserved and not accessible to the program. This area is used to store
the pointer to 'struct pt_regs' which some of the bpf helpers will use:
+---------+
| 8 bytes | hidden 'struct pt_regs *' (inaccessible to bpf program)
+---------+
| N bytes | static tracepoint fields defined in tracepoint/format (bpf readonly)
+---------+
| dynamic | __dynamic_array bytes of tracepoint (inaccessible to bpf yet)
+---------+

Not that all of the fields are already dumped to user space via perf ring buffer
and broken application access it directly without consulting tracepoint/format.
Same rule applies here: static tracepoint fields should only be accessed
in a format defined in tracepoint/format. The order of fields and
field sizes are not an ABI.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/perf.h     | 10 +++++++++-
 include/uapi/linux/bpf.h |  1 +
 kernel/events/core.c     | 13 +++++++++----
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/include/trace/perf.h b/include/trace/perf.h
index 77cd9043b7e47..a182306eefd7a 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -34,6 +34,7 @@ perf_trace_##call(void *__data, proto)					\
 	struct trace_event_call *event_call = __data;			\
 	struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
 	struct trace_event_raw_##call *entry;				\
+	struct bpf_prog *prog = event_call->prog;			\
 	struct pt_regs *__regs;						\
 	u64 __count = 1;						\
 	struct task_struct *__task = NULL;				\
@@ -45,7 +46,7 @@ perf_trace_##call(void *__data, proto)					\
 	__data_size = trace_event_get_offsets_##call(&__data_offsets, args); \
 									\
 	head = this_cpu_ptr(event_call->perf_events);			\
-	if (__builtin_constant_p(!__task) && !__task &&			\
+	if (!prog && __builtin_constant_p(!__task) && !__task &&	\
 				hlist_empty(head))			\
 		return;							\
 									\
@@ -63,6 +64,13 @@ perf_trace_##call(void *__data, proto)					\
 									\
 	{ assign; }							\
 									\
+	if (prog) {							\
+		*(struct pt_regs **)entry = __regs;			\
+		if (!trace_call_bpf(prog, entry) || hlist_empty(head)) { \
+			perf_swevent_put_recursion_context(rctx);	\
+			return;						\
+		}							\
+	}								\
 	perf_trace_buf_submit(entry, __entry_size, rctx,		\
 			      event_call->event.type, __count, __regs,	\
 			      head, __task);				\
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 23917bb47bf33..70eda5aeb3042 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -92,6 +92,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_KPROBE,
 	BPF_PROG_TYPE_SCHED_CLS,
 	BPF_PROG_TYPE_SCHED_ACT,
+	BPF_PROG_TYPE_TRACEPOINT,
 };
 
 #define BPF_PSEUDO_MAP_FD	1
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d8512883c0a08..e5ffe97d61662 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6725,12 +6725,13 @@ int perf_swevent_get_recursion_context(void)
 }
 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
 
-inline void perf_swevent_put_recursion_context(int rctx)
+void perf_swevent_put_recursion_context(int rctx)
 {
 	struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
 
 	put_recursion_context(swhash->recursion, rctx);
 }
+EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
 
 void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
 {
@@ -7106,6 +7107,7 @@ static void perf_event_free_filter(struct perf_event *event)
 
 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 {
+	bool is_kprobe, is_tracepoint;
 	struct bpf_prog *prog;
 
 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
@@ -7114,15 +7116,18 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 	if (event->tp_event->prog)
 		return -EEXIST;
 
-	if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE))
-		/* bpf programs can only be attached to u/kprobes */
+	is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
+	is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
+	if (!is_kprobe && !is_tracepoint)
+		/* bpf programs can only be attached to u/kprobe or tracepoint */
 		return -EINVAL;
 
 	prog = bpf_prog_get(prog_fd);
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
 
-	if (prog->type != BPF_PROG_TYPE_KPROBE) {
+	if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
+	    (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
 		/* valid fd, but invalid bpf program type */
 		bpf_prog_put(prog);
 		return -EINVAL;

From 9fd82b610ba3351f05a59c3e9117cfefe82f7751 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:26 -0700
Subject: [PATCH 05/10] bpf: register BPF_PROG_TYPE_TRACEPOINT program type

register tracepoint bpf program type and let it call the same set
of helper functions as BPF_PROG_TYPE_KPROBE

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/bpf_trace.c | 45 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 43 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3e4ffb3ace5fa..3e5ebe3254d28 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -268,7 +268,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
 	.arg5_type	= ARG_CONST_STACK_SIZE,
 };
 
-static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
+static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
 	case BPF_FUNC_map_lookup_elem:
@@ -295,12 +295,20 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_get_smp_processor_id_proto;
 	case BPF_FUNC_perf_event_read:
 		return &bpf_perf_event_read_proto;
+	default:
+		return NULL;
+	}
+}
+
+static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
 	case BPF_FUNC_perf_event_output:
 		return &bpf_perf_event_output_proto;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto;
 	default:
-		return NULL;
+		return tracing_func_proto(func_id);
 	}
 }
 
@@ -332,9 +340,42 @@ static struct bpf_prog_type_list kprobe_tl = {
 	.type	= BPF_PROG_TYPE_KPROBE,
 };
 
+static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_perf_event_output:
+	case BPF_FUNC_get_stackid:
+		return NULL;
+	default:
+		return tracing_func_proto(func_id);
+	}
+}
+
+static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+	if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
+		return false;
+	if (type != BPF_READ)
+		return false;
+	if (off % size != 0)
+		return false;
+	return true;
+}
+
+static const struct bpf_verifier_ops tracepoint_prog_ops = {
+	.get_func_proto  = tp_prog_func_proto,
+	.is_valid_access = tp_prog_is_valid_access,
+};
+
+static struct bpf_prog_type_list tracepoint_tl = {
+	.ops	= &tracepoint_prog_ops,
+	.type	= BPF_PROG_TYPE_TRACEPOINT,
+};
+
 static int __init register_kprobe_prog_ops(void)
 {
 	bpf_register_prog_type(&kprobe_tl);
+	bpf_register_prog_type(&tracepoint_tl);
 	return 0;
 }
 late_initcall(register_kprobe_prog_ops);

From 9940d67c93b5bb7ddcf862b41b1847cb728186c4 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:27 -0700
Subject: [PATCH 06/10] bpf: support bpf_get_stackid() and
 bpf_perf_event_output() in tracepoint programs

needs two wrapper functions to fetch 'struct pt_regs *' to convert
tracepoint bpf context into kprobe bpf context to reuse existing
helper functions

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  1 +
 kernel/bpf/stackmap.c    |  2 +-
 kernel/trace/bpf_trace.c | 42 +++++++++++++++++++++++++++++++++++++++-
 3 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 21ee41b92e8aa..198f6ace70ecf 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -160,6 +160,7 @@ struct bpf_array {
 #define MAX_TAIL_CALL_CNT 32
 
 u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
+u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 void bpf_fd_array_map_clear(struct bpf_map *map);
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 499d9e933f8e5..35114725cf303 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -116,7 +116,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	return ERR_PTR(err);
 }
 
-static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
+u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 {
 	struct pt_regs *regs = (struct pt_regs *) (long) r1;
 	struct bpf_map *map = (struct bpf_map *) (long) r2;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3e5ebe3254d28..413ec56141801 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -340,12 +340,52 @@ static struct bpf_prog_type_list kprobe_tl = {
 	.type	= BPF_PROG_TYPE_KPROBE,
 };
 
+static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
+{
+	/*
+	 * r1 points to perf tracepoint buffer where first 8 bytes are hidden
+	 * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it
+	 * from there and call the same bpf_perf_event_output() helper
+	 */
+	u64 ctx = *(long *)r1;
+
+	return bpf_perf_event_output(ctx, r2, index, r4, size);
+}
+
+static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
+	.func		= bpf_perf_event_output_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_STACK,
+	.arg5_type	= ARG_CONST_STACK_SIZE,
+};
+
+static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	u64 ctx = *(long *)r1;
+
+	return bpf_get_stackid(ctx, r2, r3, r4, r5);
+}
+
+static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
+	.func		= bpf_get_stackid_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
 	case BPF_FUNC_perf_event_output:
+		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
-		return NULL;
+		return &bpf_get_stackid_proto_tp;
 	default:
 		return tracing_func_proto(func_id);
 	}

From 32bbe0078afe86a8bf4c67c6b3477781b15e94dc Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:28 -0700
Subject: [PATCH 07/10] bpf: sanitize bpf tracepoint access

during bpf program loading remember the last byte of ctx access
and at the time of attaching the program to tracepoint check that
the program doesn't access bytes beyond defined in tracepoint fields

This also disallows access to __dynamic_array fields, but can be
relaxed in the future.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h          |  1 +
 include/linux/trace_events.h |  1 +
 kernel/bpf/verifier.c        |  6 +++++-
 kernel/events/core.c         |  8 ++++++++
 kernel/trace/trace_events.c  | 18 ++++++++++++++++++
 5 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 198f6ace70ecf..b2365a6eba3dc 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -131,6 +131,7 @@ struct bpf_prog_type_list {
 struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
+	u32 max_ctx_offset;
 	const struct bpf_verifier_ops *ops;
 	struct bpf_map **used_maps;
 	struct bpf_prog *prog;
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 56f795e6a0930..fe6441203b59f 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -569,6 +569,7 @@ extern int trace_define_field(struct trace_event_call *call, const char *type,
 			      int is_signed, int filter_type);
 extern int trace_add_event_call(struct trace_event_call *call);
 extern int trace_remove_event_call(struct trace_event_call *call);
+extern int trace_event_get_offsets(struct trace_event_call *call);
 
 #define is_signed_type(type)	(((type)(-1)) < (type)1)
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2e08f8e9b771f..58792fed5678b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -652,8 +652,12 @@ static int check_ctx_access(struct verifier_env *env, int off, int size,
 			    enum bpf_access_type t)
 {
 	if (env->prog->aux->ops->is_valid_access &&
-	    env->prog->aux->ops->is_valid_access(off, size, t))
+	    env->prog->aux->ops->is_valid_access(off, size, t)) {
+		/* remember the offset of last byte accessed in ctx */
+		if (env->prog->aux->max_ctx_offset < off + size)
+			env->prog->aux->max_ctx_offset = off + size;
 		return 0;
+	}
 
 	verbose("invalid bpf_context access off=%d size=%d\n", off, size);
 	return -EACCES;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e5ffe97d61662..9a01019ff7c83 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7133,6 +7133,14 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 		return -EINVAL;
 	}
 
+	if (is_tracepoint) {
+		int off = trace_event_get_offsets(event->tp_event);
+
+		if (prog->aux->max_ctx_offset > off) {
+			bpf_prog_put(prog);
+			return -EACCES;
+		}
+	}
 	event->tp_event->prog = prog;
 
 	return 0;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 05ddc0820771e..ced963049e0aa 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -204,6 +204,24 @@ static void trace_destroy_fields(struct trace_event_call *call)
 	}
 }
 
+/*
+ * run-time version of trace_event_get_offsets_<call>() that returns the last
+ * accessible offset of trace fields excluding __dynamic_array bytes
+ */
+int trace_event_get_offsets(struct trace_event_call *call)
+{
+	struct ftrace_event_field *tail;
+	struct list_head *head;
+
+	head = trace_get_fields(call);
+	/*
+	 * head->next points to the last field with the largest offset,
+	 * since it was added last by trace_define_field()
+	 */
+	tail = list_first_entry(head, struct ftrace_event_field, link);
+	return tail->offset + tail->size;
+}
+
 int trace_event_raw_init(struct trace_event_call *call)
 {
 	int id;

From c07660409ec954403776200cec1dd04b2db851f8 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:29 -0700
Subject: [PATCH 08/10] samples/bpf: add tracepoint support to bpf loader

Recognize "tracepoint/" section name prefix and attach the program
to that tracepoint.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 samples/bpf/bpf_load.c | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 58f86bd11b3d2..022af71c2bb5c 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -49,6 +49,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 	bool is_socket = strncmp(event, "socket", 6) == 0;
 	bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
 	bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
+	bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;
 	enum bpf_prog_type prog_type;
 	char buf[256];
 	int fd, efd, err, id;
@@ -63,6 +64,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 		prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
 	} else if (is_kprobe || is_kretprobe) {
 		prog_type = BPF_PROG_TYPE_KPROBE;
+	} else if (is_tracepoint) {
+		prog_type = BPF_PROG_TYPE_TRACEPOINT;
 	} else {
 		printf("Unknown event '%s'\n", event);
 		return -1;
@@ -111,12 +114,23 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 			       event, strerror(errno));
 			return -1;
 		}
-	}
 
-	strcpy(buf, DEBUGFS);
-	strcat(buf, "events/kprobes/");
-	strcat(buf, event);
-	strcat(buf, "/id");
+		strcpy(buf, DEBUGFS);
+		strcat(buf, "events/kprobes/");
+		strcat(buf, event);
+		strcat(buf, "/id");
+	} else if (is_tracepoint) {
+		event += 11;
+
+		if (*event == 0) {
+			printf("event name cannot be empty\n");
+			return -1;
+		}
+		strcpy(buf, DEBUGFS);
+		strcat(buf, "events/");
+		strcat(buf, event);
+		strcat(buf, "/id");
+	}
 
 	efd = open(buf, O_RDONLY, 0);
 	if (efd < 0) {
@@ -304,6 +318,7 @@ int load_bpf_file(char *path)
 
 			if (memcmp(shname_prog, "kprobe/", 7) == 0 ||
 			    memcmp(shname_prog, "kretprobe/", 10) == 0 ||
+			    memcmp(shname_prog, "tracepoint/", 11) == 0 ||
 			    memcmp(shname_prog, "socket", 6) == 0)
 				load_and_attach(shname_prog, insns, data_prog->d_size);
 		}
@@ -320,6 +335,7 @@ int load_bpf_file(char *path)
 
 		if (memcmp(shname, "kprobe/", 7) == 0 ||
 		    memcmp(shname, "kretprobe/", 10) == 0 ||
+		    memcmp(shname, "tracepoint/", 11) == 0 ||
 		    memcmp(shname, "socket", 6) == 0)
 			load_and_attach(shname, data->d_buf, data->d_size);
 	}

From 3c9b16448cf6924c203e3c01696c87fcbfb71fc6 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:30 -0700
Subject: [PATCH 09/10] samples/bpf: tracepoint example

modify offwaketime to work with sched/sched_switch tracepoint
instead of kprobe into finish_task_switch

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 samples/bpf/offwaketime_kern.c | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/samples/bpf/offwaketime_kern.c b/samples/bpf/offwaketime_kern.c
index c0aa5a9b9c48c..983629a31c79f 100644
--- a/samples/bpf/offwaketime_kern.c
+++ b/samples/bpf/offwaketime_kern.c
@@ -73,7 +73,7 @@ int waker(struct pt_regs *ctx)
 	return 0;
 }
 
-static inline int update_counts(struct pt_regs *ctx, u32 pid, u64 delta)
+static inline int update_counts(void *ctx, u32 pid, u64 delta)
 {
 	struct key_t key = {};
 	struct wokeby_t *woke;
@@ -100,15 +100,33 @@ static inline int update_counts(struct pt_regs *ctx, u32 pid, u64 delta)
 	return 0;
 }
 
+#if 1
+/* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */
+struct sched_switch_args {
+	unsigned long long pad;
+	char prev_comm[16];
+	int prev_pid;
+	int prev_prio;
+	long long prev_state;
+	char next_comm[16];
+	int next_pid;
+	int next_prio;
+};
+SEC("tracepoint/sched/sched_switch")
+int oncpu(struct sched_switch_args *ctx)
+{
+	/* record previous thread sleep time */
+	u32 pid = ctx->prev_pid;
+#else
 SEC("kprobe/finish_task_switch")
 int oncpu(struct pt_regs *ctx)
 {
 	struct task_struct *p = (void *) PT_REGS_PARM1(ctx);
+	/* record previous thread sleep time */
+	u32 pid = _(p->pid);
+#endif
 	u64 delta, ts, *tsp;
-	u32 pid;
 
-	/* record previous thread sleep time */
-	pid = _(p->pid);
 	ts = bpf_ktime_get_ns();
 	bpf_map_update_elem(&start, &pid, &ts, BPF_ANY);
 

From e3edfdec04d43aa6276db639d3721e073161d2c2 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:31 -0700
Subject: [PATCH 10/10] samples/bpf: add tracepoint vs kprobe performance tests

the first microbenchmark does
fd=open("/proc/self/comm");
for() {
  write(fd, "test");
}
and on 4 cpus in parallel:
                                      writes per sec
base (no tracepoints, no kprobes)         930k
with kprobe at __set_task_comm()          420k
with tracepoint at task:task_rename       730k

For kprobe + full bpf program manully fetches oldcomm, newcomm via bpf_probe_read.
For tracepint bpf program does nothing, since arguments are copied by tracepoint.

2nd microbenchmark does:
fd=open("/dev/urandom");
for() {
  read(fd, buf);
}
and on 4 cpus in parallel:
                                       reads per sec
base (no tracepoints, no kprobes)         300k
with kprobe at urandom_read()             279k
with tracepoint at random:urandom_read    290k

bpf progs attached to kprobe and tracepoint are noop.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 samples/bpf/Makefile                    |   5 +
 samples/bpf/test_overhead_kprobe_kern.c |  41 ++++++
 samples/bpf/test_overhead_tp_kern.c     |  36 ++++++
 samples/bpf/test_overhead_user.c        | 162 ++++++++++++++++++++++++
 4 files changed, 244 insertions(+)
 create mode 100644 samples/bpf/test_overhead_kprobe_kern.c
 create mode 100644 samples/bpf/test_overhead_tp_kern.c
 create mode 100644 samples/bpf/test_overhead_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 502c9fc8db85d..9959771bf8088 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -19,6 +19,7 @@ hostprogs-y += lathist
 hostprogs-y += offwaketime
 hostprogs-y += spintest
 hostprogs-y += map_perf_test
+hostprogs-y += test_overhead
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
@@ -38,6 +39,7 @@ lathist-objs := bpf_load.o libbpf.o lathist_user.o
 offwaketime-objs := bpf_load.o libbpf.o offwaketime_user.o
 spintest-objs := bpf_load.o libbpf.o spintest_user.o
 map_perf_test-objs := bpf_load.o libbpf.o map_perf_test_user.o
+test_overhead-objs := bpf_load.o libbpf.o test_overhead_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -56,6 +58,8 @@ always += lathist_kern.o
 always += offwaketime_kern.o
 always += spintest_kern.o
 always += map_perf_test_kern.o
+always += test_overhead_tp_kern.o
+always += test_overhead_kprobe_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -75,6 +79,7 @@ HOSTLOADLIBES_lathist += -lelf
 HOSTLOADLIBES_offwaketime += -lelf
 HOSTLOADLIBES_spintest += -lelf
 HOSTLOADLIBES_map_perf_test += -lelf -lrt
+HOSTLOADLIBES_test_overhead += -lelf -lrt
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/test_overhead_kprobe_kern.c b/samples/bpf/test_overhead_kprobe_kern.c
new file mode 100644
index 0000000000000..468a66a92ef9e
--- /dev/null
+++ b/samples/bpf/test_overhead_kprobe_kern.c
@@ -0,0 +1,41 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/version.h>
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;})
+
+SEC("kprobe/__set_task_comm")
+int prog(struct pt_regs *ctx)
+{
+	struct signal_struct *signal;
+	struct task_struct *tsk;
+	char oldcomm[16] = {};
+	char newcomm[16] = {};
+	u16 oom_score_adj;
+	u32 pid;
+
+	tsk = (void *)PT_REGS_PARM1(ctx);
+
+	pid = _(tsk->pid);
+	bpf_probe_read(oldcomm, sizeof(oldcomm), &tsk->comm);
+	bpf_probe_read(newcomm, sizeof(newcomm), (void *)PT_REGS_PARM2(ctx));
+	signal = _(tsk->signal);
+	oom_score_adj = _(signal->oom_score_adj);
+	return 0;
+}
+
+SEC("kprobe/urandom_read")
+int prog2(struct pt_regs *ctx)
+{
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/test_overhead_tp_kern.c b/samples/bpf/test_overhead_tp_kern.c
new file mode 100644
index 0000000000000..38f5c0b9da9f4
--- /dev/null
+++ b/samples/bpf/test_overhead_tp_kern.c
@@ -0,0 +1,36 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+/* from /sys/kernel/debug/tracing/events/task/task_rename/format */
+struct task_rename {
+	__u64 pad;
+	__u32 pid;
+	char oldcomm[16];
+	char newcomm[16];
+	__u16 oom_score_adj;
+};
+SEC("tracepoint/task/task_rename")
+int prog(struct task_rename *ctx)
+{
+	return 0;
+}
+
+/* from /sys/kernel/debug/tracing/events/random/urandom_read/format */
+struct urandom_read {
+	__u64 pad;
+	int got_bits;
+	int pool_left;
+	int input_left;
+};
+SEC("tracepoint/random/urandom_read")
+int prog2(struct urandom_read *ctx)
+{
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/test_overhead_user.c b/samples/bpf/test_overhead_user.c
new file mode 100644
index 0000000000000..d291167fd3c78
--- /dev/null
+++ b/samples/bpf/test_overhead_user.c
@@ -0,0 +1,162 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define _GNU_SOURCE
+#include <sched.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <asm/unistd.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <assert.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include <string.h>
+#include <time.h>
+#include <sys/resource.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define MAX_CNT 1000000
+
+static __u64 time_get_ns(void)
+{
+	struct timespec ts;
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	return ts.tv_sec * 1000000000ull + ts.tv_nsec;
+}
+
+static void test_task_rename(int cpu)
+{
+	__u64 start_time;
+	char buf[] = "test\n";
+	int i, fd;
+
+	fd = open("/proc/self/comm", O_WRONLY|O_TRUNC);
+	if (fd < 0) {
+		printf("couldn't open /proc\n");
+		exit(1);
+	}
+	start_time = time_get_ns();
+	for (i = 0; i < MAX_CNT; i++)
+		write(fd, buf, sizeof(buf));
+	printf("task_rename:%d: %lld events per sec\n",
+	       cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
+	close(fd);
+}
+
+static void test_urandom_read(int cpu)
+{
+	__u64 start_time;
+	char buf[4];
+	int i, fd;
+
+	fd = open("/dev/urandom", O_RDONLY);
+	if (fd < 0) {
+		printf("couldn't open /dev/urandom\n");
+		exit(1);
+	}
+	start_time = time_get_ns();
+	for (i = 0; i < MAX_CNT; i++)
+		read(fd, buf, sizeof(buf));
+	printf("urandom_read:%d: %lld events per sec\n",
+	       cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
+	close(fd);
+}
+
+static void loop(int cpu, int flags)
+{
+	cpu_set_t cpuset;
+
+	CPU_ZERO(&cpuset);
+	CPU_SET(cpu, &cpuset);
+	sched_setaffinity(0, sizeof(cpuset), &cpuset);
+
+	if (flags & 1)
+		test_task_rename(cpu);
+	if (flags & 2)
+		test_urandom_read(cpu);
+}
+
+static void run_perf_test(int tasks, int flags)
+{
+	pid_t pid[tasks];
+	int i;
+
+	for (i = 0; i < tasks; i++) {
+		pid[i] = fork();
+		if (pid[i] == 0) {
+			loop(i, flags);
+			exit(0);
+		} else if (pid[i] == -1) {
+			printf("couldn't spawn #%d process\n", i);
+			exit(1);
+		}
+	}
+	for (i = 0; i < tasks; i++) {
+		int status;
+
+		assert(waitpid(pid[i], &status, 0) == pid[i]);
+		assert(status == 0);
+	}
+}
+
+static void unload_progs(void)
+{
+	close(prog_fd[0]);
+	close(prog_fd[1]);
+	close(event_fd[0]);
+	close(event_fd[1]);
+}
+
+int main(int argc, char **argv)
+{
+	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+	char filename[256];
+	int num_cpu = 8;
+	int test_flags = ~0;
+
+	setrlimit(RLIMIT_MEMLOCK, &r);
+
+	if (argc > 1)
+		test_flags = atoi(argv[1]) ? : test_flags;
+	if (argc > 2)
+		num_cpu = atoi(argv[2]) ? : num_cpu;
+
+	if (test_flags & 0x3) {
+		printf("BASE\n");
+		run_perf_test(num_cpu, test_flags);
+	}
+
+	if (test_flags & 0xC) {
+		snprintf(filename, sizeof(filename),
+			 "%s_kprobe_kern.o", argv[0]);
+		if (load_bpf_file(filename)) {
+			printf("%s", bpf_log_buf);
+			return 1;
+		}
+		printf("w/KPROBE\n");
+		run_perf_test(num_cpu, test_flags >> 2);
+		unload_progs();
+	}
+
+	if (test_flags & 0x30) {
+		snprintf(filename, sizeof(filename),
+			 "%s_tp_kern.o", argv[0]);
+		if (load_bpf_file(filename)) {
+			printf("%s", bpf_log_buf);
+			return 1;
+		}
+		printf("w/TRACEPOINT\n");
+		run_perf_test(num_cpu, test_flags >> 4);
+		unload_progs();
+	}
+
+	return 0;
+}