Merge branch 'bpf-get-stackid'
Alexei Starovoitov says:

====================
bpf_get_stackid() and stack_trace map

This patch set introduces a new map type to store stack traces and a
corresponding bpf_get_stackid() helper.
BPF programs can already walk the stack via an unrolled loop of
bpf_probe_read() calls, which is fine for simple analysis, but it is
inefficient and limited to fewer than 30 frames, beyond which the
programs no longer fit into MAX_BPF_STACK. With the bpf_get_stackid()
helper the programs can collect up to PERF_MAX_STACK_DEPTH frames of
both user and kernel stacks.
Using stack traces as a key in a map turned out to be very useful
for generating flame graphs, off-cpu graphs, waker and chain graphs.
Patch 3 is a simplified version of the 'offwaketime' tool, which is
described in detail here:
http://brendangregg.com/blog/2016-02-01/linux-wakeup-offwake-profiling.html
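
As an illustration of how a tracing program is expected to use the new map
type and helper, here is a minimal sketch in the style of the samples/bpf
programs; the attach point, map sizing and the bpf_helpers.h wrapper are
assumptions made for the example, not part of this patch set:

#include <uapi/linux/bpf.h>
#include <uapi/linux/ptrace.h>
#include <uapi/linux/perf_event.h>
#include "bpf_helpers.h"

/* stack traces land in this map; bpf_get_stackid() hands back a key into it */
struct bpf_map_def SEC("maps") stackmap = {
	.type = BPF_MAP_TYPE_STACK_TRACE,
	.key_size = sizeof(__u32),
	.value_size = PERF_MAX_STACK_DEPTH * sizeof(__u64),
	.max_entries = 10000,
};

SEC("kprobe/try_to_wake_up")
int waker(struct pt_regs *ctx)
{
	/* record the kernel stack of the waker and get a small integer id back */
	int stackid = bpf_get_stackid(ctx, &stackmap, BPF_F_FAST_STACK_CMP);

	if (stackid < 0)
		return 0;
	/* stackid can now serve as (part of) a key in an ordinary hash map */
	return 0;
}

char _license[] SEC("license") = "GPL";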

An earlier version of this patch set used the save_stack_trace() helper,
but 'unreliable' frames add too much noise and two equivalent
stack traces produce different 'stackid's.
Storing frames lockdep-style with MAX_STACK_TRACE_ENTRIES is
great for lockdep, but not acceptable for bpf, since the stack_trace
map needs to be freed when the user hits Ctrl-C in the tool.
The ftrace style with per_cpu(struct ftrace_stack) is great, but it is
tightly coupled to the ftrace ring buffer and carries the same 'unreliable'
noise. perf_event's perf_callchain() mechanism is also very efficient;
it only needed the minor generalization done in patch 1
to be usable by bpf stack_trace maps.
Peter, please take a look at patch 1.
If you're ok with it, I'd like to take the whole set via net-next.
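
For reference, the generalized interface from patch 1 reduces to a call of
roughly this shape; a sketch of a kernel-side caller, with the argument
values chosen purely for illustration:

#include <linux/perf_event.h>

/* Hypothetical caller: grab a kernel-only callchain for the current context.
 * init_nr = 0 starts from an empty entry; crosstask and add_mark are off.
 */
static struct perf_callchain_entry *fetch_kernel_chain(struct pt_regs *regs)
{
	return get_perf_callchain(regs, 0, true, false, false, false);
}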

Patch 1 - generalization of perf_callchain()
Patch 2 - stack_trace map implemented as a lock-less hash table without
  linked lists, to avoid taking a spinlock on insertion, which is on the
  critical path when the bpf_get_stackid() helper is called for every
  task-switch event (see the sketch after this list)
Patch 3 - offwaketime example
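
The lock-less insertion mentioned for patch 2 boils down to publishing a
per-slot bucket pointer with an atomic exchange instead of taking a
spinlock. A rough sketch of the idea follows; the types, slot layout and
the use of kfree() for recycling are illustrative, not the actual
stackmap.c code:

#include <linux/atomic.h>
#include <linux/perf_event.h>
#include <linux/slab.h>
#include <linux/types.h>

/* one bucket per hash slot; no chaining, so insertion never takes a lock */
struct stack_bucket {
	u32 hash;
	u32 nr;
	u64 ip[PERF_MAX_STACK_DEPTH];
};

static u32 insert_stack(struct stack_bucket **slots, u32 id,
			struct stack_bucket *new_bucket)
{
	struct stack_bucket *old_bucket;

	/* publish atomically; readers see either the old or the new bucket */
	old_bucket = xchg(&slots[id], new_bucket);
	if (old_bucket)
		kfree(old_bucket);	/* the real code recycles into a free list */
	return id;
}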

With the patches applied, here is 'perf report' for an artificial
'sched_bench' benchmark that does pthread_cond_wait/signal while the
'offwaketime' example runs in the background:
 16.35%  swapper      [kernel.vmlinux]    [k] intel_idle
  2.18%  sched_bench  [kernel.vmlinux]    [k] __switch_to
  2.18%  sched_bench  libpthread-2.12.so  [.] pthread_cond_signal@@GLIBC_2.3.2
  1.72%  sched_bench  libpthread-2.12.so  [.] pthread_mutex_unlock
  1.53%  sched_bench  [kernel.vmlinux]    [k] bpf_get_stackid
  1.44%  sched_bench  [kernel.vmlinux]    [k] entry_SYSCALL_64
  1.39%  sched_bench  [kernel.vmlinux]    [k] __call_rcu.constprop.73
  1.13%  sched_bench  libpthread-2.12.so  [.] pthread_mutex_lock
  1.07%  sched_bench  libpthread-2.12.so  [.] pthread_cond_wait@@GLIBC_2.3.2
  1.07%  sched_bench  [kernel.vmlinux]    [k] hash_futex
  1.05%  sched_bench  [kernel.vmlinux]    [k] do_futex
  1.05%  sched_bench  [kernel.vmlinux]    [k] get_futex_key_refs.isra.13

The hottest part of bpf_get_stackid() is the inlined jhash2, so we may
consider a faster hash in the future, but it is good enough for now.
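
For context, hashing a collected trace with jhash2 looks roughly like the
following sketch; the u64-to-u32-word length conversion follows jhash2's
interface, and the function name is illustrative:

#include <linux/jhash.h>
#include <linux/types.h>

/* jhash2() consumes u32 words, so each of the nr u64 frames contributes two */
static u32 hash_stack(u64 *ips, u32 nr)
{
	return jhash2((u32 *)ips, nr * 2, 0);
}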
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
David S. Miller committed Feb 20, 2016
2 parents 6b83d28 + a6ffe7b commit 80c804b
Showing 18 changed files with 642 additions and 30 deletions.
2 changes: 1 addition & 1 deletion arch/x86/include/asm/stacktrace.h
@@ -37,7 +37,7 @@ print_context_stack_bp(struct thread_info *tinfo,
 /* Generic stack tracer with callbacks */
 
 struct stacktrace_ops {
-	void (*address)(void *data, unsigned long address, int reliable);
+	int (*address)(void *data, unsigned long address, int reliable);
 	/* On negative return stop dumping */
 	int (*stack)(void *data, char *name);
 	walk_stack_t walk_stack;
4 changes: 2 additions & 2 deletions arch/x86/kernel/cpu/perf_event.c
@@ -2180,11 +2180,11 @@ static int backtrace_stack(void *data, char *name)
 	return 0;
 }
 
-static void backtrace_address(void *data, unsigned long addr, int reliable)
+static int backtrace_address(void *data, unsigned long addr, int reliable)
 {
 	struct perf_callchain_entry *entry = data;
 
-	perf_callchain_store(entry, addr);
+	return perf_callchain_store(entry, addr);
 }
 
 static const struct stacktrace_ops backtrace_ops = {
6 changes: 4 additions & 2 deletions arch/x86/kernel/dumpstack.c
@@ -135,7 +135,8 @@ print_context_stack_bp(struct thread_info *tinfo,
 		if (!__kernel_text_address(addr))
 			break;
 
-		ops->address(data, addr, 1);
+		if (ops->address(data, addr, 1))
+			break;
 		frame = frame->next_frame;
 		ret_addr = &frame->return_address;
 		print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
@@ -154,10 +155,11 @@ static int print_trace_stack(void *data, char *name)
 /*
  * Print one address/symbol entries per line.
  */
-static void print_trace_address(void *data, unsigned long addr, int reliable)
+static int print_trace_address(void *data, unsigned long addr, int reliable)
 {
 	touch_nmi_watchdog();
 	printk_stack_address(addr, reliable, data);
+	return 0;
 }
 
 static const struct stacktrace_ops print_trace_ops = {
18 changes: 11 additions & 7 deletions arch/x86/kernel/stacktrace.c
@@ -14,30 +14,34 @@ static int save_stack_stack(void *data, char *name)
 	return 0;
 }
 
-static void
+static int
 __save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched)
 {
 	struct stack_trace *trace = data;
 #ifdef CONFIG_FRAME_POINTER
 	if (!reliable)
-		return;
+		return 0;
 #endif
 	if (nosched && in_sched_functions(addr))
-		return;
+		return 0;
 	if (trace->skip > 0) {
 		trace->skip--;
-		return;
+		return 0;
 	}
-	if (trace->nr_entries < trace->max_entries)
+	if (trace->nr_entries < trace->max_entries) {
 		trace->entries[trace->nr_entries++] = addr;
+		return 0;
+	} else {
+		return -1; /* no more room, stop walking the stack */
+	}
 }
 
-static void save_stack_address(void *data, unsigned long addr, int reliable)
+static int save_stack_address(void *data, unsigned long addr, int reliable)
 {
 	return __save_stack_address(data, addr, reliable, false);
 }
 
-static void
+static int
 save_stack_address_nosched(void *data, unsigned long addr, int reliable)
 {
 	return __save_stack_address(data, addr, reliable, true);
3 changes: 2 additions & 1 deletion arch/x86/oprofile/backtrace.c
@@ -23,12 +23,13 @@ static int backtrace_stack(void *data, char *name)
 	return 0;
 }
 
-static void backtrace_address(void *data, unsigned long addr, int reliable)
+static int backtrace_address(void *data, unsigned long addr, int reliable)
 {
 	unsigned int *depth = data;
 
 	if ((*depth)--)
 		oprofile_add_trace(addr);
+	return 0;
 }
 
 static struct stacktrace_ops backtrace_ops = {
1 change: 1 addition & 0 deletions include/linux/bpf.h
@@ -237,6 +237,7 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
 extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
+extern const struct bpf_func_proto bpf_get_stackid_proto;
 
 /* Shared helpers among cBPF and eBPF. */
 void bpf_user_rnd_init_once(void);
13 changes: 11 additions & 2 deletions include/linux/perf_event.h
@@ -964,11 +964,20 @@ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
 
 extern void perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs);
 extern void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs);
+extern struct perf_callchain_entry *
+get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+		   bool crosstask, bool add_mark);
+extern int get_callchain_buffers(void);
+extern void put_callchain_buffers(void);
 
-static inline void perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
+static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
 {
-	if (entry->nr < PERF_MAX_STACK_DEPTH)
+	if (entry->nr < PERF_MAX_STACK_DEPTH) {
 		entry->ip[entry->nr++] = ip;
+		return 0;
+	} else {
+		return -1; /* no more room, stop walking the stack */
+	}
 }
 
 extern int sysctl_perf_event_paranoid;
21 changes: 21 additions & 0 deletions include/uapi/linux/bpf.h
@@ -83,6 +83,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_PERF_EVENT_ARRAY,
 	BPF_MAP_TYPE_PERCPU_HASH,
 	BPF_MAP_TYPE_PERCPU_ARRAY,
+	BPF_MAP_TYPE_STACK_TRACE,
 };
 
 enum bpf_prog_type {
@@ -272,6 +273,20 @@ enum bpf_func_id {
 	 */
 	BPF_FUNC_perf_event_output,
 	BPF_FUNC_skb_load_bytes,
+
+	/**
+	 * bpf_get_stackid(ctx, map, flags) - walk user or kernel stack and return id
+	 * @ctx: struct pt_regs*
+	 * @map: pointer to stack_trace map
+	 * @flags: bits 0-7 - numer of stack frames to skip
+	 *         bit 8 - collect user stack instead of kernel
+	 *         bit 9 - compare stacks by hash only
+	 *         bit 10 - if two different stacks hash into the same stackid
+	 *                  discard old
+	 *         other bits - reserved
+	 * Return: >= 0 stackid on success or negative error
+	 */
+	BPF_FUNC_get_stackid,
 	__BPF_FUNC_MAX_ID,
 };
 
@@ -294,6 +309,12 @@ enum bpf_func_id {
 /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
 #define BPF_F_TUNINFO_IPV6	(1ULL << 0)
 
+/* BPF_FUNC_get_stackid flags. */
+#define BPF_F_SKIP_FIELD_MASK	0xffULL
+#define BPF_F_USER_STACK	(1ULL << 8)
+#define BPF_F_FAST_STACK_CMP	(1ULL << 9)
+#define BPF_F_REUSE_STACKID	(1ULL << 10)
+
 /* user accessible mirror of in-kernel sk_buff.
 * new fields can only be added to the end of this structure
 */
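
The flag bits above combine with a frame-skip count in the low byte. A call
site might look like the following sketch; it assumes a surrounding program
and 'stackmap' map like the example given after the commit message, and the
attach point is illustrative:

SEC("kprobe/finish_task_switch")
int oncpu(struct pt_regs *ctx)
{
	/* skip two frames of the user stack and compare stacks by hash only */
	int id = bpf_get_stackid(ctx, &stackmap, 2 | BPF_F_USER_STACK |
				 BPF_F_FAST_STACK_CMP);

	if (id < 0)
		return 0;	/* collection failed or the map is full */
	/* ... store or count by id ... */
	return 0;
}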
3 changes: 3 additions & 0 deletions kernel/bpf/Makefile
@@ -2,3 +2,6 @@ obj-y := core.o
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o
+ifeq ($(CONFIG_PERF_EVENTS),y)
+obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
+endif