Skip to content

Commit

Permalink
Merge branch 'bpf-tracepoints'
Browse files Browse the repository at this point in the history
Alexei Starovoitov says:

====================
allow bpf attach to tracepoints

Hi Steven, Peter,

v1->v2: addressed Peter's comments:
- fixed wording in patch 1, added ack
- refactored 2nd patch into 3:
2/10 remove unused __perf_addr macro which frees up
an argument in perf_trace_buf_submit
3/10 split perf_trace_buf_prepare into alloc and update parts, so that bpf
programs don't have to pay performance penalty for update of struct trace_entry
which is not going to be accessed by bpf
4/10 actual addition of bpf filter to perf tracepoint handler is now trivial
and bpf prog can be used as proper filter of tracepoints

v1 cover:
last time we discussed bpf+tracepoints it was a year ago [1] and the reason
we didn't proceed with that approach was that bpf would make arguments
arg1, arg2 to trace_xx(arg1, arg2) call to be exposed to bpf program
and that was considered unnecessary extension of abi. Back then I wanted
to avoid the cost of buffer alloc and field assign part in all
of the tracepoints, but looks like when optimized the cost is acceptable.
So this new apporach doesn't expose any new abi to bpf program.
The program is looking at tracepoint fields after they were copied
by perf_trace_xx() and described in /sys/kernel/debug/tracing/events/xxx/format
We made a tool [2] that takes arguments from /sys/.../format and works as:
$ tplist.py -v random:urandom_read
    int got_bits;
    int pool_left;
    int input_left;
Then these fields can be copy-pasted into bpf program like:
struct urandom_read {
    __u64 hidden_pad;
    int got_bits;
    int pool_left;
    int input_left;
};
and the program can use it:
SEC("tracepoint/random/urandom_read")
int bpf_prog(struct urandom_read *ctx)
{
    return ctx->pool_left > 0 ? 1 : 0;
}
This way the program can access tracepoint fields faster than
equivalent bpf+kprobe program, which is the main goal of these patches.

Patch 1-4 are simple changes in perf core side, please review.
I'd like to take the whole set via net-next tree, since the rest of
the patches might conflict with other bpf work going on in net-next
and we want to avoid cross-tree merge conflicts.
Alternatively we can put patches 1-4 into both tip and net-next.

Patch 9 is an example of access to tracepoint fields from bpf prog.
Patch 10 is a micro benchmark for bpf+kprobe vs bpf+tracepoint.

Note that for actual tracing tools the user doesn't need to
run tplist.py and copy-paste fields manually. The tools do it
automatically. Like argdist tool [3] can be used as:
$ argdist -H 't:block:block_rq_complete():u32:nr_sector'
where 'nr_sector' is name of tracepoint field taken from
/sys/kernel/debug/tracing/events/block/block_rq_complete/format
and appropriate bpf program is generated on the fly.

[1] http://thread.gmane.org/gmane.linux.kernel.api/8127/focus=8165
[2] https://github.com/iovisor/bcc/blob/master/tools/tplist.py
[3] https://github.com/iovisor/bcc/blob/master/tools/argdist.py
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Apr 8, 2016
2 parents b33b0a1 + e3edfde commit f871165
Show file tree
Hide file tree
Showing 21 changed files with 475 additions and 69 deletions.
2 changes: 2 additions & 0 deletions include/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ struct bpf_prog_type_list {
struct bpf_prog_aux {
atomic_t refcnt;
u32 used_map_cnt;
u32 max_ctx_offset;
const struct bpf_verifier_ops *ops;
struct bpf_map **used_maps;
struct bpf_prog *prog;
Expand Down Expand Up @@ -160,6 +161,7 @@ struct bpf_array {
#define MAX_TAIL_CALL_CNT 32

u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
void bpf_fd_array_map_clear(struct bpf_map *map);
bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
Expand Down
4 changes: 1 addition & 3 deletions include/linux/perf_event.h
Original file line number Diff line number Diff line change
Expand Up @@ -882,8 +882,6 @@ static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned lo
*/
static inline void perf_fetch_caller_regs(struct pt_regs *regs)
{
memset(regs, 0, sizeof(*regs));

perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
}

Expand Down Expand Up @@ -1018,7 +1016,7 @@ static inline bool perf_paranoid_kernel(void)
}

extern void perf_event_init(void);
extern void perf_tp_event(u64 addr, u64 count, void *record,
extern void perf_tp_event(u16 event_type, u64 count, void *record,
int entry_size, struct pt_regs *regs,
struct hlist_head *head, int rctx,
struct task_struct *task);
Expand Down
9 changes: 5 additions & 4 deletions include/linux/trace_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,7 @@ extern int trace_define_field(struct trace_event_call *call, const char *type,
int is_signed, int filter_type);
extern int trace_add_event_call(struct trace_event_call *call);
extern int trace_remove_event_call(struct trace_event_call *call);
extern int trace_event_get_offsets(struct trace_event_call *call);

#define is_signed_type(type) (((type)(-1)) < (type)1)

Expand Down Expand Up @@ -605,15 +606,15 @@ extern void perf_trace_del(struct perf_event *event, int flags);
extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
char *filter_str);
extern void ftrace_profile_free_filter(struct perf_event *event);
extern void *perf_trace_buf_prepare(int size, unsigned short type,
struct pt_regs **regs, int *rctxp);
void perf_trace_buf_update(void *record, u16 type);
void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);

static inline void
perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type,
u64 count, struct pt_regs *regs, void *head,
struct task_struct *task)
{
perf_tp_event(addr, count, raw_data, size, regs, head, rctx, task);
perf_tp_event(type, count, raw_data, size, regs, head, rctx, task);
}
#endif

Expand Down
23 changes: 14 additions & 9 deletions include/trace/perf.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,6 @@
#undef __get_bitmask
#define __get_bitmask(field) (char *)__get_dynamic_array(field)

#undef __perf_addr
#define __perf_addr(a) (__addr = (a))

#undef __perf_count
#define __perf_count(c) (__count = (c))

Expand All @@ -37,8 +34,9 @@ perf_trace_##call(void *__data, proto) \
struct trace_event_call *event_call = __data; \
struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
struct trace_event_raw_##call *entry; \
struct bpf_prog *prog = event_call->prog; \
struct pt_regs *__regs; \
u64 __addr = 0, __count = 1; \
u64 __count = 1; \
struct task_struct *__task = NULL; \
struct hlist_head *head; \
int __entry_size; \
Expand All @@ -48,16 +46,15 @@ perf_trace_##call(void *__data, proto) \
__data_size = trace_event_get_offsets_##call(&__data_offsets, args); \
\
head = this_cpu_ptr(event_call->perf_events); \
if (__builtin_constant_p(!__task) && !__task && \
if (!prog && __builtin_constant_p(!__task) && !__task && \
hlist_empty(head)) \
return; \
\
__entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
sizeof(u64)); \
__entry_size -= sizeof(u32); \
\
entry = perf_trace_buf_prepare(__entry_size, \
event_call->event.type, &__regs, &rctx); \
entry = perf_trace_buf_alloc(__entry_size, &__regs, &rctx); \
if (!entry) \
return; \
\
Expand All @@ -67,8 +64,16 @@ perf_trace_##call(void *__data, proto) \
\
{ assign; } \
\
perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \
__count, __regs, head, __task); \
if (prog) { \
*(struct pt_regs **)entry = __regs; \
if (!trace_call_bpf(prog, entry) || hlist_empty(head)) { \
perf_swevent_put_recursion_context(rctx); \
return; \
} \
} \
perf_trace_buf_submit(entry, __entry_size, rctx, \
event_call->event.type, __count, __regs, \
head, __task); \
}

/*
Expand Down
3 changes: 0 additions & 3 deletions include/trace/trace_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -652,9 +652,6 @@ static inline notrace int trace_event_get_offsets_##call( \
#undef TP_fast_assign
#define TP_fast_assign(args...) args

#undef __perf_addr
#define __perf_addr(a) (a)

#undef __perf_count
#define __perf_count(c) (c)

Expand Down
1 change: 1 addition & 0 deletions include/uapi/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_KPROBE,
BPF_PROG_TYPE_SCHED_CLS,
BPF_PROG_TYPE_SCHED_ACT,
BPF_PROG_TYPE_TRACEPOINT,
};

#define BPF_PSEUDO_MAP_FD 1
Expand Down
2 changes: 1 addition & 1 deletion kernel/bpf/stackmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
return ERR_PTR(err);
}

static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
{
struct pt_regs *regs = (struct pt_regs *) (long) r1;
struct bpf_map *map = (struct bpf_map *) (long) r2;
Expand Down
6 changes: 5 additions & 1 deletion kernel/bpf/verifier.c
Original file line number Diff line number Diff line change
Expand Up @@ -652,8 +652,12 @@ static int check_ctx_access(struct verifier_env *env, int off, int size,
enum bpf_access_type t)
{
if (env->prog->aux->ops->is_valid_access &&
env->prog->aux->ops->is_valid_access(off, size, t))
env->prog->aux->ops->is_valid_access(off, size, t)) {
/* remember the offset of last byte accessed in ctx */
if (env->prog->aux->max_ctx_offset < off + size)
env->prog->aux->max_ctx_offset = off + size;
return 0;
}

verbose("invalid bpf_context access off=%d size=%d\n", off, size);
return -EACCES;
Expand Down
27 changes: 21 additions & 6 deletions kernel/events/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -6725,12 +6725,13 @@ int perf_swevent_get_recursion_context(void)
}
EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);

inline void perf_swevent_put_recursion_context(int rctx)
void perf_swevent_put_recursion_context(int rctx)
{
struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);

put_recursion_context(swhash->recursion, rctx);
}
EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);

void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
Expand Down Expand Up @@ -6987,7 +6988,7 @@ static int perf_tp_event_match(struct perf_event *event,
return 1;
}

void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct pt_regs *regs, struct hlist_head *head, int rctx,
struct task_struct *task)
{
Expand All @@ -6999,9 +7000,11 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
.data = record,
};

perf_sample_data_init(&data, addr, 0);
perf_sample_data_init(&data, 0, 0);
data.raw = &raw;

perf_trace_buf_update(record, event_type);

hlist_for_each_entry_rcu(event, head, hlist_entry) {
if (perf_tp_event_match(event, &data, regs))
perf_swevent_event(event, count, &data, regs);
Expand Down Expand Up @@ -7104,6 +7107,7 @@ static void perf_event_free_filter(struct perf_event *event)

static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
bool is_kprobe, is_tracepoint;
struct bpf_prog *prog;

if (event->attr.type != PERF_TYPE_TRACEPOINT)
Expand All @@ -7112,20 +7116,31 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
if (event->tp_event->prog)
return -EEXIST;

if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE))
/* bpf programs can only be attached to u/kprobes */
is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
if (!is_kprobe && !is_tracepoint)
/* bpf programs can only be attached to u/kprobe or tracepoint */
return -EINVAL;

prog = bpf_prog_get(prog_fd);
if (IS_ERR(prog))
return PTR_ERR(prog);

if (prog->type != BPF_PROG_TYPE_KPROBE) {
if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
(is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
/* valid fd, but invalid bpf program type */
bpf_prog_put(prog);
return -EINVAL;
}

if (is_tracepoint) {
int off = trace_event_get_offsets(event->tp_event);

if (prog->aux->max_ctx_offset > off) {
bpf_prog_put(prog);
return -EACCES;
}
}
event->tp_event->prog = prog;

return 0;
Expand Down
85 changes: 83 additions & 2 deletions kernel/trace/bpf_trace.c
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
.arg5_type = ARG_CONST_STACK_SIZE,
};

static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
case BPF_FUNC_map_lookup_elem:
Expand All @@ -295,12 +295,20 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return &bpf_get_smp_processor_id_proto;
case BPF_FUNC_perf_event_read:
return &bpf_perf_event_read_proto;
default:
return NULL;
}
}

static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
case BPF_FUNC_perf_event_output:
return &bpf_perf_event_output_proto;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto;
default:
return NULL;
return tracing_func_proto(func_id);
}
}

Expand Down Expand Up @@ -332,9 +340,82 @@ static struct bpf_prog_type_list kprobe_tl = {
.type = BPF_PROG_TYPE_KPROBE,
};

static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
{
/*
* r1 points to perf tracepoint buffer where first 8 bytes are hidden
* from bpf program and contain a pointer to 'struct pt_regs'. Fetch it
* from there and call the same bpf_perf_event_output() helper
*/
u64 ctx = *(long *)r1;

return bpf_perf_event_output(ctx, r2, index, r4, size);
}

static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
.func = bpf_perf_event_output_tp,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_STACK,
.arg5_type = ARG_CONST_STACK_SIZE,
};

static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
u64 ctx = *(long *)r1;

return bpf_get_stackid(ctx, r2, r3, r4, r5);
}

static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
.func = bpf_get_stackid_tp,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
};

static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
case BPF_FUNC_perf_event_output:
return &bpf_perf_event_output_proto_tp;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto_tp;
default:
return tracing_func_proto(func_id);
}
}

static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type)
{
if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
return false;
if (type != BPF_READ)
return false;
if (off % size != 0)
return false;
return true;
}

static const struct bpf_verifier_ops tracepoint_prog_ops = {
.get_func_proto = tp_prog_func_proto,
.is_valid_access = tp_prog_is_valid_access,
};

static struct bpf_prog_type_list tracepoint_tl = {
.ops = &tracepoint_prog_ops,
.type = BPF_PROG_TYPE_TRACEPOINT,
};

static int __init register_kprobe_prog_ops(void)
{
bpf_register_prog_type(&kprobe_tl);
bpf_register_prog_type(&tracepoint_tl);
return 0;
}
late_initcall(register_kprobe_prog_ops);
Loading

0 comments on commit f871165

Please sign in to comment.