Skip to content

Commit

Permalink
Merge branch 'bpf-raw-tracepoints'
Browse files Browse the repository at this point in the history
Alexei Starovoitov says:

====================
v7->v8:
- moved 'u32 num_args' from 'struct tracepoint' into 'struct bpf_raw_event_map'
  that increases memory overhead, but can be optimized/compressed later.
  Now it's zero changes in tracepoint.[ch]

v6->v7:
- adopted Steven's bpf_raw_tp_map section approach to find tracepoint
  and corresponding bpf probe function instead of kallsyms approach.
  dropped kernel_tracepoint_find_by_name() patch

v5->v6:
- avoid changing semantics of for_each_kernel_tracepoint() function, instead
  introduce kernel_tracepoint_find_by_name() helper

v4->v5:
- adopted Daniel's fancy REPEAT macro in bpf_trace.c in patch 6

v3->v4:
- adopted Linus's CAST_TO_U64 macro to cast any integer, pointer, or small
  struct to u64. That nicely reduced the size of patch 1

v2->v3:
- with Linus's suggestion introduced generic COUNT_ARGS and CONCATENATE macros
  (or rather moved them from apparmor)
  that cleaned up patch 6
- added patch 4 to refactor trace_iwlwifi_dev_ucode_error() from 17 args to 4
  Now any tracepoint with >12 args will have build error

v1->v2:
- simplified api by combing bpf_raw_tp_open(name) + bpf_attach(prog_fd) into
  bpf_raw_tp_open(name, prog_fd) as suggested by Daniel.
  That simplifies bpf_detach as well which is now simple close() of fd.
- fixed memory leak in error path which was spotted by Daniel.
- fixed bpf_get_stackid(), bpf_perf_event_output() called from raw tracepoints
- added more tests
- fixed allyesconfig build caught by buildbot

v1:
This patch set is a different way to address the pressing need to access
task_struct pointers in sched tracepoints from bpf programs.

The first approach simply added these pointers to sched tracepoints:
https://lkml.org/lkml/2017/12/14/753
which Peter nacked.
Few options were discussed and eventually the discussion converged on
doing bpf specific tracepoint_probe_register() probe functions.
Details here:
https://lkml.org/lkml/2017/12/20/929

Patch 1 is kernel wide cleanup of pass-struct-by-value into
pass-struct-by-reference into tracepoints.

Patches 2 and 3 are minor cleanups to address allyesconfig build

Patch 4 refactor trace_iwlwifi_dev_ucode_error from 17 to 4 args

Patch 5 introduces COUNT_ARGS macro

Patch 6 introduces BPF_RAW_TRACEPOINT api.
the auto-cleanup and multiple concurrent users are must have
features of tracing api. For bpf raw tracepoints it looks like:
  // load bpf prog with BPF_PROG_TYPE_RAW_TRACEPOINT type
  prog_fd = bpf_prog_load(...);

  // receive anon_inode fd for given bpf_raw_tracepoint
  // and attach bpf program to it
  raw_tp_fd = bpf_raw_tracepoint_open("xdp_exception", prog_fd);

Ctrl-C of tracing daemon or cmdline tool will automatically
detach bpf program, unload it and unregister tracepoint probe.
More details in patch 6.

Patch 7 - trivial support in libbpf
Patches 8, 9 - user space tests

samples/bpf/test_overhead performance on 1 cpu:

tracepoint    base  kprobe+bpf tracepoint+bpf raw_tracepoint+bpf
task_rename   1.1M   769K        947K            1.0M
urandom_read  789K   697K        750K            755K
====================

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
  • Loading branch information
Daniel Borkmann committed Mar 28, 2018
2 parents 6f5c39f + 3bbe086 commit f6ef565
Show file tree
Hide file tree
Showing 30 changed files with 607 additions and 77 deletions.
2 changes: 1 addition & 1 deletion drivers/infiniband/hw/hfi1/file_ops.c
Original file line number Diff line number Diff line change
Expand Up @@ -1153,7 +1153,7 @@ static int get_ctxt_info(struct hfi1_filedata *fd, unsigned long arg, u32 len)
cinfo.sdma_ring_size = fd->cq->nentries;
cinfo.rcvegr_size = uctxt->egrbufs.rcvtid_size;

trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, cinfo);
trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, &cinfo);
if (copy_to_user((void __user *)arg, &cinfo, len))
return -EFAULT;

Expand Down
12 changes: 6 additions & 6 deletions drivers/infiniband/hw/hfi1/trace_ctxts.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ TRACE_EVENT(hfi1_uctxtdata,
TRACE_EVENT(hfi1_ctxt_info,
TP_PROTO(struct hfi1_devdata *dd, unsigned int ctxt,
unsigned int subctxt,
struct hfi1_ctxt_info cinfo),
struct hfi1_ctxt_info *cinfo),
TP_ARGS(dd, ctxt, subctxt, cinfo),
TP_STRUCT__entry(DD_DEV_ENTRY(dd)
__field(unsigned int, ctxt)
Expand All @@ -120,11 +120,11 @@ TRACE_EVENT(hfi1_ctxt_info,
TP_fast_assign(DD_DEV_ASSIGN(dd);
__entry->ctxt = ctxt;
__entry->subctxt = subctxt;
__entry->egrtids = cinfo.egrtids;
__entry->rcvhdrq_cnt = cinfo.rcvhdrq_cnt;
__entry->rcvhdrq_size = cinfo.rcvhdrq_entsize;
__entry->sdma_ring_size = cinfo.sdma_ring_size;
__entry->rcvegr_size = cinfo.rcvegr_size;
__entry->egrtids = cinfo->egrtids;
__entry->rcvhdrq_cnt = cinfo->rcvhdrq_cnt;
__entry->rcvhdrq_size = cinfo->rcvhdrq_entsize;
__entry->sdma_ring_size = cinfo->sdma_ring_size;
__entry->rcvegr_size = cinfo->rcvegr_size;
),
TP_printk("[%s] ctxt %u:%u " CINFO_FMT,
__get_str(dev),
Expand Down
7 changes: 1 addition & 6 deletions drivers/net/wireless/intel/iwlwifi/dvm/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -1651,12 +1651,7 @@ static void iwl_dump_nic_error_log(struct iwl_priv *priv)
priv->status, table.valid);
}

trace_iwlwifi_dev_ucode_error(trans->dev, table.error_id, table.tsf_low,
table.data1, table.data2, table.line,
table.blink2, table.ilink1, table.ilink2,
table.bcon_time, table.gp1, table.gp2,
table.gp3, table.ucode_ver, table.hw_ver,
0, table.brd_ver);
trace_iwlwifi_dev_ucode_error(trans->dev, &table, 0, table.brd_ver);
IWL_ERR(priv, "0x%08X | %-28s\n", table.error_id,
desc_lookup(table.error_id));
IWL_ERR(priv, "0x%08X | uPc\n", table.pc);
Expand Down
39 changes: 18 additions & 21 deletions drivers/net/wireless/intel/iwlwifi/iwl-devtrace-iwlwifi.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,14 +126,11 @@ TRACE_EVENT(iwlwifi_dev_tx,
__entry->framelen, __entry->skbaddr)
);

struct iwl_error_event_table;
TRACE_EVENT(iwlwifi_dev_ucode_error,
TP_PROTO(const struct device *dev, u32 desc, u32 tsf_low,
u32 data1, u32 data2, u32 line, u32 blink2, u32 ilink1,
u32 ilink2, u32 bcon_time, u32 gp1, u32 gp2, u32 rev_type,
u32 major, u32 minor, u32 hw_ver, u32 brd_ver),
TP_ARGS(dev, desc, tsf_low, data1, data2, line,
blink2, ilink1, ilink2, bcon_time, gp1, gp2,
rev_type, major, minor, hw_ver, brd_ver),
TP_PROTO(const struct device *dev, const struct iwl_error_event_table *table,
u32 hw_ver, u32 brd_ver),
TP_ARGS(dev, table, hw_ver, brd_ver),
TP_STRUCT__entry(
DEV_ENTRY
__field(u32, desc)
Expand All @@ -155,20 +152,20 @@ TRACE_EVENT(iwlwifi_dev_ucode_error,
),
TP_fast_assign(
DEV_ASSIGN;
__entry->desc = desc;
__entry->tsf_low = tsf_low;
__entry->data1 = data1;
__entry->data2 = data2;
__entry->line = line;
__entry->blink2 = blink2;
__entry->ilink1 = ilink1;
__entry->ilink2 = ilink2;
__entry->bcon_time = bcon_time;
__entry->gp1 = gp1;
__entry->gp2 = gp2;
__entry->rev_type = rev_type;
__entry->major = major;
__entry->minor = minor;
__entry->desc = table->error_id;
__entry->tsf_low = table->tsf_low;
__entry->data1 = table->data1;
__entry->data2 = table->data2;
__entry->line = table->line;
__entry->blink2 = table->blink2;
__entry->ilink1 = table->ilink1;
__entry->ilink2 = table->ilink2;
__entry->bcon_time = table->bcon_time;
__entry->gp1 = table->gp1;
__entry->gp2 = table->gp2;
__entry->rev_type = table->gp3;
__entry->major = table->ucode_ver;
__entry->minor = table->hw_ver;
__entry->hw_ver = hw_ver;
__entry->brd_ver = brd_ver;
),
Expand Down
1 change: 1 addition & 0 deletions drivers/net/wireless/intel/iwlwifi/iwl-devtrace.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#ifndef __CHECKER__
#include "iwl-trans.h"

#include "dvm/commands.h"
#define CREATE_TRACE_POINTS
#include "iwl-devtrace.h"

Expand Down
7 changes: 1 addition & 6 deletions drivers/net/wireless/intel/iwlwifi/mvm/utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -549,12 +549,7 @@ static void iwl_mvm_dump_lmac_error_log(struct iwl_mvm *mvm, u32 base)

IWL_ERR(mvm, "Loaded firmware version: %s\n", mvm->fw->fw_version);

trace_iwlwifi_dev_ucode_error(trans->dev, table.error_id, table.tsf_low,
table.data1, table.data2, table.data3,
table.blink2, table.ilink1,
table.ilink2, table.bcon_time, table.gp1,
table.gp2, table.fw_rev_type, table.major,
table.minor, table.hw_ver, table.brd_ver);
trace_iwlwifi_dev_ucode_error(trans->dev, &table, table.hw_ver, table.brd_ver);
IWL_ERR(mvm, "0x%08X | %-28s\n", table.error_id,
desc_lookup(table.error_id));
IWL_ERR(mvm, "0x%08X | trm_hw_status0\n", table.trm_hw_status0);
Expand Down
6 changes: 3 additions & 3 deletions drivers/net/wireless/mediatek/mt7601u/trace.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
#define REG_PR_FMT "%04x=%08x"
#define REG_PR_ARG __entry->reg, __entry->val

DECLARE_EVENT_CLASS(dev_reg_evt,
DECLARE_EVENT_CLASS(dev_reg_evtu,
TP_PROTO(struct mt7601u_dev *dev, u32 reg, u32 val),
TP_ARGS(dev, reg, val),
TP_STRUCT__entry(
Expand All @@ -51,12 +51,12 @@ DECLARE_EVENT_CLASS(dev_reg_evt,
)
);

DEFINE_EVENT(dev_reg_evt, reg_read,
DEFINE_EVENT(dev_reg_evtu, reg_read,
TP_PROTO(struct mt7601u_dev *dev, u32 reg, u32 val),
TP_ARGS(dev, reg, val)
);

DEFINE_EVENT(dev_reg_evt, reg_write,
DEFINE_EVENT(dev_reg_evtu, reg_write,
TP_PROTO(struct mt7601u_dev *dev, u32 reg, u32 val),
TP_ARGS(dev, reg, val)
);
Expand Down
10 changes: 10 additions & 0 deletions include/asm-generic/vmlinux.lds.h
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,15 @@
#define TRACE_SYSCALLS()
#endif

#ifdef CONFIG_BPF_EVENTS
#define BPF_RAW_TP() STRUCT_ALIGN(); \
VMLINUX_SYMBOL(__start__bpf_raw_tp) = .; \
KEEP(*(__bpf_raw_tp_map)) \
VMLINUX_SYMBOL(__stop__bpf_raw_tp) = .;
#else
#define BPF_RAW_TP()
#endif

#ifdef CONFIG_SERIAL_EARLYCON
#define EARLYCON_TABLE() STRUCT_ALIGN(); \
VMLINUX_SYMBOL(__earlycon_table) = .; \
Expand Down Expand Up @@ -249,6 +258,7 @@
LIKELY_PROFILE() \
BRANCH_PROFILE() \
TRACE_PRINTKS() \
BPF_RAW_TP() \
TRACEPOINT_STR()

/*
Expand Down
1 change: 1 addition & 0 deletions include/linux/bpf_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg)
BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe)
BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint)
BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint)
#endif
#ifdef CONFIG_CGROUP_BPF
BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
Expand Down
7 changes: 7 additions & 0 deletions include/linux/kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -919,6 +919,13 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
#define swap(a, b) \
do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

/* This counts to 12. Any more, it will return 13th argument. */
#define __COUNT_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _n, X...) _n
#define COUNT_ARGS(X...) __COUNT_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)

#define __CONCAT(a, b) a ## b
#define CONCATENATE(a, b) __CONCAT(a, b)

/**
* container_of - cast a member of a structure out to the containing structure
* @ptr: the pointer to the member.
Expand Down
42 changes: 42 additions & 0 deletions include/linux/trace_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -468,6 +468,9 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog);
void perf_event_detach_bpf_prog(struct perf_event *event);
int perf_event_query_prog_array(struct perf_event *event, void __user *info);
int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
#else
static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
{
Expand All @@ -487,6 +490,18 @@ perf_event_query_prog_array(struct perf_event *event, void __user *info)
{
return -EOPNOTSUPP;
}
static inline int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *p)
{
return -EOPNOTSUPP;
}
static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *p)
{
return -EOPNOTSUPP;
}
static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name)
{
return NULL;
}
#endif

enum {
Expand Down Expand Up @@ -546,6 +561,33 @@ extern void ftrace_profile_free_filter(struct perf_event *event);
void perf_trace_buf_update(void *record, u16 type);
void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);

void bpf_trace_run1(struct bpf_prog *prog, u64 arg1);
void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2);
void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2,
u64 arg3);
void bpf_trace_run4(struct bpf_prog *prog, u64 arg1, u64 arg2,
u64 arg3, u64 arg4);
void bpf_trace_run5(struct bpf_prog *prog, u64 arg1, u64 arg2,
u64 arg3, u64 arg4, u64 arg5);
void bpf_trace_run6(struct bpf_prog *prog, u64 arg1, u64 arg2,
u64 arg3, u64 arg4, u64 arg5, u64 arg6);
void bpf_trace_run7(struct bpf_prog *prog, u64 arg1, u64 arg2,
u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7);
void bpf_trace_run8(struct bpf_prog *prog, u64 arg1, u64 arg2,
u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
u64 arg8);
void bpf_trace_run9(struct bpf_prog *prog, u64 arg1, u64 arg2,
u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
u64 arg8, u64 arg9);
void bpf_trace_run10(struct bpf_prog *prog, u64 arg1, u64 arg2,
u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
u64 arg8, u64 arg9, u64 arg10);
void bpf_trace_run11(struct bpf_prog *prog, u64 arg1, u64 arg2,
u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
u64 arg8, u64 arg9, u64 arg10, u64 arg11);
void bpf_trace_run12(struct bpf_prog *prog, u64 arg1, u64 arg2,
u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
u64 arg8, u64 arg9, u64 arg10, u64 arg11, u64 arg12);
void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
struct trace_event_call *call, u64 count,
struct pt_regs *regs, struct hlist_head *head,
Expand Down
6 changes: 6 additions & 0 deletions include/linux/tracepoint-defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,10 @@ struct tracepoint {
struct tracepoint_func __rcu *funcs;
};

struct bpf_raw_event_map {
struct tracepoint *tp;
void *bpf_func;
u32 num_args;
} __aligned(32);

#endif
92 changes: 92 additions & 0 deletions include/trace/bpf_probe.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
/* SPDX-License-Identifier: GPL-2.0 */

#undef TRACE_SYSTEM_VAR

#ifdef CONFIG_BPF_EVENTS

#undef __entry
#define __entry entry

#undef __get_dynamic_array
#define __get_dynamic_array(field) \
((void *)__entry + (__entry->__data_loc_##field & 0xffff))

#undef __get_dynamic_array_len
#define __get_dynamic_array_len(field) \
((__entry->__data_loc_##field >> 16) & 0xffff)

#undef __get_str
#define __get_str(field) ((char *)__get_dynamic_array(field))

#undef __get_bitmask
#define __get_bitmask(field) (char *)__get_dynamic_array(field)

#undef __perf_count
#define __perf_count(c) (c)

#undef __perf_task
#define __perf_task(t) (t)

/* cast any integer, pointer, or small struct to u64 */
#define UINTTYPE(size) \
__typeof__(__builtin_choose_expr(size == 1, (u8)1, \
__builtin_choose_expr(size == 2, (u16)2, \
__builtin_choose_expr(size == 4, (u32)3, \
__builtin_choose_expr(size == 8, (u64)4, \
(void)5)))))
#define __CAST_TO_U64(x) ({ \
typeof(x) __src = (x); \
UINTTYPE(sizeof(x)) __dst; \
memcpy(&__dst, &__src, sizeof(__dst)); \
(u64)__dst; })

#define __CAST1(a,...) __CAST_TO_U64(a)
#define __CAST2(a,...) __CAST_TO_U64(a), __CAST1(__VA_ARGS__)
#define __CAST3(a,...) __CAST_TO_U64(a), __CAST2(__VA_ARGS__)
#define __CAST4(a,...) __CAST_TO_U64(a), __CAST3(__VA_ARGS__)
#define __CAST5(a,...) __CAST_TO_U64(a), __CAST4(__VA_ARGS__)
#define __CAST6(a,...) __CAST_TO_U64(a), __CAST5(__VA_ARGS__)
#define __CAST7(a,...) __CAST_TO_U64(a), __CAST6(__VA_ARGS__)
#define __CAST8(a,...) __CAST_TO_U64(a), __CAST7(__VA_ARGS__)
#define __CAST9(a,...) __CAST_TO_U64(a), __CAST8(__VA_ARGS__)
#define __CAST10(a,...) __CAST_TO_U64(a), __CAST9(__VA_ARGS__)
#define __CAST11(a,...) __CAST_TO_U64(a), __CAST10(__VA_ARGS__)
#define __CAST12(a,...) __CAST_TO_U64(a), __CAST11(__VA_ARGS__)
/* tracepoints with more than 12 arguments will hit build error */
#define CAST_TO_U64(...) CONCATENATE(__CAST, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__)

#undef DECLARE_EVENT_CLASS
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
static notrace void \
__bpf_trace_##call(void *__data, proto) \
{ \
struct bpf_prog *prog = __data; \
CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(prog, CAST_TO_U64(args)); \
}

/*
* This part is compiled out, it is only here as a build time check
* to make sure that if the tracepoint handling changes, the
* bpf probe will fail to compile unless it too is updated.
*/
#undef DEFINE_EVENT
#define DEFINE_EVENT(template, call, proto, args) \
static inline void bpf_test_probe_##call(void) \
{ \
check_trace_callback_type_##call(__bpf_trace_##template); \
} \
static struct bpf_raw_event_map __used \
__attribute__((section("__bpf_raw_tp_map"))) \
__bpf_trace_tp_map_##call = { \
.tp = &__tracepoint_##call, \
.bpf_func = (void *)__bpf_trace_##template, \
.num_args = COUNT_ARGS(args), \
};


#undef DEFINE_EVENT_PRINT
#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))

#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
#endif /* CONFIG_BPF_EVENTS */
1 change: 1 addition & 0 deletions include/trace/define_trace.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@
#ifdef TRACEPOINTS_ENABLED
#include <trace/trace_events.h>
#include <trace/perf.h>
#include <trace/bpf_probe.h>
#endif

#undef TRACE_EVENT
Expand Down
2 changes: 1 addition & 1 deletion include/trace/events/f2fs.h
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,7 @@ DEFINE_EVENT(f2fs__truncate_node, f2fs_truncate_node,

TRACE_EVENT(f2fs_truncate_partial_nodes,

TP_PROTO(struct inode *inode, nid_t nid[], int depth, int err),
TP_PROTO(struct inode *inode, nid_t *nid, int depth, int err),

TP_ARGS(inode, nid, depth, err),

Expand Down
Loading

0 comments on commit f6ef565

Please sign in to comment.