Skip to content

Commit

Permalink
Merge branch 'bpf-task-fd-query'
Browse files Browse the repository at this point in the history
Yonghong Song says:

====================
Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.

There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
to this approach. First, a bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.

This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, this command will return bpf related information
to user space. Right now it only supports tracepoint/kprobe/uprobe
perf event fd's. For such a fd, BPF_TASK_FD_QUERY will return
   . prog_id
   . tracepoint name, or
   . k[ret]probe funcname + offset or kernel addr, or
   . u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.

Patch #1 adds function perf_get_event() in kernel/events/core.c.
Patch #2 implements the bpf subcommand BPF_TASK_FD_QUERY.
Patch #3 syncs tools bpf.h header and also adds bpf_task_fd_query()
in the libbpf library for samples/selftests/bpftool to use.
Patch #4 adds ksym_get_addr() utility function.
Patch #5 adds a test in samples/bpf for querying k[ret]probes and
u[ret]probes.
Patch #6 adds a test in tools/testing/selftests/bpf for querying
raw_tracepoint and tracepoint.
Patch #7 adds a new subcommand "perf" to bpftool.

Changelogs:
  v4 -> v5:
     . return strlen(buf) instead of strlen(buf) + 1
       in the attr.buf_len. As long as user provides
       non-empty buffer, it will be filled with an empty
       string, truncated string, or full string
       based on the buffer size and the length of
       to-be-copied string.
  v3 -> v4:
     . made attr buf_len input/output. The length of the
       actual buffer is written to buf_len so user space knows
       what is actually needed. If user provides a buffer
       with length >= 1 but less than required, do partial
       copy and return -ENOSPC.
     . code simplification with put_user.
     . changed query result attach_info to fd_type.
     . add tests at selftests/bpf to test zero len, null buf and
       insufficient buf.
  v2 -> v3:
     . made perf_get_event() return perf_event pointer const.
       this was to ensure that event fields are not meddled.
     . detect whether the new BPF_TASK_FD_QUERY is supported or
       not in "bpftool perf" and warn users if it is not.
  v1 -> v2:
     . changed bpf subcommand name from BPF_PERF_EVENT_QUERY
       to BPF_TASK_FD_QUERY.
     . fixed various "bpftool perf" issues and added documentation
       and auto-completion.
====================

Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
  • Loading branch information
Alexei Starovoitov committed May 25, 2018
2 parents 31ad392 + b04df40 commit f80acbd
Show file tree
Hide file tree
Showing 23 changed files with 1,257 additions and 2 deletions.
5 changes: 5 additions & 0 deletions include/linux/perf_event.h
Original file line number Diff line number Diff line change
Expand Up @@ -868,6 +868,7 @@ extern void perf_event_exit_task(struct task_struct *child);
extern void perf_event_free_task(struct task_struct *task);
extern void perf_event_delayed_put(struct task_struct *task);
extern struct file *perf_event_get(unsigned int fd);
extern const struct perf_event *perf_get_event(struct file *file);
extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
extern void perf_event_print_debug(void);
extern void perf_pmu_disable(struct pmu *pmu);
Expand Down Expand Up @@ -1289,6 +1290,10 @@ static inline void perf_event_exit_task(struct task_struct *child) { }
static inline void perf_event_free_task(struct task_struct *task) { }
static inline void perf_event_delayed_put(struct task_struct *task) { }
static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); }
/*
 * Stub variant of perf_get_event(): ignores @file and always returns
 * ERR_PTR(-EINVAL), so callers testing the result with IS_ERR() take
 * their error path.
 * NOTE(review): presumably this is the variant built when perf events
 * support is disabled - confirm against the enclosing #ifdef, which is
 * outside this hunk.
 */
static inline const struct perf_event *perf_get_event(struct file *file)
{
return ERR_PTR(-EINVAL);
}
static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
return ERR_PTR(-EINVAL);
Expand Down
17 changes: 17 additions & 0 deletions include/linux/trace_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info);
int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
u32 *fd_type, const char **buf,
u64 *probe_offset, u64 *probe_addr);
#else
static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
{
Expand Down Expand Up @@ -504,6 +507,13 @@ static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name
{
return NULL;
}
/*
 * Stub variant of bpf_get_perf_event_info() from the #else branch of the
 * surrounding conditional: none of the output pointers are written and
 * the call always fails with -EOPNOTSUPP.
 * NOTE(review): the controlling config symbol is outside this hunk -
 * confirm which option selects this stub.
 */
static inline int bpf_get_perf_event_info(const struct perf_event *event,
u32 *prog_id, u32 *fd_type,
const char **buf, u64 *probe_offset,
u64 *probe_addr)
{
return -EOPNOTSUPP;
}
#endif

enum {
Expand Down Expand Up @@ -560,10 +570,17 @@ extern void perf_trace_del(struct perf_event *event, int flags);
#ifdef CONFIG_KPROBE_EVENTS
extern int perf_kprobe_init(struct perf_event *event, bool is_retprobe);
extern void perf_kprobe_destroy(struct perf_event *event);
extern int bpf_get_kprobe_info(const struct perf_event *event,
u32 *fd_type, const char **symbol,
u64 *probe_offset, u64 *probe_addr,
bool perf_type_tracepoint);
#endif
#ifdef CONFIG_UPROBE_EVENTS
extern int perf_uprobe_init(struct perf_event *event, bool is_retprobe);
extern void perf_uprobe_destroy(struct perf_event *event);
extern int bpf_get_uprobe_info(const struct perf_event *event,
u32 *fd_type, const char **filename,
u64 *probe_offset, bool perf_type_tracepoint);
#endif
extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
char *filter_str);
Expand Down
26 changes: 26 additions & 0 deletions include/uapi/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ enum bpf_cmd {
BPF_RAW_TRACEPOINT_OPEN,
BPF_BTF_LOAD,
BPF_BTF_GET_FD_BY_ID,
BPF_TASK_FD_QUERY,
};

enum bpf_map_type {
Expand Down Expand Up @@ -380,6 +381,22 @@ union bpf_attr {
__u32 btf_log_size;
__u32 btf_log_level;
};

struct { /* anonymous struct used by BPF_TASK_FD_QUERY command */
__u32 pid; /* input: pid */
__u32 fd; /* input: fd */
__u32 flags; /* input: flags */
__u32 buf_len; /* input/output: buf len */
__aligned_u64 buf; /* input/output:
* tp_name for tracepoint
* symbol for kprobe
* filename for uprobe
*/
__u32 prog_id; /* output: prog_id */
__u32 fd_type; /* output: BPF_FD_TYPE_* */
__u64 probe_offset; /* output: probe_offset */
__u64 probe_addr; /* output: probe_addr */
} task_fd_query;
} __attribute__((aligned(8)));

/* The description below is an attempt at providing documentation to eBPF
Expand Down Expand Up @@ -2557,4 +2574,13 @@ struct bpf_fib_lookup {
__u8 dmac[6]; /* ETH_ALEN */
};

/* Kind of attachment reported in task_fd_query.fd_type by the
 * BPF_TASK_FD_QUERY command. The trailing comment on each value names
 * the information returned alongside it.
 */
enum bpf_task_fd_type {
BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */
BPF_FD_TYPE_TRACEPOINT, /* tp name */
BPF_FD_TYPE_KPROBE, /* (symbol + offset) or addr */
BPF_FD_TYPE_KRETPROBE, /* (symbol + offset) or addr */
BPF_FD_TYPE_UPROBE, /* filename + offset */
BPF_FD_TYPE_URETPROBE, /* filename + offset */
};

#endif /* _UAPI__LINUX_BPF_H__ */
131 changes: 131 additions & 0 deletions kernel/bpf/syscall.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
#include <linux/vmalloc.h>
#include <linux/mmzone.h>
#include <linux/anon_inodes.h>
#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/license.h>
#include <linux/filter.h>
#include <linux/version.h>
Expand Down Expand Up @@ -2178,6 +2180,132 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
return btf_get_fd_by_id(attr->btf_id);
}

/*
 * Copy the result of a BPF_TASK_FD_QUERY back to user space.
 *
 * @attr:         kernel copy of the request (source of buf and buf_len)
 * @uattr:        user-space attr that receives the output fields
 * @prog_id:      value stored in task_fd_query.prog_id
 * @fd_type:      value stored in task_fd_query.fd_type
 * @buf:          string to hand back; may be NULL, treated as empty
 * @probe_offset: value stored in task_fd_query.probe_offset
 * @probe_addr:   value stored in task_fd_query.probe_addr
 *
 * task_fd_query.buf_len is always overwritten with strlen(buf) (0 for a
 * NULL buf), i.e. without the terminator. The string itself is copied
 * only when the caller supplied a non-NULL buffer of non-zero length,
 * and the user buffer is always left NUL terminated in that case.
 *
 * Returns 0 on success, -ENOSPC when the string had to be truncated
 * (the scalar outputs are still written), or -EFAULT when a user-space
 * access fails.
 */
static int bpf_task_fd_query_copy(const union bpf_attr *attr,
				  union bpf_attr __user *uattr,
				  u32 prog_id, u32 fd_type,
				  const char *buf, u64 probe_offset,
				  u64 probe_addr)
{
	char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
	char nul = '\0';
	u32 input_len;
	u32 len = 0;
	int err = 0;

	if (buf)
		len = strlen(buf);

	/* report the string length even if nothing can be copied */
	if (put_user(len, &uattr->task_fd_query.buf_len))
		return -EFAULT;

	input_len = attr->task_fd_query.buf_len;
	if (!input_len || !ubuf)
		goto copy_scalars;

	if (!len) {
		/* empty result: hand back just the terminator */
		if (put_user(nul, ubuf))
			return -EFAULT;
	} else if (len + 1 <= input_len) {
		/* whole string plus terminator fits */
		if (copy_to_user(ubuf, buf, len + 1))
			return -EFAULT;
	} else {
		/* too small: copy what fits and terminate it ourselves */
		u32 partial = input_len - 1;

		err = -ENOSPC;
		if (copy_to_user(ubuf, buf, partial))
			return -EFAULT;
		if (put_user(nul, ubuf + partial))
			return -EFAULT;
	}

copy_scalars:
	if (put_user(prog_id, &uattr->task_fd_query.prog_id))
		return -EFAULT;
	if (put_user(fd_type, &uattr->task_fd_query.fd_type))
		return -EFAULT;
	if (put_user(probe_offset, &uattr->task_fd_query.probe_offset))
		return -EFAULT;
	if (put_user(probe_addr, &uattr->task_fd_query.probe_addr))
		return -EFAULT;

	return err;
}

#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr

/*
 * BPF_TASK_FD_QUERY handler: look up file descriptor @fd of task @pid
 * and report the bpf program attached through it.
 *
 * @attr:  kernel copy of the request (pid, fd, flags, buf, buf_len)
 * @uattr: user-space attr that receives the results
 *
 * Two kinds of file are recognised: raw tracepoint fds (f_op is
 * bpf_raw_tp_fops) and perf event fds (as identified by
 * perf_get_event()).
 *
 * Returns the result of bpf_task_fd_query_copy() on a successful lookup,
 * or: -EINVAL for a malformed attr or non-zero flags, -EPERM without
 * CAP_SYS_ADMIN, -ENOENT if the task or its files table is missing,
 * -EBADF if @fd is not open in that task, -ENOTSUPP for any other kind
 * of file, or the error from bpf_get_perf_event_info().
 */
static int bpf_task_fd_query(const union bpf_attr *attr,
			     union bpf_attr __user *uattr)
{
	pid_t pid = attr->task_fd_query.pid;
	u32 fd = attr->task_fd_query.fd;
	const struct perf_event *event;
	struct files_struct *files;
	struct task_struct *task;
	struct file *file;
	int err;

	if (CHECK_ATTR(BPF_TASK_FD_QUERY))
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (attr->task_fd_query.flags != 0)
		return -EINVAL;

	task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
	if (!task)
		return -ENOENT;

	/* only the files table is needed; drop the task reference early */
	files = get_files_struct(task);
	put_task_struct(task);
	if (!files)
		return -ENOENT;

	/* take our own file reference before letting go of the table */
	spin_lock(&files->file_lock);
	file = fcheck_files(files, fd);
	if (file)
		get_file(file);
	spin_unlock(&files->file_lock);
	put_files_struct(files);

	if (!file)
		return -EBADF;

	if (file->f_op == &bpf_raw_tp_fops) {
		struct bpf_raw_tracepoint *raw_tp = file->private_data;
		struct bpf_raw_event_map *btp = raw_tp->btp;

		err = bpf_task_fd_query_copy(attr, uattr,
					     raw_tp->prog->aux->id,
					     BPF_FD_TYPE_RAW_TRACEPOINT,
					     btp->tp->name, 0, 0);
	} else {
		event = perf_get_event(file);
		if (IS_ERR(event)) {
			/* neither a raw tracepoint nor a perf event fd */
			err = -ENOTSUPP;
		} else {
			u64 probe_offset, probe_addr;
			u32 prog_id, fd_type;
			const char *buf;

			err = bpf_get_perf_event_info(event, &prog_id,
						      &fd_type, &buf,
						      &probe_offset,
						      &probe_addr);
			if (!err)
				err = bpf_task_fd_query_copy(attr, uattr,
							     prog_id, fd_type,
							     buf, probe_offset,
							     probe_addr);
		}
	}

	fput(file);
	return err;
}

SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr = {};
Expand Down Expand Up @@ -2264,6 +2392,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_BTF_GET_FD_BY_ID:
err = bpf_btf_get_fd_by_id(&attr);
break;
case BPF_TASK_FD_QUERY:
err = bpf_task_fd_query(&attr, uattr);
break;
default:
err = -EINVAL;
break;
Expand Down
8 changes: 8 additions & 0 deletions kernel/events/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -11212,6 +11212,14 @@ struct file *perf_event_get(unsigned int fd)
return file;
}

/*
 * Return the perf_event stored in @file's private_data if @file is a
 * perf event file (its f_op is perf_fops), otherwise ERR_PTR(-EINVAL).
 * The result is const so that callers cannot modify the event.
 */
const struct perf_event *perf_get_event(struct file *file)
{
	return file->f_op == &perf_fops ? file->private_data
					: ERR_PTR(-EINVAL);
}

const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
if (!event)
Expand Down
48 changes: 48 additions & 0 deletions kernel/trace/bpf_trace.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/kprobes.h>
#include <linux/syscalls.h>
#include <linux/error-injection.h>

#include "trace_probe.h"
Expand Down Expand Up @@ -1163,3 +1164,50 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
mutex_unlock(&bpf_event_mutex);
return err;
}

/*
 * Describe the bpf program attached to a tracepoint/kprobe/uprobe perf
 * event.
 *
 * @event:        perf event to inspect
 * @prog_id:      out: id of the attached bpf program
 * @fd_type:      out: one of BPF_FD_TYPE_*
 * @buf:          out: tracepoint name, kprobe symbol (may be NULL) or
 *                uprobe filename
 * @probe_offset: out: probe offset, 0 for tracepoints
 * @probe_addr:   out: kprobe address when no symbol is available,
 *                otherwise 0
 *
 * Returns 0 on success, -ENOENT if no program is attached, -EOPNOTSUPP
 * for BPF_PROG_TYPE_PERF_EVENT programs or for events that are neither
 * tracepoint, kprobe nor uprobe, or the error reported by the
 * kprobe/uprobe helper. The output arguments other than @prog_id are
 * only meaningful when 0 is returned.
 */
int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
			    u32 *fd_type, const char **buf,
			    u64 *probe_offset, u64 *probe_addr)
{
	bool is_tracepoint, is_syscall_tp;
	struct bpf_prog *prog;
	int flags, err = 0;

	prog = event->prog;
	if (!prog)
		return -ENOENT;

	/* not supporting BPF_PROG_TYPE_PERF_EVENT yet */
	if (prog->type == BPF_PROG_TYPE_PERF_EVENT)
		return -EOPNOTSUPP;

	*prog_id = prog->aux->id;
	flags = event->tp_event->flags;
	is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT;
	is_syscall_tp = is_syscall_trace_event(event->tp_event);

	if (is_tracepoint || is_syscall_tp) {
		*buf = is_tracepoint ? event->tp_event->tp->name
				     : event->tp_event->name;
		*fd_type = BPF_FD_TYPE_TRACEPOINT;
		*probe_offset = 0x0;
		*probe_addr = 0x0;
	} else {
		/* kprobe/uprobe */
		err = -EOPNOTSUPP;
#ifdef CONFIG_KPROBE_EVENTS
		if (flags & TRACE_EVENT_FL_KPROBE)
			err = bpf_get_kprobe_info(event, fd_type, buf,
						  probe_offset, probe_addr,
						  event->attr.type == PERF_TYPE_TRACEPOINT);
#endif
#ifdef CONFIG_UPROBE_EVENTS
		if (flags & TRACE_EVENT_FL_UPROBE) {
			/*
			 * bpf_get_uprobe_info() has no probe_addr argument,
			 * so clear it here. Otherwise the caller would copy
			 * its uninitialized stack variable to user space.
			 */
			*probe_addr = 0x0;
			err = bpf_get_uprobe_info(event, fd_type, buf,
						  probe_offset,
						  event->attr.type == PERF_TYPE_TRACEPOINT);
		}
#endif
	}

	return err;
}
29 changes: 29 additions & 0 deletions kernel/trace/trace_kprobe.c
Original file line number Diff line number Diff line change
Expand Up @@ -1287,6 +1287,35 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
head, NULL);
}
NOKPROBE_SYMBOL(kretprobe_perf_func);

/*
 * Report the attachment point of a kprobe/kretprobe perf event.
 *
 * @event:                perf event backed by a trace_kprobe
 * @fd_type:              out: BPF_FD_TYPE_KRETPROBE or BPF_FD_TYPE_KPROBE
 * @symbol:               out: probed symbol, or NULL if the probe has none
 * @probe_offset:         out: offset from @symbol, 0 if there is no symbol
 * @probe_addr:           out: probe address if there is no symbol, else 0
 * @perf_type_tracepoint: if true, look the trace_kprobe up by event name
 *                        and group; otherwise take it from tp_event->data
 *
 * Returns 0 on success or -EINVAL if no trace_kprobe was found.
 */
int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
			const char **symbol, u64 *probe_offset,
			u64 *probe_addr, bool perf_type_tracepoint)
{
	const char *pevent = trace_event_name(event->tp_event);
	const char *group = event->tp_event->class->system;
	struct trace_kprobe *tk;

	tk = perf_type_tracepoint ? find_trace_kprobe(pevent, group)
				  : event->tp_event->data;
	if (!tk)
		return -EINVAL;

	if (trace_kprobe_is_return(tk))
		*fd_type = BPF_FD_TYPE_KRETPROBE;
	else
		*fd_type = BPF_FD_TYPE_KPROBE;

	if (!tk->symbol) {
		/* no symbol: the probe is identified by address alone */
		*symbol = NULL;
		*probe_offset = 0;
		*probe_addr = (unsigned long)tk->rp.kp.addr;
		return 0;
	}

	*symbol = tk->symbol;
	*probe_offset = tk->rp.kp.offset;
	*probe_addr = 0;
	return 0;
}
#endif /* CONFIG_PERF_EVENTS */

/*
Expand Down
22 changes: 22 additions & 0 deletions kernel/trace/trace_uprobe.c
Original file line number Diff line number Diff line change
Expand Up @@ -1161,6 +1161,28 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
{
__uprobe_perf_func(tu, func, regs, ucb, dsize);
}

/*
 * Report the attachment point of a uprobe/uretprobe perf event.
 *
 * @event:                perf event backed by a trace_uprobe
 * @fd_type:              out: BPF_FD_TYPE_URETPROBE or BPF_FD_TYPE_UPROBE
 * @filename:             out: filename recorded in the trace_uprobe
 * @probe_offset:         out: offset recorded in the trace_uprobe
 * @perf_type_tracepoint: if true, look the trace_uprobe up by event name
 *                        and group; otherwise take it from tp_event->data
 *
 * Returns 0 on success or -EINVAL if no trace_uprobe was found.
 */
int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
			const char **filename, u64 *probe_offset,
			bool perf_type_tracepoint)
{
	const char *pevent = trace_event_name(event->tp_event);
	const char *group = event->tp_event->class->system;
	struct trace_uprobe *tu;

	tu = perf_type_tracepoint ? find_probe_event(pevent, group)
				  : event->tp_event->data;
	if (!tu)
		return -EINVAL;

	if (is_ret_probe(tu))
		*fd_type = BPF_FD_TYPE_URETPROBE;
	else
		*fd_type = BPF_FD_TYPE_UPROBE;

	*filename = tu->filename;
	*probe_offset = tu->offset;
	return 0;
}
#endif /* CONFIG_PERF_EVENTS */

static int
Expand Down
Loading

0 comments on commit f80acbd

Please sign in to comment.