Merge branch 'bpf-sleepable'

Alexei Starovoitov says:

====================
v2->v3:
- switched to a minimal allowlist approach. Essentially that means that syscall
  entry, a few btrfs allow_error_inject functions, should_fail_bio(), and two LSM
  hooks (file_mprotect and bprm_committed_creds) are the only hooks that allow
  attaching of sleepable BPF programs. Once a comprehensive analysis of LSM hooks
  is done, this allowlist will be extended.
- added patch 1 that fixes the prototypes of two mm functions so that they work
  reliably with error injection. It's also necessary for the resolve_btfids tool
  to recognize these two funcs, but that's secondary.

v1->v2:
- split fmod_ret fix into separate patch
- added denylist

v1:
This patch set introduces minimal viable support for sleepable bpf programs.
For now only fentry/fexit/fmod_ret and lsm progs can be sleepable.
Only array and pre-allocated hash and lru maps are allowed.

Here is the 'perf report' difference between sleepable with srcu,
non-sleepable, and sleepable with rcu_trace:
   3.86%  bench     [k] __srcu_read_unlock
   3.22%  bench     [k] __srcu_read_lock
   0.92%  bench     [k] bpf_prog_740d4210cdcd99a3_bench_trigger_fentry_sleep
   0.50%  bench     [k] bpf_trampoline_10297
   0.26%  bench     [k] __bpf_prog_exit_sleepable
   0.21%  bench     [k] __bpf_prog_enter_sleepable
vs
   0.88%  bench     [k] bpf_prog_740d4210cdcd99a3_bench_trigger_fentry
   0.84%  bench     [k] bpf_trampoline_10297
   0.13%  bench     [k] __bpf_prog_enter
   0.12%  bench     [k] __bpf_prog_exit
vs
   0.79%  bench     [k] bpf_prog_740d4210cdcd99a3_bench_trigger_fentry_sleep
   0.72%  bench     [k] bpf_trampoline_10381
   0.31%  bench     [k] __bpf_prog_exit_sleepable
   0.29%  bench     [k] __bpf_prog_enter_sleepable

Sleepable program invocation overhead is only marginally higher than
non-sleepable thanks to rcu_trace; the srcu approach is much slower.
====================

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Daniel Borkmann committed Aug 28, 2020
2 parents d557ea3 + e68a144 commit 10496f2
Showing 20 changed files with 331 additions and 33 deletions.
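
Before the diff itself, here is what the new API looks like from the BPF side.
A sleepable program is an ordinary tracing/LSM program loaded with
BPF_F_SLEEPABLE; a minimal sketch against one of the allowlisted LSM hooks,
assuming libbpf's "lsm.s/" sleepable section convention used by the selftests
in this series (identifiers here are illustrative, not taken from the patches):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

/* The ".s" in the section name tells libbpf to pass BPF_F_SLEEPABLE on load. */
SEC("lsm.s/file_mprotect")
int BPF_PROG(mprotect_audit, struct vm_area_struct *vma,
             unsigned long reqprot, unsigned long prot, int ret)
{
        char first_bytes[8];

        /* bpf_copy_from_user() may fault and sleep; it is only available
         * to sleepable programs.
         */
        bpf_copy_from_user(first_bytes, sizeof(first_bytes),
                           (void *)vma->vm_start);
        return ret;
}
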
32 changes: 21 additions & 11 deletions arch/x86/net/bpf_jit_comp.c
@@ -1379,10 +1379,15 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
 	u8 *prog = *pprog;
 	int cnt = 0;
 
-	if (emit_call(&prog, __bpf_prog_enter, prog))
-		return -EINVAL;
-	/* remember prog start time returned by __bpf_prog_enter */
-	emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
+	if (p->aux->sleepable) {
+		if (emit_call(&prog, __bpf_prog_enter_sleepable, prog))
+			return -EINVAL;
+	} else {
+		if (emit_call(&prog, __bpf_prog_enter, prog))
+			return -EINVAL;
+		/* remember prog start time returned by __bpf_prog_enter */
+		emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
+	}
 
 	/* arg1: lea rdi, [rbp - stack_size] */
 	EMIT4(0x48, 0x8D, 0x7D, -stack_size);
@@ -1402,13 +1407,18 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
 	if (mod_ret)
 		emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
 
-	/* arg1: mov rdi, progs[i] */
-	emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32,
-		       (u32) (long) p);
-	/* arg2: mov rsi, rbx <- start time in nsec */
-	emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
-	if (emit_call(&prog, __bpf_prog_exit, prog))
-		return -EINVAL;
+	if (p->aux->sleepable) {
+		if (emit_call(&prog, __bpf_prog_exit_sleepable, prog))
+			return -EINVAL;
+	} else {
+		/* arg1: mov rdi, progs[i] */
+		emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32,
+			       (u32) (long) p);
+		/* arg2: mov rsi, rbx <- start time in nsec */
+		emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
+		if (emit_call(&prog, __bpf_prog_exit, prog))
+			return -EINVAL;
+	}
 
 	*pprog = prog;
 	return 0;
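
The net effect of the two hunks above, sketched as the approximate instruction
sequence the trampoline emits around each program (illustrative pseudo-assembly,
not literal JIT output):

/*
 * non-sleepable prog:                      sleepable prog:
 *
 *   call __bpf_prog_enter                    call __bpf_prog_enter_sleepable
 *   mov  rbx, rax      ; save start time     ; (no start time is recorded)
 *   lea  rdi, [rbp - stack_size]             lea  rdi, [rbp - stack_size]
 *   call prog->bpf_func                      call prog->bpf_func
 *   mov  rdi, progs[i]                       call __bpf_prog_exit_sleepable
 *   mov  rsi, rbx      ; start time
 *   call __bpf_prog_exit
 */
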
4 changes: 4 additions & 0 deletions include/linux/bpf.h
@@ -539,6 +539,8 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end,
 /* these two functions are called from generated trampoline */
 u64 notrace __bpf_prog_enter(void);
 void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start);
+void notrace __bpf_prog_enter_sleepable(void);
+void notrace __bpf_prog_exit_sleepable(void);
 
 struct bpf_ksym {
 	unsigned long start;
@@ -734,6 +736,7 @@ struct bpf_prog_aux {
 	bool offload_requested;
 	bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
 	bool func_proto_unreliable;
+	bool sleepable;
 	enum bpf_tramp_prog_type trampoline_prog_type;
 	struct bpf_trampoline *trampoline;
 	struct hlist_node tramp_hlist;
@@ -1781,6 +1784,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto;
 extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto;
 extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto;
 extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto;
+extern const struct bpf_func_proto bpf_copy_from_user_proto;
 
 const struct bpf_func_proto *bpf_tracing_func_proto(
 	enum bpf_func_id func_id, const struct bpf_prog *prog);
16 changes: 16 additions & 0 deletions include/uapi/linux/bpf.h
@@ -346,6 +346,14 @@ enum bpf_link_type {
 /* The verifier internal test flag. Behavior is undefined */
 #define BPF_F_TEST_STATE_FREQ	(1U << 3)
 
+/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will
+ * restrict map and helper usage for such programs. Sleepable BPF programs can
+ * only be attached to hooks where kernel execution context allows sleeping.
+ * Such programs are allowed to use helpers that may sleep like
+ * bpf_copy_from_user().
+ */
+#define BPF_F_SLEEPABLE		(1U << 4)
+
 /* When BPF ldimm64's insn[0].src_reg != 0 then this can have
  * two extensions:
  *
@@ -3561,6 +3569,13 @@ union bpf_attr {
  *		On success, the strictly positive length of the string,
  *		including the trailing NUL character. On error, a negative
  *		value.
+ *
+ * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr)
+ *	Description
+ *		Read *size* bytes from user space address *user_ptr* and store
+ *		the data in *dst*. This is a wrapper of copy_from_user().
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)	\
 	FN(unspec),	\
@@ -3711,6 +3726,7 @@ union bpf_attr {
 	FN(inode_storage_get),	\
 	FN(inode_storage_delete),	\
 	FN(d_path),	\
+	FN(copy_from_user),	\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
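
On the userspace side, a hedged sketch of loading a program with the new flag
via the raw bpf(2) syscall (the instruction stream and the BTF id lookup are
elided; field names match the uapi header above):

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Sketch only: insns must be a valid instruction stream and, for
 * fentry/lsm programs, attach_btf_id must name the target hook.
 */
static int load_sleepable_prog(const struct bpf_insn *insns, __u32 insn_cnt,
                               __u32 attach_btf_id)
{
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.prog_type = BPF_PROG_TYPE_LSM;
        attr.expected_attach_type = BPF_LSM_MAC;
        attr.attach_btf_id = attach_btf_id;
        attr.insns = (__u64)(unsigned long)insns;
        attr.insn_cnt = insn_cnt;
        attr.license = (__u64)(unsigned long)"GPL";
        attr.prog_flags = BPF_F_SLEEPABLE;      /* opt in to sleepable rules */

        return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}
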
1 change: 1 addition & 0 deletions init/Kconfig
@@ -1691,6 +1691,7 @@ config BPF_SYSCALL
bool "Enable bpf() system call"
select BPF
select IRQ_WORK
select TASKS_TRACE_RCU
default n
help
Enable the bpf() system call that allows to manipulate eBPF
1 change: 1 addition & 0 deletions kernel/bpf/arraymap.c
@@ -10,6 +10,7 @@
 #include <linux/filter.h>
 #include <linux/perf_event.h>
 #include <uapi/linux/btf.h>
+#include <linux/rcupdate_trace.h>
 
 #include "map_in_map.h"
 
12 changes: 6 additions & 6 deletions kernel/bpf/hashtab.c
@@ -9,6 +9,7 @@
 #include <linux/rculist_nulls.h>
 #include <linux/random.h>
 #include <uapi/linux/btf.h>
+#include <linux/rcupdate_trace.h>
 #include "percpu_freelist.h"
 #include "bpf_lru_list.h"
 #include "map_in_map.h"
@@ -577,8 +578,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
 	struct htab_elem *l;
 	u32 hash, key_size;
 
-	/* Must be called with rcu_read_lock. */
-	WARN_ON_ONCE(!rcu_read_lock_held());
+	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
 
 	key_size = map->key_size;
 
@@ -941,7 +941,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 		/* unknown flags */
 		return -EINVAL;
 
-	WARN_ON_ONCE(!rcu_read_lock_held());
+	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
 
 	key_size = map->key_size;
 
@@ -1032,7 +1032,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 		/* unknown flags */
 		return -EINVAL;
 
-	WARN_ON_ONCE(!rcu_read_lock_held());
+	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
 
 	key_size = map->key_size;
 
@@ -1220,7 +1220,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 	u32 hash, key_size;
 	int ret = -ENOENT;
 
-	WARN_ON_ONCE(!rcu_read_lock_held());
+	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
 
 	key_size = map->key_size;
 
@@ -1252,7 +1252,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 	u32 hash, key_size;
 	int ret = -ENOENT;
 
-	WARN_ON_ONCE(!rcu_read_lock_held());
+	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
 
 	key_size = map->key_size;
 
22 changes: 22 additions & 0 deletions kernel/bpf/helpers.c
@@ -601,6 +601,28 @@ const struct bpf_func_proto bpf_event_output_data_proto = {
 	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
 };
 
+BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size,
+	   const void __user *, user_ptr)
+{
+	int ret = copy_from_user(dst, user_ptr, size);
+
+	if (unlikely(ret)) {
+		memset(dst, 0, size);
+		ret = -EFAULT;
+	}
+
+	return ret;
+}
+
+const struct bpf_func_proto bpf_copy_from_user_proto = {
+	.func		= bpf_copy_from_user,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg3_type	= ARG_ANYTHING,
+};
+
 const struct bpf_func_proto bpf_get_current_task_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
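
Exposure of the helper is gated on sleepability; this series wires that up in
bpf_tracing_func_proto() in kernel/trace/bpf_trace.c. A simplified
illustration of the gating, not the verbatim patch:

#include <linux/bpf.h>

static const struct bpf_func_proto *
sleepable_gated_func_proto(enum bpf_func_id func_id,
                           const struct bpf_prog *prog)
{
        switch (func_id) {
        case BPF_FUNC_copy_from_user:
                /* a faulting helper is only offered to sleepable programs */
                return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL;
        default:
                return bpf_base_func_proto(func_id);
        }
}

Note the argument order, (dst, size, user_ptr), differs from the kernel's
copy_from_user(to, from, n), and that on fault the helper zero-fills dst and
returns -EFAULT, so dst is always initialized.
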
13 changes: 10 additions & 3 deletions kernel/bpf/syscall.c
@@ -29,6 +29,7 @@
 #include <linux/bpf_lsm.h>
 #include <linux/poll.h>
 #include <linux/bpf-netns.h>
+#include <linux/rcupdate_trace.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
 			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -1731,10 +1732,14 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
 	btf_put(prog->aux->btf);
 	bpf_prog_free_linfo(prog);
 
-	if (deferred)
-		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
-	else
+	if (deferred) {
+		if (prog->aux->sleepable)
+			call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu);
+		else
+			call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
+	} else {
 		__bpf_prog_put_rcu(&prog->aux->rcu);
+	}
 }
 
 static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
@@ -2104,6 +2109,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
 				 BPF_F_ANY_ALIGNMENT |
 				 BPF_F_TEST_STATE_FREQ |
+				 BPF_F_SLEEPABLE |
 				 BPF_F_TEST_RND_HI32))
 		return -EINVAL;
 
@@ -2159,6 +2165,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	}
 
 	prog->aux->offload_requested = !!attr->prog_ifindex;
+	prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
 
 	err = security_bpf_prog_alloc(prog->aux);
 	if (err)
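
The deferred-free branching above is also why CONFIG_BPF_SYSCALL now selects
TASKS_TRACE_RCU: anything a sleepable program might still be referencing must
wait out an RCU-tasks-trace grace period before it is freed. The same pattern
in isolation (illustrative types and names, not from the patch):

#include <linux/rcupdate.h>
#include <linux/rcupdate_trace.h>
#include <linux/slab.h>

struct my_obj {
        struct rcu_head rcu;
        /* payload ... */
};

static void my_obj_free_rcu(struct rcu_head *rcu)
{
        kfree(container_of(rcu, struct my_obj, rcu));
}

static void my_obj_put(struct my_obj *obj, bool sleepable_users)
{
        if (sleepable_users)
                /* waits for rcu-tasks-trace readers, i.e. sleepable progs */
                call_rcu_tasks_trace(&obj->rcu, my_obj_free_rcu);
        else
                call_rcu(&obj->rcu, my_obj_free_rcu);
}
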
28 changes: 25 additions & 3 deletions kernel/bpf/trampoline.c
@@ -7,6 +7,8 @@
 #include <linux/rbtree_latch.h>
 #include <linux/perf_event.h>
 #include <linux/btf.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/rcupdate_wait.h>
 
 /* dummy _ops. The verifier will operate on target program's ops. */
 const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@@ -210,9 +212,12 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
 	 * updates to trampoline would change the code from underneath the
 	 * preempted task. Hence wait for tasks to voluntarily schedule or go
 	 * to userspace.
+	 * The same trampoline can hold both sleepable and non-sleepable progs.
+	 * synchronize_rcu_tasks_trace() is needed to make sure all sleepable
+	 * programs finish executing.
+	 * Wait for these two grace periods together.
 	 */
-
-	synchronize_rcu_tasks();
+	synchronize_rcu_mult(call_rcu_tasks, call_rcu_tasks_trace);
 
 	err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2,
 					  &tr->func.model, flags, tprogs,
@@ -344,7 +349,14 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
 	if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
 		goto out;
 	bpf_image_ksym_del(&tr->ksym);
-	/* wait for tasks to get out of trampoline before freeing it */
+	/* This code will be executed when all bpf progs (both sleepable and
+	 * non-sleepable) went through
+	 * bpf_prog_put()->call_rcu[_tasks_trace]()->bpf_prog_free_deferred().
+	 * Hence no need for another synchronize_rcu_tasks_trace() here,
+	 * but synchronize_rcu_tasks() is still needed, since trampoline
+	 * may not have had any sleepable programs and we need to wait
+	 * for tasks to get out of trampoline code before freeing it.
+	 */
 	synchronize_rcu_tasks();
 	bpf_jit_free_exec(tr->image);
 	hlist_del(&tr->hlist);
@@ -394,6 +406,16 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
 	rcu_read_unlock();
 }
 
+void notrace __bpf_prog_enter_sleepable(void)
+{
+	rcu_read_lock_trace();
+}
+
+void notrace __bpf_prog_exit_sleepable(void)
+{
+	rcu_read_unlock_trace();
+}
+
 int __weak
 arch_prepare_bpf_trampoline(void *image, void *image_end,
 			    const struct btf_func_model *m, u32 flags,
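
The pair of wrappers above is the entire sleepable entry/exit fast path: unlike
rcu_read_lock(), an rcu_read_lock_trace() reader is allowed to block, which is
what lets a program fault in user pages. A minimal illustrative reader, not
from the patch:

#include <linux/delay.h>
#include <linux/rcupdate_trace.h>

static void sleepable_reader_sketch(void)
{
        rcu_read_lock_trace();
        /* Code here may fault, do GFP_KERNEL allocations, or otherwise
         * sleep; rcu-tasks-trace tolerates blocking readers.
         */
        msleep(1);
        rcu_read_unlock_trace();
}
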