Skip to content

Commit

Permalink
bpf: Add poke dependency tracking for prog array maps
Browse files Browse the repository at this point in the history
This work adds program tracking to prog array maps. This is needed such
that upon prog array updates/deletions we can fix up all programs which
make use of this tail call map. We add ops->map_poke_{un,}track()
helpers to maps to maintain the list of programs and ops->map_poke_run()
for triggering the actual update.

bpf_array_aux is extended to contain the list head and poke_mutex in
order to serialize program patching during updates/deletions.
bpf_free_used_maps() will untrack the program shortly before dropping
the reference to the map. For clearing out the prog array once all urefs
are dropped we need to use schedule_work() to have a sleepable context.

prog_array_map_poke_run() is triggered during updates/deletions and
walks the maintained prog list. It checks in their poke_tabs whether the
map and key match and runs the actual bpf_arch_text_poke() for
patching in the nop or new jmp location. Depending on the type of update,
we use one of BPF_MOD_{NOP_TO_JUMP,JUMP_TO_NOP,JUMP_TO_JUMP}.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/1fb364bb3c565b3e415d5ea348f036ff379e779d.1574452833.git.daniel@iogearbox.net
  • Loading branch information
Daniel Borkmann authored and Alexei Starovoitov committed Nov 25, 2019
1 parent a66886f commit da765a2
Show file tree
Hide file tree
Showing 4 changed files with 212 additions and 12 deletions.
12 changes: 12 additions & 0 deletions include/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ struct bpf_verifier_env;
struct bpf_verifier_log;
struct perf_event;
struct bpf_prog;
struct bpf_prog_aux;
struct bpf_map;
struct sock;
struct seq_file;
Expand Down Expand Up @@ -64,6 +65,12 @@ struct bpf_map_ops {
const struct btf_type *key_type,
const struct btf_type *value_type);

/* Prog poke tracking helpers. */
int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux);
void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux);
void (*map_poke_run)(struct bpf_map *map, u32 key, struct bpf_prog *old,
struct bpf_prog *new);

/* Direct value access helpers. */
int (*map_direct_value_addr)(const struct bpf_map *map,
u64 *imm, u32 off);
Expand Down Expand Up @@ -588,6 +595,11 @@ struct bpf_array_aux {
*/
enum bpf_prog_type type;
bool jited;
/* Programs with direct jumps into programs part of this array. */
struct list_head poke_progs;
struct bpf_map *map;
struct mutex poke_mutex;
struct work_struct work;
};

struct bpf_array {
Expand Down
183 changes: 179 additions & 4 deletions kernel/bpf/arraymap.c
Original file line number Diff line number Diff line change
Expand Up @@ -586,10 +586,17 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
if (IS_ERR(new_ptr))
return PTR_ERR(new_ptr);

old_ptr = xchg(array->ptrs + index, new_ptr);
if (map->ops->map_poke_run) {
mutex_lock(&array->aux->poke_mutex);
old_ptr = xchg(array->ptrs + index, new_ptr);
map->ops->map_poke_run(map, index, old_ptr, new_ptr);
mutex_unlock(&array->aux->poke_mutex);
} else {
old_ptr = xchg(array->ptrs + index, new_ptr);
}

if (old_ptr)
map->ops->map_fd_put_ptr(old_ptr);

return 0;
}

Expand All @@ -602,7 +609,15 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
if (index >= array->map.max_entries)
return -E2BIG;

old_ptr = xchg(array->ptrs + index, NULL);
if (map->ops->map_poke_run) {
mutex_lock(&array->aux->poke_mutex);
old_ptr = xchg(array->ptrs + index, NULL);
map->ops->map_poke_run(map, index, old_ptr, NULL);
mutex_unlock(&array->aux->poke_mutex);
} else {
old_ptr = xchg(array->ptrs + index, NULL);
}

if (old_ptr) {
map->ops->map_fd_put_ptr(old_ptr);
return 0;
Expand Down Expand Up @@ -671,6 +686,152 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}

/* One node on a prog array's poke_progs list, recording a tracked program. */
struct prog_poke_elem {
/* Linkage into the poke_progs list of the owning bpf_array_aux. */
struct list_head list;
/* Aux info of the tracked program; compared by pointer to spot duplicates. */
struct bpf_prog_aux *aux;
};

/*
 * Start tracking @prog_aux on @map's poke_progs list so that later slot
 * updates/deletions can patch the program. Tracking the same aux twice is
 * a no-op.
 *
 * Returns 0 on success (including the already-tracked case) or -ENOMEM if
 * the list node cannot be allocated.
 */
static int prog_array_map_poke_track(struct bpf_map *map,
				     struct bpf_prog_aux *prog_aux)
{
	struct bpf_array_aux *array_aux;
	struct prog_poke_elem *entry;
	int err = 0;

	array_aux = container_of(map, struct bpf_array, map)->aux;

	/* poke_mutex serializes all accesses to the poke_progs list. */
	mutex_lock(&array_aux->poke_mutex);

	/* Already on the list: nothing more to do. */
	list_for_each_entry(entry, &array_aux->poke_progs, list) {
		if (entry->aux == prog_aux)
			goto unlock;
	}

	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry) {
		err = -ENOMEM;
		goto unlock;
	}

	/* Record the program's aux info rather than the program pointer:
	 * the latter may not be stable yet at this point in time, see also
	 * the comment in prog_array_map_poke_run().
	 */
	entry->aux = prog_aux;
	INIT_LIST_HEAD(&entry->list);
	list_add_tail(&entry->list, &array_aux->poke_progs);

unlock:
	mutex_unlock(&array_aux->poke_mutex);
	return err;
}

/*
 * Stop tracking @prog_aux on @map: remove and free the first list node that
 * refers to it. Does nothing if the program was never tracked.
 */
static void prog_array_map_poke_untrack(struct bpf_map *map,
					struct bpf_prog_aux *prog_aux)
{
	struct bpf_array_aux *array_aux;
	struct prog_poke_elem *entry, *next;

	array_aux = container_of(map, struct bpf_array, map)->aux;

	mutex_lock(&array_aux->poke_mutex);
	list_for_each_entry_safe(entry, next, &array_aux->poke_progs, list) {
		if (entry->aux != prog_aux)
			continue;

		/* Found it; stop after dropping this one node. */
		list_del_init(&entry->list);
		kfree(entry);
		break;
	}
	mutex_unlock(&array_aux->poke_mutex);
}

/*
 * Re-patch the direct tail call sites of every tracked program after slot
 * @key of @map changed from @old to @new (either may be NULL for an empty
 * slot). Expected to run with the array's poke_mutex held; a one-time
 * warning is emitted if the mutex is not locked.
 */
static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
				    struct bpf_prog *old,
				    struct bpf_prog *new)
{
	enum bpf_text_poke_type poke_type;
	struct bpf_array_aux *array_aux;
	struct prog_poke_elem *entry;

	/* An empty slot that stays empty needs no patching at all. */
	if (!old && !new)
		return;

	if (!old)
		poke_type = BPF_MOD_NOP_TO_JUMP;
	else if (!new)
		poke_type = BPF_MOD_JUMP_TO_NOP;
	else
		poke_type = BPF_MOD_JUMP_TO_JUMP;

	array_aux = container_of(map, struct bpf_array, map)->aux;
	WARN_ON_ONCE(!mutex_is_locked(&array_aux->poke_mutex));

	list_for_each_entry(entry, &array_aux->poke_progs, list) {
		struct bpf_jit_poke_descriptor *desc;
		int idx, err;

		for (idx = 0; idx < entry->aux->size_poke_tab; idx++) {
			desc = &entry->aux->poke_tab[idx];

			/* Things to keep in mind here:
			 *
			 * 1) Only aux may be accessed in this context, never
			 *    aux->prog, since the latter might not be stable
			 *    yet and there would otherwise be danger of a use
			 *    after free.
			 * 2) When tracking of aux starts, the program is not
			 *    JITed yet and has no kallsyms entry either. Such
			 *    entries are skipped as desc->ip_stable is not
			 *    active yet; the JIT does the final fixup before
			 *    setting it stable. The individual ip_stable flags
			 *    are activated successively, so tail call updates
			 *    can arrive here while the JIT is still finishing
			 *    its final fixup for non-activated poke entries.
			 * 3) On program teardown, the kallsyms entry is removed
			 *    out of an RCU callback, but untracking is only
			 *    possible from sleepable context. Therefore
			 *    bpf_arch_text_poke() might not see that the
			 *    address is in the BPF text section and bail out
			 *    with -EINVAL. These programs are unreachable as
			 *    the RCU grace period already passed, so they are
			 *    simply skipped.
			 * 4) Programs reaching a refcount of zero while
			 *    patching is in progress are okay too: we are
			 *    protected by poke_mutex and programs are untracked
			 *    before their JIT buffer is freed. If the kallsyms
			 *    entry gets evicted in the middle of patching, the
			 *    rest is skipped, which is fine due to point 3).
			 * 5) Any other error from bpf_arch_text_poke() below
			 *    is an unexpected bug.
			 */
			if (!READ_ONCE(desc->ip_stable))
				continue;
			if (desc->reason != BPF_POKE_REASON_TAIL_CALL)
				continue;
			if (desc->tail_call.map != map ||
			    desc->tail_call.key != key)
				continue;

			err = bpf_arch_text_poke(desc->ip, poke_type,
						 old ? (u8 *)old->bpf_func +
						 desc->adj_off : NULL,
						 new ? (u8 *)new->bpf_func +
						 desc->adj_off : NULL);
			BUG_ON(err < 0 && err != -EINVAL);
		}
	}
}

static void prog_array_map_clear_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_array_aux,
work)->map;
bpf_fd_array_map_clear(map);
bpf_map_put(map);
}

static void prog_array_map_clear(struct bpf_map *map)
{
struct bpf_array_aux *aux = container_of(map, struct bpf_array,
map)->aux;
bpf_map_inc(map);
schedule_work(&aux->work);
}

static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
{
struct bpf_array_aux *aux;
Expand All @@ -680,21 +841,32 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
if (!aux)
return ERR_PTR(-ENOMEM);

INIT_WORK(&aux->work, prog_array_map_clear_deferred);
INIT_LIST_HEAD(&aux->poke_progs);
mutex_init(&aux->poke_mutex);

map = array_map_alloc(attr);
if (IS_ERR(map)) {
kfree(aux);
return map;
}

container_of(map, struct bpf_array, map)->aux = aux;
aux->map = map;

return map;
}

static void prog_array_map_free(struct bpf_map *map)
{
struct prog_poke_elem *elem, *tmp;
struct bpf_array_aux *aux;

aux = container_of(map, struct bpf_array, map)->aux;
list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
list_del_init(&elem->list);
kfree(elem);
}
kfree(aux);
fd_array_map_free(map);
}
Expand All @@ -703,13 +875,16 @@ const struct bpf_map_ops prog_array_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = prog_array_map_alloc,
.map_free = prog_array_map_free,
.map_poke_track = prog_array_map_poke_track,
.map_poke_untrack = prog_array_map_poke_untrack,
.map_poke_run = prog_array_map_poke_run,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = prog_fd_array_get_ptr,
.map_fd_put_ptr = prog_fd_array_put_ptr,
.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
.map_release_uref = bpf_fd_array_map_clear,
.map_release_uref = prog_array_map_clear,
.map_seq_show_elem = prog_array_map_seq_show_elem,
};

Expand Down
9 changes: 7 additions & 2 deletions kernel/bpf/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -2050,11 +2050,16 @@ static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux)

/* Release every map referenced by aux->used_maps, then free the array itself.
 *
 * NOTE(review): this hunk is rendered without +/- markers, so pre- and
 * post-patch lines appear side by side. Going by the "7 additions &
 * 2 deletions" summary, the braceless two-line loop looks like the removed
 * version and the braced loop (untrack, then put) like its replacement --
 * confirm against the actual tree before treating both loops as live code.
 */
static void bpf_free_used_maps(struct bpf_prog_aux *aux)
{
struct bpf_map *map;
int i;

bpf_free_cgroup_storage(aux);
for (i = 0; i < aux->used_map_cnt; i++)
bpf_map_put(aux->used_maps[i]);
for (i = 0; i < aux->used_map_cnt; i++) {
map = aux->used_maps[i];
/* Only map types implementing poke tracking need to forget this program;
 * this happens before the map reference is dropped.
 */
if (map->ops->map_poke_untrack)
map->ops->map_poke_untrack(map, aux);
bpf_map_put(map);
}
kfree(aux->used_maps);
}

Expand Down
20 changes: 14 additions & 6 deletions kernel/bpf/syscall.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,13 @@
#include <linux/nospec.h>
#include <uapi/linux/btf.h>

#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY)
#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \
IS_FD_HASH(map))

#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY)

Expand Down Expand Up @@ -877,7 +878,7 @@ static int map_lookup_elem(union bpf_attr *attr)
err = bpf_percpu_cgroup_storage_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
err = bpf_stackmap_copy(map, key, value);
} else if (IS_FD_ARRAY(map)) {
} else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
err = bpf_fd_array_map_lookup_elem(map, key, value);
} else if (IS_FD_HASH(map)) {
err = bpf_fd_htab_map_lookup_elem(map, key, value);
Expand Down Expand Up @@ -1004,6 +1005,10 @@ static int map_update_elem(union bpf_attr *attr)
map->map_type == BPF_MAP_TYPE_SOCKMAP) {
err = map->ops->map_update_elem(map, key, value, attr->flags);
goto out;
} else if (IS_FD_PROG_ARRAY(map)) {
err = bpf_fd_array_map_update_elem(map, f.file, key, value,
attr->flags);
goto out;
}

/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
Expand Down Expand Up @@ -1086,6 +1091,9 @@ static int map_delete_elem(union bpf_attr *attr)
if (bpf_map_is_dev_bound(map)) {
err = bpf_map_offload_delete_elem(map, key);
goto out;
} else if (IS_FD_PROG_ARRAY(map)) {
err = map->ops->map_delete_elem(map, key);
goto out;
}

preempt_disable();
Expand Down

0 comments on commit da765a2

Please sign in to comment.