Merge branch 'Add support for kptrs in more BPF maps'
Kumar Kartikeya Dwivedi says:

====================

This set adds support for kptrs in percpu hashmaps, percpu LRU hashmaps,
and local storage maps (covering sk, cgrp, task, inode).
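
For illustration, a minimal BPF-side sketch of what this enables. Names
are made up for the example (not taken from the selftests); vmlinux.h
and bpf/bpf_helpers.h are assumed, and __kptr here is the referenced-kptr
type tag (spelled __kptr_ref in older libbpf headers):

struct map_value {
	struct task_struct __kptr *task;	/* managed via bpf_kptr_xchg() */
};

struct {
	__uint(type, BPF_MAP_TYPE_LRU_PERCPU_HASH);
	__uint(max_entries, 128);
	__type(key, int);
	__type(value, struct map_value);
} pcpu_lru SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct map_value);
} sk_store SEC(".maps");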

Tests are expanded to test more existing maps at runtime and also test
the code path for the local storage maps (which is shared by all
implementations).

A question for reviewers is what the position of the BPF runtime should
be on dealing with reference cycles that can be created by BPF programs
at runtime using this additional support. For instance, a program can
store a task's kptr in that task's own task local storage, creating a
cycle which prevents destruction of the task local storage. Cycles can
be formed using arbitrarily long kptr ownership chains. Just preventing
storage of such kptrs in some maps is therefore not a sufficient
solution, and is more likely to hurt usability than to solve the
problem.
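
A hedged sketch of such a cycle, modeled on the task_kfunc selftest
patterns rather than on code in this series (hook, map and names are
assumptions; bpf_task_acquire/bpf_task_release are the task kfuncs and
bpf_kptr_xchg moves the reference into the map value):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct task_val {
	struct task_struct __kptr *self;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_val);
} task_store SEC(".maps");

extern struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
extern void bpf_task_release(struct task_struct *p) __ksym;

SEC("tp_btf/task_newtask")
int BPF_PROG(store_self, struct task_struct *task, u64 clone_flags)
{
	struct task_struct *acquired, *old;
	struct task_val *v;

	v = bpf_task_storage_get(&task_store, task, NULL,
				 BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!v)
		return 0;
	acquired = bpf_task_acquire(task);
	if (!acquired)
		return 0;
	/* The task's own local storage now holds a reference on the task;
	 * the storage is only freed when the task is, so neither is ever
	 * freed.
	 */
	old = bpf_kptr_xchg(&v->self, acquired);
	if (old)
		bpf_task_release(old);
	return 0;
}

char _license[] SEC("license") = "GPL";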

There is precedent in existing runtimes that promise memory safety,
such as Rust, where reference cycles and memory leaks are permitted.
However, BPF's safety guarantees have traditionally been stronger.
Thus, more discussion and thought are invited on this topic to ensure
we cover all usage aspects.

Changelog:
----------
v2 -> v3
v2: https://lore.kernel.org/bpf/20230221200646.2500777-1-memxor@gmail.com/

 * Fix a use-after-free bug in local storage patch
 * Fix selftest for aarch64 (don't use fentry/fmod_ret)
 * Wait for RCU Tasks Trace GP along with RCU GP in selftest

v1 -> v2
v1: https://lore.kernel.org/bpf/20230219155249.1755998-1-memxor@gmail.com

 * Simplify selftests, fix a couple of bugs
====================

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Alexei Starovoitov committed Mar 1, 2023
2 parents c4b5c5b + 85521e1 commit 6c18e37
Showing 8 changed files with 553 additions and 96 deletions.
6 changes: 6 additions & 0 deletions include/linux/bpf_local_storage.h
@@ -74,6 +74,12 @@ struct bpf_local_storage_elem {
struct hlist_node snode; /* Linked to bpf_local_storage */
struct bpf_local_storage __rcu *local_storage;
struct rcu_head rcu;
bool can_use_smap; /* Is it safe to access smap in bpf_selem_free_* RCU
* callbacks? bpf_local_storage_map_free only
* executes rcu_barrier when there are special
* fields, this field remembers that to ensure we
* don't access already freed smap in sdata.
*/
/* 8 bytes hole */
/* The data is stored in another cacheline to minimize
* the number of cachelines access during a cache hit.
48 changes: 44 additions & 4 deletions kernel/bpf/bpf_local_storage.c
@@ -85,6 +85,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
if (selem) {
if (value)
copy_map_value(&smap->map, SDATA(selem)->data, value);
/* No need to call check_and_init_map_value as memory is zero init */
return selem;
}

@@ -113,10 +114,25 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
struct bpf_local_storage_elem *selem;

selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
/* The can_use_smap bool is set whenever we need to free additional
* fields in selem data before freeing selem. bpf_local_storage_map_free
* only executes rcu_barrier to wait for RCU callbacks when it has
* special fields, hence we can only conditionally dereference smap, as
* by this time the map might have already been freed without waiting
* for our call_rcu callback if it did not have any special fields.
*/
if (selem->can_use_smap)
bpf_obj_free_fields(SDATA(selem)->smap->map.record, SDATA(selem)->data);
kfree(selem);
}

static void bpf_selem_free_tasks_trace_rcu(struct rcu_head *rcu)
{
/* Free directly if Tasks Trace RCU GP also implies RCU GP */
if (rcu_trace_implies_rcu_gp())
kfree(selem);
bpf_selem_free_rcu(rcu);
else
kfree_rcu(selem, rcu);
call_rcu(rcu, bpf_selem_free_rcu);
}

/* local_storage->lock must be held and selem->local_storage == local_storage.
@@ -170,9 +186,9 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);

if (use_trace_rcu)
call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_tasks_trace_rcu);
else
kfree_rcu(selem, rcu);
call_rcu(&selem->rcu, bpf_selem_free_rcu);

return free_local_storage;
}
@@ -240,6 +256,11 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,
RCU_INIT_POINTER(SDATA(selem)->smap, smap);
hlist_add_head_rcu(&selem->map_node, &b->list);
raw_spin_unlock_irqrestore(&b->lock, flags);

/* If our data will have special fields, smap will wait for us to use
* its record in bpf_selem_free_* RCU callbacks before freeing itself.
*/
selem->can_use_smap = !IS_ERR_OR_NULL(smap->map.record);
}

void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu)
@@ -723,6 +744,25 @@ void bpf_local_storage_map_free(struct bpf_map *map,
*/
synchronize_rcu();

/* Only delay freeing of smap, buckets are not needed anymore */
kvfree(smap->buckets);

/* When local storage has special fields, callbacks for
* bpf_selem_free_rcu and bpf_selem_free_tasks_trace_rcu will keep using
* the map BTF record, we need to execute an RCU barrier to wait for
* them as the record will be freed right after our map_free callback.
*/
if (!IS_ERR_OR_NULL(smap->map.record)) {
rcu_barrier_tasks_trace();
/* We cannot skip rcu_barrier() when rcu_trace_implies_rcu_gp()
* is true, because while call_rcu invocation is skipped in that
* case in bpf_selem_free_tasks_trace_rcu (and all local storage
* maps pass use_trace_rcu = true), there can be call_rcu
* callbacks based on use_trace_rcu = false in the earlier while
* ((selem = ...)) loop or from bpf_local_storage_unlink_nolock
* called from owner's free path.
*/
rcu_barrier();
}
bpf_map_area_free(smap);
}
59 changes: 37 additions & 22 deletions kernel/bpf/hashtab.c
@@ -249,7 +249,18 @@ static void htab_free_prealloced_fields(struct bpf_htab *htab)
struct htab_elem *elem;

elem = get_htab_elem(htab, i);
bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
if (htab_is_percpu(htab)) {
void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
int cpu;

for_each_possible_cpu(cpu) {
bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
cond_resched();
}
} else {
bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
cond_resched();
}
cond_resched();
}
}
@@ -759,9 +770,17 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map,
static void check_and_free_fields(struct bpf_htab *htab,
struct htab_elem *elem)
{
void *map_value = elem->key + round_up(htab->map.key_size, 8);
if (htab_is_percpu(htab)) {
void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
int cpu;

bpf_obj_free_fields(htab->map.record, map_value);
for_each_possible_cpu(cpu)
bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
} else {
void *map_value = elem->key + round_up(htab->map.key_size, 8);

bpf_obj_free_fields(htab->map.record, map_value);
}
}

/* It is called from the bpf_lru_list when the LRU needs to delete
@@ -858,9 +877,9 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)

static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
check_and_free_fields(htab, l);
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
check_and_free_fields(htab, l);
bpf_mem_cache_free(&htab->ma, l);
}

@@ -918,14 +937,13 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
{
if (!onallcpus) {
/* copy true value_size bytes */
memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
copy_map_value(&htab->map, this_cpu_ptr(pptr), value);
} else {
u32 size = round_up(htab->map.value_size, 8);
int off = 0, cpu;

for_each_possible_cpu(cpu) {
bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
value + off, size);
copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off);
off += size;
}
}
@@ -940,16 +958,14 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
* (onallcpus=false always when coming from bpf prog).
*/
if (!onallcpus) {
u32 size = round_up(htab->map.value_size, 8);
int current_cpu = raw_smp_processor_id();
int cpu;

for_each_possible_cpu(cpu) {
if (cpu == current_cpu)
bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value,
size);
else
memset(per_cpu_ptr(pptr, cpu), 0, size);
copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value);
else /* Since elem is preallocated, we cannot touch special fields */
zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu));
}
} else {
pcpu_copy_value(htab, pptr, value, onallcpus);
@@ -1575,9 +1591,8 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,

pptr = htab_elem_get_ptr(l, key_size);
for_each_possible_cpu(cpu) {
bpf_long_memcpy(value + off,
per_cpu_ptr(pptr, cpu),
roundup_value_size);
copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
check_and_init_map_value(&htab->map, value + off);
off += roundup_value_size;
}
} else {
@@ -1772,8 +1787,8 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,

pptr = htab_elem_get_ptr(l, map->key_size);
for_each_possible_cpu(cpu) {
bpf_long_memcpy(dst_val + off,
per_cpu_ptr(pptr, cpu), size);
copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu));
check_and_init_map_value(&htab->map, dst_val + off);
off += size;
}
} else {
@@ -2046,9 +2061,9 @@ static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
roundup_value_size = round_up(map->value_size, 8);
pptr = htab_elem_get_ptr(elem, map->key_size);
for_each_possible_cpu(cpu) {
bpf_long_memcpy(info->percpu_value_buf + off,
per_cpu_ptr(pptr, cpu),
roundup_value_size);
copy_map_value_long(map, info->percpu_value_buf + off,
per_cpu_ptr(pptr, cpu));
check_and_init_map_value(map, info->percpu_value_buf + off);
off += roundup_value_size;
}
ctx.value = info->percpu_value_buf;
@@ -2292,8 +2307,8 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
*/
pptr = htab_elem_get_ptr(l, map->key_size);
for_each_possible_cpu(cpu) {
bpf_long_memcpy(value + off,
per_cpu_ptr(pptr, cpu), size);
copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
check_and_init_map_value(map, value + off);
off += size;
}
ret = 0;
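
For reference, a hedged userspace sketch of the consumer side of these
lookup paths: the syscall copies round_up(value_size, 8) bytes per
possible CPU, and with this change any kptr words in the copy are
zeroed by check_and_init_map_value() instead of exposing raw kernel
addresses. Function and parameter names below are illustrative, not
from this series:

#include <errno.h>
#include <stdlib.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

/* Read one key's value from a BPF_MAP_TYPE_PERCPU_HASH map (int keys assumed). */
static int read_percpu_value(int map_fd, int key, size_t value_size)
{
	int ncpus = libbpf_num_possible_cpus();
	size_t stride = (value_size + 7) & ~(size_t)7;	/* 8-byte aligned per-CPU slots */
	void *buf;
	int err;

	if (ncpus < 0)
		return ncpus;
	buf = calloc(ncpus, stride);
	if (!buf)
		return -ENOMEM;
	err = bpf_map_lookup_elem(map_fd, &key, buf);
	if (!err) {
		/* CPU c's copy lives at buf + c * stride; kptr fields
		 * read back as zero.
		 */
	}
	free(buf);
	return err;
}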
8 changes: 7 additions & 1 deletion kernel/bpf/syscall.c
@@ -1059,9 +1059,15 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY &&
map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) {
map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
ret = -EOPNOTSUPP;
goto free_map_tab;
}
12 changes: 8 additions & 4 deletions kernel/bpf/verifier.c
@@ -7222,22 +7222,26 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
break;
case BPF_MAP_TYPE_SK_STORAGE:
if (func_id != BPF_FUNC_sk_storage_get &&
func_id != BPF_FUNC_sk_storage_delete)
func_id != BPF_FUNC_sk_storage_delete &&
func_id != BPF_FUNC_kptr_xchg)
goto error;
break;
case BPF_MAP_TYPE_INODE_STORAGE:
if (func_id != BPF_FUNC_inode_storage_get &&
func_id != BPF_FUNC_inode_storage_delete)
func_id != BPF_FUNC_inode_storage_delete &&
func_id != BPF_FUNC_kptr_xchg)
goto error;
break;
case BPF_MAP_TYPE_TASK_STORAGE:
if (func_id != BPF_FUNC_task_storage_get &&
func_id != BPF_FUNC_task_storage_delete)
func_id != BPF_FUNC_task_storage_delete &&
func_id != BPF_FUNC_kptr_xchg)
goto error;
break;
case BPF_MAP_TYPE_CGRP_STORAGE:
if (func_id != BPF_FUNC_cgrp_storage_get &&
func_id != BPF_FUNC_cgrp_storage_delete)
func_id != BPF_FUNC_cgrp_storage_delete &&
func_id != BPF_FUNC_kptr_xchg)
goto error;
break;
case BPF_MAP_TYPE_BLOOM_FILTER:
