From 997849c4b969034e225153f41026657def66d286 Mon Sep 17 00:00:00 2001
From: Hou Tao
Date: Wed, 15 Feb 2023 16:21:31 +0800
Subject: [PATCH 1/2] bpf: Zeroing allocated object from slab in bpf memory
 allocator

Currently the freed element in the bpf memory allocator may be reused
immediately. For htab map, the reuse will reinitialize special fields
(e.g., bpf_spin_lock) in the map value, but the lookup procedure may
still be accessing these special fields, and that may lead to a
hard-lockup as shown below:

  NMI backtrace for cpu 16
  CPU: 16 PID: 2574 Comm: htab.bin Tainted: G L 6.1.0+ #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
  RIP: 0010:queued_spin_lock_slowpath+0x283/0x2c0
  ......
  Call Trace:
   copy_map_value_locked+0xb7/0x170
   bpf_map_copy_value+0x113/0x3c0
   __sys_bpf+0x1c67/0x2780
   __x64_sys_bpf+0x1c/0x20
   do_syscall_64+0x30/0x60
   entry_SYSCALL_64_after_hwframe+0x46/0xb0
  ......

For htab map, just like the preallocated case, there is no need to
initialize these special fields in the map value again once they have
been initialized. For preallocated htab map, these fields are
initialized through __GFP_ZERO in bpf_map_area_alloc(), so do the same
for non-preallocated htab in the bpf memory allocator. There is no need
to use __GFP_ZERO for the per-cpu bpf memory allocator, because
__alloc_percpu_gfp() already zeroes the allocated memory implicitly.

Fixes: 0fd7c5d43339 ("bpf: Optimize call_rcu in non-preallocated hash map.")
Signed-off-by: Hou Tao
Link: https://lore.kernel.org/r/20230215082132.3856544-2-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h   | 7 +++++++
 kernel/bpf/hashtab.c  | 4 ++--
 kernel/bpf/memalloc.c | 2 +-
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index be34f7deb6c39..520b238abd5a2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -363,6 +363,13 @@ static inline void bpf_obj_init(const struct btf_field_offs *foffs, void *obj)
 		memset(obj + foffs->field_off[i], 0, foffs->field_sz[i]);
 }
 
+/* 'dst' must be a temporary buffer and should not point to memory that is being
+ * used in parallel by a bpf program or bpf syscall, otherwise the access from
+ * the bpf program or bpf syscall may be corrupted by the reinitialization,
+ * leading to weird problems. Even if 'dst' is newly-allocated from the bpf
+ * memory allocator, it is still possible for 'dst' to be used in parallel by
+ * a bpf program or bpf syscall.
+ */
 static inline void check_and_init_map_value(struct bpf_map *map, void *dst)
 {
 	bpf_obj_init(map->field_offs, dst);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 66bded1443773..5dfcb5ad0d068 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1004,8 +1004,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 			l_new = ERR_PTR(-ENOMEM);
 			goto dec_count;
 		}
-		check_and_init_map_value(&htab->map,
-					 l_new->key + round_up(key_size, 8));
 	}
 
 	memcpy(l_new->key, key, key_size);
@@ -1592,6 +1590,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
 		else
 			copy_map_value(map, value, l->key + roundup_key_size);
+		/* Zeroing special fields in the temp buffer */
 		check_and_init_map_value(map, value);
 	}
 
@@ -1792,6 +1791,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
 						      true);
 			else
 				copy_map_value(map, dst_val, value);
+			/* Zeroing special fields in the temp buffer */
 			check_and_init_map_value(map, dst_val);
 		}
 		if (do_delete) {
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 490d03a4581aa..5fcdacbb84394 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -143,7 +143,7 @@ static void *__alloc(struct bpf_mem_cache *c, int node)
 		return obj;
 	}
 
-	return kmalloc_node(c->unit_size, flags, node);
+	return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node);
 }
 
 static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)

From f88da2d46cc9a19b0c233285339659cae36c5d9a Mon Sep 17 00:00:00 2001
From: Hou Tao
Date: Wed, 15 Feb 2023 16:21:32 +0800
Subject: [PATCH 2/2] selftests/bpf: Add test case for element reuse in htab
 map

The reinitialization of the spin-lock in the map value after immediate
reuse may corrupt a lookup done with the BPF_F_LOCK flag and result in
a hard lock-up, so add a test case to demonstrate the problem.

Signed-off-by: Hou Tao
Link: https://lore.kernel.org/r/20230215082132.3856544-3-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov
---
 .../selftests/bpf/prog_tests/htab_reuse.c     | 101 ++++++++++++++++++
 .../testing/selftests/bpf/progs/htab_reuse.c  |  19 ++++
 2 files changed, 120 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/htab_reuse.c
 create mode 100644 tools/testing/selftests/bpf/progs/htab_reuse.c

diff --git a/tools/testing/selftests/bpf/prog_tests/htab_reuse.c b/tools/testing/selftests/bpf/prog_tests/htab_reuse.c
new file mode 100644
index 0000000000000..a742dd994d600
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/htab_reuse.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2023. Huawei Technologies Co., Ltd */
+#define _GNU_SOURCE
+#include <stdbool.h>
+#include <pthread.h>
+#include <test_progs.h>
+#include "htab_reuse.skel.h"
+
+struct htab_op_ctx {
+	int fd;
+	int loop;
+	bool stop;
+};
+
+struct htab_val {
+	unsigned int lock;
+	unsigned int data;
+};
+
+static void *htab_lookup_fn(void *arg)
+{
+	struct htab_op_ctx *ctx = arg;
+	int i = 0;
+
+	while (i++ < ctx->loop && !ctx->stop) {
+		struct htab_val value;
+		unsigned int key;
+
+		/* Use BPF_F_LOCK to use the spin-lock in the map value. */
+		key = 7;
+		bpf_map_lookup_elem_flags(ctx->fd, &key, &value, BPF_F_LOCK);
+	}
+
+	return NULL;
+}
+
+static void *htab_update_fn(void *arg)
+{
+	struct htab_op_ctx *ctx = arg;
+	int i = 0;
+
+	while (i++ < ctx->loop && !ctx->stop) {
+		struct htab_val value;
+		unsigned int key;
+
+		key = 7;
+		value.lock = 0;
+		value.data = key;
+		bpf_map_update_elem(ctx->fd, &key, &value, BPF_F_LOCK);
+		bpf_map_delete_elem(ctx->fd, &key);
+
+		key = 24;
+		value.lock = 0;
+		value.data = key;
+		bpf_map_update_elem(ctx->fd, &key, &value, BPF_F_LOCK);
+		bpf_map_delete_elem(ctx->fd, &key);
+	}
+
+	return NULL;
+}
+
+void test_htab_reuse(void)
+{
+	unsigned int i, wr_nr = 1, rd_nr = 4;
+	pthread_t tids[wr_nr + rd_nr];
+	struct htab_reuse *skel;
+	struct htab_op_ctx ctx;
+	int err;
+
+	skel = htab_reuse__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "htab_reuse__open_and_load"))
+		return;
+
+	ctx.fd = bpf_map__fd(skel->maps.htab);
+	ctx.loop = 500;
+	ctx.stop = false;
+
+	memset(tids, 0, sizeof(tids));
+	for (i = 0; i < wr_nr; i++) {
+		err = pthread_create(&tids[i], NULL, htab_update_fn, &ctx);
+		if (!ASSERT_OK(err, "pthread_create")) {
+			ctx.stop = true;
+			goto reap;
+		}
+	}
+	for (i = 0; i < rd_nr; i++) {
+		err = pthread_create(&tids[i + wr_nr], NULL, htab_lookup_fn, &ctx);
+		if (!ASSERT_OK(err, "pthread_create")) {
+			ctx.stop = true;
+			goto reap;
+		}
+	}
+
+reap:
+	for (i = 0; i < wr_nr + rd_nr; i++) {
+		if (!tids[i])
+			continue;
+		pthread_join(tids[i], NULL);
+	}
+	htab_reuse__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/htab_reuse.c b/tools/testing/selftests/bpf/progs/htab_reuse.c
new file mode 100644
index 0000000000000..7f7368cb30953
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/htab_reuse.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2023. Huawei Technologies Co., Ltd */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct htab_val {
+	struct bpf_spin_lock lock;
+	unsigned int data;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, 64);
+	__type(key, unsigned int);
+	__type(value, struct htab_val);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+} htab SEC(".maps");
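
Note (not part of the series): the selftest above only drives the lock from the
syscall side, via bpf_map_lookup_elem_flags() and bpf_map_update_elem() with
BPF_F_LOCK. For readers who want to see the program-side counterpart, below is
a minimal, hypothetical BPF program sketch that takes the same bpf_spin_lock in
the map value with the bpf_spin_lock()/bpf_spin_unlock() helpers. The map layout
mirrors progs/htab_reuse.c; the section name ("tc"), the program name bump_data,
and the data increment are illustrative assumptions, not something the patches add.

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical companion sketch, not part of the patch series. It shows the
 * BPF-program-side counterpart of BPF_F_LOCK: taking the bpf_spin_lock field
 * that syscall-side locked lookups spin on, and that check_and_init_map_value()
 * must not re-zero when an element is reused.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

struct htab_val {
	struct bpf_spin_lock lock;
	unsigned int data;
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 64);
	__type(key, unsigned int);
	__type(value, struct htab_val);
	__uint(map_flags, BPF_F_NO_PREALLOC);
} htab SEC(".maps");

/* bpf_spin_lock is not allowed in tracing programs, so use a tc program. */
SEC("tc")
int bump_data(struct __sk_buff *skb)
{
	unsigned int key = 7;
	struct htab_val *val;

	val = bpf_map_lookup_elem(&htab, &key);
	if (!val)
		return 0;

	/* Serializes with syscall-side BPF_F_LOCK lookups and updates. */
	bpf_spin_lock(&val->lock);
	val->data++;
	bpf_spin_unlock(&val->lock);
	return 0;
}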