From 11f522256e9043b0fcd2f994278645d3e201d20c Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Thu, 8 Feb 2024 15:31:15 +0530 Subject: [PATCH 01/11] bpf: Fix warning for bpf_cpumask in verifier Compiling with CONFIG_BPF_SYSCALL & !CONFIG_BPF_JIT throws the below warning: "WARN: resolve_btfids: unresolved symbol bpf_cpumask" Fix it by adding the appropriate #ifdef. Signed-off-by: Hari Bathini Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Acked-by: Stanislav Fomichev Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20240208100115.602172-1-hbathini@linux.ibm.com --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 65f598694d550..b263f093ee761 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5227,7 +5227,9 @@ BTF_ID(struct, prog_test_ref_kfunc) #ifdef CONFIG_CGROUPS BTF_ID(struct, cgroup) #endif +#ifdef CONFIG_BPF_JIT BTF_ID(struct, bpf_cpumask) +#endif BTF_ID(struct, task_struct) BTF_SET_END(rcu_protected_types) From 2127c604383666675789fd4a5fc2aead46c73aad Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 2 Feb 2024 17:32:20 +0100 Subject: [PATCH 02/11] xsk: Add truesize to skb_add_rx_frag(). xsk_build_skb() allocates a page and adds it to the skb via skb_add_rx_frag() and specifies 0 for truesize. This leads to a warning in skb_add_rx_frag() with CONFIG_DEBUG_NET enabled because size is larger than truesize. Increasing truesize requires to add the same amount to socket's sk_wmem_alloc counter in order not to underflow the counter during release in the destructor (sock_wfree()). Pass the size of the allocated page as truesize to skb_add_rx_frag(). Add this mount to socket's sk_wmem_alloc counter. Fixes: cf24f5a5feea ("xsk: add support for AF_XDP multi-buffer on Tx path") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Daniel Borkmann Acked-by: Maciej Fijalkowski Link: https://lore.kernel.org/bpf/20240202163221.2488589-1-bigeasy@linutronix.de --- net/xdp/xsk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 1eadfac03cc41..b78c0e095e221 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -722,7 +722,8 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, memcpy(vaddr, buffer, len); kunmap_local(vaddr); - skb_add_rx_frag(skb, nr_frags, page, 0, len, 0); + skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE); + refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc); } if (first_frag && desc->options & XDP_TX_METADATA) { From e37243b65d528a8a9f8b9a57a43885f8e8dfc15c Mon Sep 17 00:00:00 2001 From: Gianmarco Lusvardi Date: Tue, 13 Feb 2024 23:05:46 +0000 Subject: [PATCH 03/11] bpf, scripts: Correct GPL license name The bpf_doc script refers to the GPL as the "GNU Privacy License". I strongly suspect that the author wanted to refer to the GNU General Public License, under which the Linux kernel is released, as, to the best of my knowledge, there is no license named "GNU Privacy License". This patch corrects the license name in the script accordingly. Fixes: 56a092c89505 ("bpf: add script and prepare bpf.h for new helpers documentation") Signed-off-by: Gianmarco Lusvardi Signed-off-by: Daniel Borkmann Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240213230544.930018-3-glusvardi@posteo.net --- scripts/bpf_doc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index 61b7dddedc461..0669bac5e900e 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -513,7 +513,7 @@ def print_footer(self): instructions to the kernel when the programs are loaded. The format for that string is identical to the one in use for kernel modules (Dual licenses, such as "Dual BSD/GPL", may be used). Some helper functions are only accessible to -programs that are compatible with the GNU Privacy License (GPL). +programs that are compatible with the GNU General Public License (GNU GPL). In order to use such helpers, the eBPF program must be loaded with the correct license string passed (via **attr**) to the **bpf**\\ () system call, and this From ee0e39a63b78849f8abbef268b13e4838569f646 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 2 Feb 2024 18:39:33 +0800 Subject: [PATCH 04/11] x86/mm: Move is_vsyscall_vaddr() into asm/vsyscall.h Move is_vsyscall_vaddr() into asm/vsyscall.h to make it available for copy_from_kernel_nofault_allowed() in arch/x86/mm/maccess.c. Reviewed-by: Sohil Mehta Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20240202103935.3154011-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- arch/x86/include/asm/vsyscall.h | 10 ++++++++++ arch/x86/mm/fault.c | 9 --------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index ab60a71a8dcb9..472f0263dbc61 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -4,6 +4,7 @@ #include #include +#include #ifdef CONFIG_X86_VSYSCALL_EMULATION extern void map_vsyscall(void); @@ -24,4 +25,13 @@ static inline bool emulate_vsyscall(unsigned long error_code, } #endif +/* + * The (legacy) vsyscall page is the long page in the kernel portion + * of the address space that has user-accessible permissions. + */ +static inline bool is_vsyscall_vaddr(unsigned long vaddr) +{ + return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR); +} + #endif /* _ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 679b09cfe241c..d6375b3c633bc 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -798,15 +798,6 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, show_opcodes(regs, loglvl); } -/* - * The (legacy) vsyscall page is the long page in the kernel portion - * of the address space that has user-accessible permissions. - */ -static bool is_vsyscall_vaddr(unsigned long vaddr) -{ - return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR); -} - static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, unsigned long address, u32 pkey, int si_code) From 32019c659ecfe1d92e3bf9fcdfbb11a7c70acd58 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 2 Feb 2024 18:39:34 +0800 Subject: [PATCH 05/11] x86/mm: Disallow vsyscall page read for copy_from_kernel_nofault() When trying to use copy_from_kernel_nofault() to read vsyscall page through a bpf program, the following oops was reported: BUG: unable to handle page fault for address: ffffffffff600000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 3231067 P4D 3231067 PUD 3233067 PMD 3235067 PTE 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 1 PID: 20390 Comm: test_progs ...... 6.7.0+ #58 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) ...... RIP: 0010:copy_from_kernel_nofault+0x6f/0x110 ...... Call Trace: ? copy_from_kernel_nofault+0x6f/0x110 bpf_probe_read_kernel+0x1d/0x50 bpf_prog_2061065e56845f08_do_probe_read+0x51/0x8d trace_call_bpf+0xc5/0x1c0 perf_call_bpf_enter.isra.0+0x69/0xb0 perf_syscall_enter+0x13e/0x200 syscall_trace_enter+0x188/0x1c0 do_syscall_64+0xb5/0xe0 entry_SYSCALL_64_after_hwframe+0x6e/0x76 ...... ---[ end trace 0000000000000000 ]--- The oops is triggered when: 1) A bpf program uses bpf_probe_read_kernel() to read from the vsyscall page and invokes copy_from_kernel_nofault() which in turn calls __get_user_asm(). 2) Because the vsyscall page address is not readable from kernel space, a page fault exception is triggered accordingly. 3) handle_page_fault() considers the vsyscall page address as a user space address instead of a kernel space address. This results in the fix-up setup by bpf not being applied and a page_fault_oops() is invoked due to SMAP. Considering handle_page_fault() has already considered the vsyscall page address as a userspace address, fix the problem by disallowing vsyscall page read for copy_from_kernel_nofault(). Originally-by: Thomas Gleixner Reported-by: syzbot+72aa0161922eba61b50e@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/CAG48ez06TZft=ATH1qh2c5mpS5BT8UakwNkzi6nvK5_djC-4Nw@mail.gmail.com Reported-by: xingwei lee Closes: https://lore.kernel.org/bpf/CABOYnLynjBoFZOf3Z4BhaZkc5hx_kHfsjiW+UWLoB=w33LvScw@mail.gmail.com Signed-off-by: Hou Tao Reviewed-by: Sohil Mehta Acked-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240202103935.3154011-3-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- arch/x86/mm/maccess.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/mm/maccess.c b/arch/x86/mm/maccess.c index 6993f026adec9..42115ac079cfe 100644 --- a/arch/x86/mm/maccess.c +++ b/arch/x86/mm/maccess.c @@ -3,6 +3,8 @@ #include #include +#include + #ifdef CONFIG_X86_64 bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) { @@ -15,6 +17,14 @@ bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) if (vaddr < TASK_SIZE_MAX + PAGE_SIZE) return false; + /* + * Reading from the vsyscall page may cause an unhandled fault in + * certain cases. Though it is at an address above TASK_SIZE_MAX, it is + * usually considered as a user space address. + */ + if (is_vsyscall_vaddr(vaddr)) + return false; + /* * Allow everything during early boot before 'x86_virt_bits' * is initialized. Needed for instruction decoding in early From be66d79189ec8a1006ec6ec302bb27b160b3e6ce Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 2 Feb 2024 18:39:35 +0800 Subject: [PATCH 06/11] selftest/bpf: Test the read of vsyscall page under x86-64 Under x86-64, when using bpf_probe_read_kernel{_str}() or bpf_probe_read{_str}() to read vsyscall page, the read may trigger oops, so add one test case to ensure that the problem is fixed. Beside those four bpf helpers mentioned above, testing the read of vsyscall page by using bpf_probe_read_user{_str} and bpf_copy_from_user{_task}() as well. The test case passes the address of vsyscall page to these six helpers and checks whether the returned values are expected: 1) For bpf_probe_read_kernel{_str}()/bpf_probe_read{_str}(), the expected return value is -ERANGE as shown below: bpf_probe_read_kernel_common copy_from_kernel_nofault // false, return -ERANGE copy_from_kernel_nofault_allowed 2) For bpf_probe_read_user{_str}(), the expected return value is -EFAULT as show below: bpf_probe_read_user_common copy_from_user_nofault // false, return -EFAULT __access_ok 3) For bpf_copy_from_user(), the expected return value is -EFAULT: // return -EFAULT bpf_copy_from_user copy_from_user _copy_from_user // return false access_ok 4) For bpf_copy_from_user_task(), the expected return value is -EFAULT: // return -EFAULT bpf_copy_from_user_task access_process_vm // return 0 vma_lookup() // return 0 expand_stack() The occurrence of oops depends on the availability of CPU SMAP [1] feature and there are three possible configurations of vsyscall page in the boot cmd-line: vsyscall={xonly|none|emulate}, so there are a total of six possible combinations. Under all these combinations, the test case runs successfully. [1]: https://en.wikipedia.org/wiki/Supervisor_Mode_Access_Prevention Acked-by: Yonghong Song Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20240202103935.3154011-4-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/read_vsyscall.c | 57 +++++++++++++++++++ .../selftests/bpf/progs/read_vsyscall.c | 45 +++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/read_vsyscall.c create mode 100644 tools/testing/selftests/bpf/progs/read_vsyscall.c diff --git a/tools/testing/selftests/bpf/prog_tests/read_vsyscall.c b/tools/testing/selftests/bpf/prog_tests/read_vsyscall.c new file mode 100644 index 0000000000000..3405923fe4e65 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/read_vsyscall.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2024. Huawei Technologies Co., Ltd */ +#include "test_progs.h" +#include "read_vsyscall.skel.h" + +#if defined(__x86_64__) +/* For VSYSCALL_ADDR */ +#include +#else +/* To prevent build failure on non-x86 arch */ +#define VSYSCALL_ADDR 0UL +#endif + +struct read_ret_desc { + const char *name; + int ret; +} all_read[] = { + { .name = "probe_read_kernel", .ret = -ERANGE }, + { .name = "probe_read_kernel_str", .ret = -ERANGE }, + { .name = "probe_read", .ret = -ERANGE }, + { .name = "probe_read_str", .ret = -ERANGE }, + { .name = "probe_read_user", .ret = -EFAULT }, + { .name = "probe_read_user_str", .ret = -EFAULT }, + { .name = "copy_from_user", .ret = -EFAULT }, + { .name = "copy_from_user_task", .ret = -EFAULT }, +}; + +void test_read_vsyscall(void) +{ + struct read_vsyscall *skel; + unsigned int i; + int err; + +#if !defined(__x86_64__) + test__skip(); + return; +#endif + skel = read_vsyscall__open_and_load(); + if (!ASSERT_OK_PTR(skel, "read_vsyscall open_load")) + return; + + skel->bss->target_pid = getpid(); + err = read_vsyscall__attach(skel); + if (!ASSERT_EQ(err, 0, "read_vsyscall attach")) + goto out; + + /* userspace may don't have vsyscall page due to LEGACY_VSYSCALL_NONE, + * but it doesn't affect the returned error codes. + */ + skel->bss->user_ptr = (void *)VSYSCALL_ADDR; + usleep(1); + + for (i = 0; i < ARRAY_SIZE(all_read); i++) + ASSERT_EQ(skel->bss->read_ret[i], all_read[i].ret, all_read[i].name); +out: + read_vsyscall__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/read_vsyscall.c b/tools/testing/selftests/bpf/progs/read_vsyscall.c new file mode 100644 index 0000000000000..986f96687ae15 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/read_vsyscall.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2024. Huawei Technologies Co., Ltd */ +#include +#include + +#include "bpf_misc.h" + +int target_pid = 0; +void *user_ptr = 0; +int read_ret[8]; + +char _license[] SEC("license") = "GPL"; + +SEC("fentry/" SYS_PREFIX "sys_nanosleep") +int do_probe_read(void *ctx) +{ + char buf[8]; + + if ((bpf_get_current_pid_tgid() >> 32) != target_pid) + return 0; + + read_ret[0] = bpf_probe_read_kernel(buf, sizeof(buf), user_ptr); + read_ret[1] = bpf_probe_read_kernel_str(buf, sizeof(buf), user_ptr); + read_ret[2] = bpf_probe_read(buf, sizeof(buf), user_ptr); + read_ret[3] = bpf_probe_read_str(buf, sizeof(buf), user_ptr); + read_ret[4] = bpf_probe_read_user(buf, sizeof(buf), user_ptr); + read_ret[5] = bpf_probe_read_user_str(buf, sizeof(buf), user_ptr); + + return 0; +} + +SEC("fentry.s/" SYS_PREFIX "sys_nanosleep") +int do_copy_from_user(void *ctx) +{ + char buf[8]; + + if ((bpf_get_current_pid_tgid() >> 32) != target_pid) + return 0; + + read_ret[6] = bpf_copy_from_user(buf, sizeof(buf), user_ptr); + read_ret[7] = bpf_copy_from_user_task(buf, sizeof(buf), user_ptr, + bpf_get_current_task_btf(), 0); + + return 0; +} From 0281b919e175bb9c3128bd3872ac2903e9436e3f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 15 Feb 2024 13:12:17 -0800 Subject: [PATCH 07/11] bpf: Fix racing between bpf_timer_cancel_and_free and bpf_timer_cancel The following race is possible between bpf_timer_cancel_and_free and bpf_timer_cancel. It will lead a UAF on the timer->timer. bpf_timer_cancel(); spin_lock(); t = timer->time; spin_unlock(); bpf_timer_cancel_and_free(); spin_lock(); t = timer->timer; timer->timer = NULL; spin_unlock(); hrtimer_cancel(&t->timer); kfree(t); /* UAF on t */ hrtimer_cancel(&t->timer); In bpf_timer_cancel_and_free, this patch frees the timer->timer after a rcu grace period. This requires a rcu_head addition to the "struct bpf_hrtimer". Another kfree(t) happens in bpf_timer_init, this does not need a kfree_rcu because it is still under the spin_lock and timer->timer has not been visible by others yet. In bpf_timer_cancel, rcu_read_lock() is added because this helper can be used in a non rcu critical section context (e.g. from a sleepable bpf prog). Other timer->timer usages in helpers.c have been audited, bpf_timer_cancel() is the only place where timer->timer is used outside of the spin_lock. Another solution considered is to mark a t->flag in bpf_timer_cancel and clear it after hrtimer_cancel() is done. In bpf_timer_cancel_and_free, it busy waits for the flag to be cleared before kfree(t). This patch goes with a straight forward solution and frees timer->timer after a rcu grace period. Fixes: b00628b1c7d5 ("bpf: Introduce bpf timers.") Suggested-by: Alexei Starovoitov Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Hou Tao Link: https://lore.kernel.org/bpf/20240215211218.990808-1-martin.lau@linux.dev --- kernel/bpf/helpers.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index be72824f32b2c..d19cd863d294e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1101,6 +1101,7 @@ struct bpf_hrtimer { struct bpf_prog *prog; void __rcu *callback_fn; void *value; + struct rcu_head rcu; }; /* the actual struct hidden inside uapi struct bpf_timer */ @@ -1332,6 +1333,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer) if (in_nmi()) return -EOPNOTSUPP; + rcu_read_lock(); __bpf_spin_lock_irqsave(&timer->lock); t = timer->timer; if (!t) { @@ -1353,6 +1355,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer) * if it was running. */ ret = ret ?: hrtimer_cancel(&t->timer); + rcu_read_unlock(); return ret; } @@ -1407,7 +1410,7 @@ void bpf_timer_cancel_and_free(void *val) */ if (this_cpu_read(hrtimer_running) != t) hrtimer_cancel(&t->timer); - kfree(t); + kfree_rcu(t, rcu); } BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) From 3f00e4a9c96f4488a924aff4e35b77c8eced897e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 15 Feb 2024 13:12:18 -0800 Subject: [PATCH 08/11] selftests/bpf: Test racing between bpf_timer_cancel_and_free and bpf_timer_cancel This selftest is based on a Alexei's test adopted from an internal user to troubleshoot another bug. During this exercise, a separate racing bug was discovered between bpf_timer_cancel_and_free and bpf_timer_cancel. The details can be found in the previous patch. This patch is to add a selftest that can trigger the bug. I can trigger the UAF everytime in my qemu setup with KASAN. The idea is to have multiple user space threads running in a tight loop to exercise both bpf_map_update_elem (which calls into bpf_timer_cancel_and_free) and bpf_timer_cancel. Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Hou Tao Link: https://lore.kernel.org/bpf/20240215211218.990808-2-martin.lau@linux.dev --- .../testing/selftests/bpf/prog_tests/timer.c | 35 ++++++++++++++++++- tools/testing/selftests/bpf/progs/timer.c | 34 +++++++++++++++++- 2 files changed, 67 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c index 760ad96b4be09..d66687f1ee6a8 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer.c +++ b/tools/testing/selftests/bpf/prog_tests/timer.c @@ -4,10 +4,29 @@ #include "timer.skel.h" #include "timer_failure.skel.h" +#define NUM_THR 8 + +static void *spin_lock_thread(void *arg) +{ + int i, err, prog_fd = *(int *)arg; + LIBBPF_OPTS(bpf_test_run_opts, topts); + + for (i = 0; i < 10000; i++) { + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err") || + !ASSERT_OK(topts.retval, "test_run_opts retval")) + break; + } + + pthread_exit(arg); +} + static int timer(struct timer *timer_skel) { - int err, prog_fd; + int i, err, prog_fd; LIBBPF_OPTS(bpf_test_run_opts, topts); + pthread_t thread_id[NUM_THR]; + void *ret; err = timer__attach(timer_skel); if (!ASSERT_OK(err, "timer_attach")) @@ -43,6 +62,20 @@ static int timer(struct timer *timer_skel) /* check that code paths completed */ ASSERT_EQ(timer_skel->bss->ok, 1 | 2 | 4, "ok"); + prog_fd = bpf_program__fd(timer_skel->progs.race); + for (i = 0; i < NUM_THR; i++) { + err = pthread_create(&thread_id[i], NULL, + &spin_lock_thread, &prog_fd); + if (!ASSERT_OK(err, "pthread_create")) + break; + } + + while (i) { + err = pthread_join(thread_id[--i], &ret); + if (ASSERT_OK(err, "pthread_join")) + ASSERT_EQ(ret, (void *)&prog_fd, "pthread_join"); + } + return 0; } diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c index 8b946c8188c65..f615da97df263 100644 --- a/tools/testing/selftests/bpf/progs/timer.c +++ b/tools/testing/selftests/bpf/progs/timer.c @@ -51,7 +51,8 @@ struct { __uint(max_entries, 1); __type(key, int); __type(value, struct elem); -} abs_timer SEC(".maps"), soft_timer_pinned SEC(".maps"), abs_timer_pinned SEC(".maps"); +} abs_timer SEC(".maps"), soft_timer_pinned SEC(".maps"), abs_timer_pinned SEC(".maps"), + race_array SEC(".maps"); __u64 bss_data; __u64 abs_data; @@ -390,3 +391,34 @@ int BPF_PROG2(test5, int, a) return 0; } + +static int race_timer_callback(void *race_array, int *race_key, struct bpf_timer *timer) +{ + bpf_timer_start(timer, 1000000, 0); + return 0; +} + +SEC("syscall") +int race(void *ctx) +{ + struct bpf_timer *timer; + int err, race_key = 0; + struct elem init; + + __builtin_memset(&init, 0, sizeof(struct elem)); + bpf_map_update_elem(&race_array, &race_key, &init, BPF_ANY); + + timer = bpf_map_lookup_elem(&race_array, &race_key); + if (!timer) + return 1; + + err = bpf_timer_init(timer, &race_array, CLOCK_MONOTONIC); + if (err && err != -EBUSY) + return 1; + + bpf_timer_set_callback(timer, race_timer_callback); + bpf_timer_start(timer, 0, 0); + bpf_timer_cancel(timer); + + return 0; +} From 5f2ae606cb5a90839a9be9d22388c4200f820e75 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sat, 17 Feb 2024 19:41:51 +0800 Subject: [PATCH 09/11] bpf: Fix an issue due to uninitialized bpf_iter_task Failure to initialize it->pos, coupled with the presence of an invalid value in the flags variable, can lead to it->pos referencing an invalid task, potentially resulting in a kernel panic. To mitigate this risk, it's crucial to ensure proper initialization of it->pos to NULL. Fixes: ac8148d957f5 ("bpf: bpf_iter_task_next: use next_task(kit->task) rather than next_task(kit->pos)") Signed-off-by: Yafang Shao Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Oleg Nesterov Link: https://lore.kernel.org/bpf/20240217114152.1623-2-laoar.shao@gmail.com --- kernel/bpf/task_iter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index e5c3500443c6e..ec4e97c61eefe 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -978,6 +978,8 @@ __bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it, BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) != __alignof__(struct bpf_iter_task)); + kit->pos = NULL; + switch (flags) { case BPF_TASK_ITER_ALL_THREADS: case BPF_TASK_ITER_ALL_PROCS: From 5c138a8a4abe152fcbef1ed40a6a4b5727b2991b Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sat, 17 Feb 2024 19:41:52 +0800 Subject: [PATCH 10/11] selftests/bpf: Add negtive test cases for task iter Incorporate a test case to assess the handling of invalid flags or task__nullable parameters passed to bpf_iter_task_new(). Prior to the preceding commit, this scenario could potentially trigger a kernel panic. However, with the previous commit, this test case is expected to function correctly. Signed-off-by: Yafang Shao Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240217114152.1623-3-laoar.shao@gmail.com --- tools/testing/selftests/bpf/prog_tests/iters.c | 1 + tools/testing/selftests/bpf/progs/iters_task.c | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/iters.c b/tools/testing/selftests/bpf/prog_tests/iters.c index bf84d4a1d9ae2..3c440370c1f0f 100644 --- a/tools/testing/selftests/bpf/prog_tests/iters.c +++ b/tools/testing/selftests/bpf/prog_tests/iters.c @@ -193,6 +193,7 @@ static void subtest_task_iters(void) ASSERT_EQ(skel->bss->procs_cnt, 1, "procs_cnt"); ASSERT_EQ(skel->bss->threads_cnt, thread_num + 1, "threads_cnt"); ASSERT_EQ(skel->bss->proc_threads_cnt, thread_num + 1, "proc_threads_cnt"); + ASSERT_EQ(skel->bss->invalid_cnt, 0, "invalid_cnt"); pthread_mutex_unlock(&do_nothing_mutex); for (int i = 0; i < thread_num; i++) ASSERT_OK(pthread_join(thread_ids[i], &ret), "pthread_join"); diff --git a/tools/testing/selftests/bpf/progs/iters_task.c b/tools/testing/selftests/bpf/progs/iters_task.c index c9b4055cd410a..e4d53e40ff208 100644 --- a/tools/testing/selftests/bpf/progs/iters_task.c +++ b/tools/testing/selftests/bpf/progs/iters_task.c @@ -10,7 +10,7 @@ char _license[] SEC("license") = "GPL"; pid_t target_pid; -int procs_cnt, threads_cnt, proc_threads_cnt; +int procs_cnt, threads_cnt, proc_threads_cnt, invalid_cnt; void bpf_rcu_read_lock(void) __ksym; void bpf_rcu_read_unlock(void) __ksym; @@ -26,6 +26,16 @@ int iter_task_for_each_sleep(void *ctx) procs_cnt = threads_cnt = proc_threads_cnt = 0; bpf_rcu_read_lock(); + bpf_for_each(task, pos, NULL, ~0U) { + /* Below instructions shouldn't be executed for invalid flags */ + invalid_cnt++; + } + + bpf_for_each(task, pos, NULL, BPF_TASK_ITER_PROC_THREADS) { + /* Below instructions shouldn't be executed for invalid task__nullable */ + invalid_cnt++; + } + bpf_for_each(task, pos, NULL, BPF_TASK_ITER_ALL_PROCS) if (pos->pid == target_pid) procs_cnt++; From 4cd12c6065dfcdeba10f49949bffcf383b3952d8 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Mon, 19 Feb 2024 00:09:33 +0900 Subject: [PATCH 11/11] bpf, sockmap: Fix NULL pointer dereference in sk_psock_verdict_data_ready() syzbot reported the following NULL pointer dereference issue [1]: BUG: kernel NULL pointer dereference, address: 0000000000000000 [...] RIP: 0010:0x0 [...] Call Trace: sk_psock_verdict_data_ready+0x232/0x340 net/core/skmsg.c:1230 unix_stream_sendmsg+0x9b4/0x1230 net/unix/af_unix.c:2293 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:745 ____sys_sendmsg+0x525/0x7d0 net/socket.c:2584 ___sys_sendmsg net/socket.c:2638 [inline] __sys_sendmsg+0x2b0/0x3a0 net/socket.c:2667 do_syscall_64+0xf9/0x240 entry_SYSCALL_64_after_hwframe+0x6f/0x77 If sk_psock_verdict_data_ready() and sk_psock_stop_verdict() are called concurrently, psock->saved_data_ready can be NULL, causing the above issue. This patch fixes this issue by calling the appropriate data ready function using the sk_psock_data_ready() helper and protecting it from concurrency with sk->sk_callback_lock. Fixes: 6df7f764cd3c ("bpf, sockmap: Wake up polling after data copy") Reported-by: syzbot+fd7b34375c1c8ce29c93@syzkaller.appspotmail.com Signed-off-by: Shigeru Yoshida Signed-off-by: Daniel Borkmann Tested-by: syzbot+fd7b34375c1c8ce29c93@syzkaller.appspotmail.com Acked-by: John Fastabend Closes: https://syzkaller.appspot.com/bug?extid=fd7b34375c1c8ce29c93 [1] Link: https://lore.kernel.org/bpf/20240218150933.6004-1-syoshida@redhat.com --- net/core/skmsg.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 93ecfceac1bc4..4d75ef9d24bfa 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1226,8 +1226,11 @@ static void sk_psock_verdict_data_ready(struct sock *sk) rcu_read_lock(); psock = sk_psock(sk); - if (psock) - psock->saved_data_ready(sk); + if (psock) { + read_lock_bh(&sk->sk_callback_lock); + sk_psock_data_ready(sk, psock); + read_unlock_bh(&sk->sk_callback_lock); + } rcu_read_unlock(); } }