From 56f0f84e69c7a7f229dfa524b13b0ceb6ce9b09e Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Sat, 15 Jun 2019 22:53:48 +0000 Subject: [PATCH 01/14] bpf: fix the check that forwarding is enabled in bpf_ipv6_fib_lookup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bpf_ipv6_fib_lookup function should return BPF_FIB_LKUP_RET_FWD_DISABLED when forwarding is disabled for the input device. However instead of checking if forwarding is enabled on the input device, it checked the global net->ipv6.devconf_all->forwarding flag. Change it to behave as expected. Fixes: 87f5fc7e48dd ("bpf: Provide helper to do forwarding lookups in kernel FIB table") Signed-off-by: Anton Protopopov Acked-by: Toke Høiland-Jørgensen Reviewed-by: David Ahern Signed-off-by: Daniel Borkmann --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index f615e42cf4eff..3fdf1b21be366 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4737,7 +4737,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, return -ENODEV; idev = __in6_dev_get_safely(dev); - if (unlikely(!idev || !net->ipv6.devconf_all->forwarding)) + if (unlikely(!idev || !idev->cnf.forwarding)) return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { From e4f07120210a1794c1f1ae64d209a2fbc7bd2682 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 19 Jun 2019 12:01:05 -0700 Subject: [PATCH 02/14] bpf: fix NULL deref in btf_type_is_resolve_source_only Commit 1dc92851849c ("bpf: kernel side support for BTF Var and DataSec") added invocations of btf_type_is_resolve_source_only before btf_type_nosize_or_null which checks for the NULL pointer. Swap the order of btf_type_nosize_or_null and btf_type_is_resolve_source_only to make sure the do the NULL pointer check first. Fixes: 1dc92851849c ("bpf: kernel side support for BTF Var and DataSec") Reported-by: syzbot Signed-off-by: Stanislav Fomichev Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index cad09858a5f25..546ebee39e2af 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1928,8 +1928,8 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->index_type */ index_type_id = array->index_type; index_type = btf_type_by_id(btf, index_type_id); - if (btf_type_is_resolve_source_only(index_type) || - btf_type_nosize_or_null(index_type)) { + if (btf_type_nosize_or_null(index_type) || + btf_type_is_resolve_source_only(index_type)) { btf_verifier_log_type(env, v->t, "Invalid index"); return -EINVAL; } @@ -1948,8 +1948,8 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->type */ elem_type_id = array->type; elem_type = btf_type_by_id(btf, elem_type_id); - if (btf_type_is_resolve_source_only(elem_type) || - btf_type_nosize_or_null(elem_type)) { + if (btf_type_nosize_or_null(elem_type) || + btf_type_is_resolve_source_only(elem_type)) { btf_verifier_log_type(env, v->t, "Invalid elem"); return -EINVAL; @@ -2170,8 +2170,8 @@ static int btf_struct_resolve(struct btf_verifier_env *env, const struct btf_type *member_type = btf_type_by_id(env->btf, member_type_id); - if (btf_type_is_resolve_source_only(member_type) || - btf_type_nosize_or_null(member_type)) { + if (btf_type_nosize_or_null(member_type) || + btf_type_is_resolve_source_only(member_type)) { btf_verifier_log_member(env, v->t, member, "Invalid member"); return -EINVAL; From 20f6239d494b2ec7fcc9930eaab3e736a6a9419e Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Thu, 20 Jun 2019 15:58:15 +0900 Subject: [PATCH 03/14] samples/bpf: xdp_redirect, correctly get dummy program id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we terminate xdp_redirect, it ends up with following message: "Program on iface OUT changed, not removing" This results in dummy prog still attached to OUT interface. It is because signal handler checks if the programs are the same that we had attached. But while fetching dummy_prog_id, current code uses prog_fd instead of dummy_prog_fd. This patch passes the correct fd. Fixes: 3b7a8ec2dec3 ("samples/bpf: Check the prog id before exiting") Signed-off-by: Prashant Bhole Acked-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann --- samples/bpf/xdp_redirect_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c index e9054c0269ff5..1299e0f61dad3 100644 --- a/samples/bpf/xdp_redirect_user.c +++ b/samples/bpf/xdp_redirect_user.c @@ -197,7 +197,7 @@ int main(int argc, char **argv) } memset(&info, 0, sizeof(info)); - ret = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); + ret = bpf_obj_get_info_by_fd(dummy_prog_fd, &info, &info_len); if (ret) { printf("can't get prog info - %s\n", strerror(errno)); return ret; From 0eb84fa6e6163fd5e238661b376c959e8cdd549a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 24 Jun 2019 07:24:55 +0200 Subject: [PATCH 04/14] MAINTAINERS: add reviewer to maintainers entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Jonathan Lemon has volunteered as an official AF_XDP reviewer. Thank you, Jonathan! Signed-off-by: Björn Töpel Acked-by: Jonathan Lemon Signed-off-by: Daniel Borkmann --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 36a84614d6c3b..df039d0c245e0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17265,6 +17265,7 @@ N: xdp XDP SOCKETS (AF_XDP) M: Björn Töpel M: Magnus Karlsson +R: Jonathan Lemon L: netdev@vger.kernel.org L: bpf@vger.kernel.org S: Maintained From f7019b7b0ad14bde732b8953161994edfc384953 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 25 Jun 2019 11:23:52 -0700 Subject: [PATCH 05/14] xsk: Properly terminate assignment in xskq_produce_flush_desc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clang warns: In file included from net/xdp/xsk_queue.c:10: net/xdp/xsk_queue.h:292:2: warning: expression result unused [-Wunused-value] WRITE_ONCE(q->ring->producer, q->prod_tail); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ include/linux/compiler.h:284:6: note: expanded from macro 'WRITE_ONCE' __u.__val; \ ~~~ ^~~~~ 1 warning generated. The q->prod_tail assignment has a comma at the end, not a semi-colon. Fix that so clang no longer warns and everything works as expected. Fixes: c497176cb2e4 ("xsk: add Rx receive functions and poll support") Link: https://github.com/ClangBuiltLinux/linux/issues/544 Signed-off-by: Nathan Chancellor Acked-by: Nick Desaulniers Acked-by: Jonathan Lemon Acked-by: Björn Töpel Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- net/xdp/xsk_queue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 88b9ae24658d1..cba4a640d5e84 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -288,7 +288,7 @@ static inline void xskq_produce_flush_desc(struct xsk_queue *q) /* Order producer and data */ smp_wmb(); /* B, matches C */ - q->prod_tail = q->prod_head, + q->prod_tail = q->prod_head; WRITE_ONCE(q->ring->producer, q->prod_tail); } From 6c6874f401e5a0caab3b6a0663169e1fb5e930bb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 25 Jun 2019 09:56:31 -0700 Subject: [PATCH 06/14] tools: bpftool: use correct argument in cgroup errors cgroup code tries to use argv[0] as the cgroup path, but if it fails uses argv[1] to report errors. Fixes: 5ccda64d38cc ("bpftool: implement cgroup bpf operations") Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Roman Gushchin Signed-off-by: Daniel Borkmann --- tools/bpf/bpftool/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c index 73ec8ea33fb43..a13fb7265d1a1 100644 --- a/tools/bpf/bpftool/cgroup.c +++ b/tools/bpf/bpftool/cgroup.c @@ -168,7 +168,7 @@ static int do_show(int argc, char **argv) cgroup_fd = open(argv[0], O_RDONLY); if (cgroup_fd < 0) { - p_err("can't open cgroup %s", argv[1]); + p_err("can't open cgroup %s", argv[0]); goto exit; } @@ -356,7 +356,7 @@ static int do_attach(int argc, char **argv) cgroup_fd = open(argv[0], O_RDONLY); if (cgroup_fd < 0) { - p_err("can't open cgroup %s", argv[1]); + p_err("can't open cgroup %s", argv[0]); goto exit; } @@ -414,7 +414,7 @@ static int do_detach(int argc, char **argv) cgroup_fd = open(argv[0], O_RDONLY); if (cgroup_fd < 0) { - p_err("can't open cgroup %s", argv[1]); + p_err("can't open cgroup %s", argv[0]); goto exit; } From 75672dda27bd00109a84cd975c17949ad9c45663 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Tue, 25 Jun 2019 17:41:50 +0100 Subject: [PATCH 07/14] bpf: fix BPF_ALU32 | BPF_ARSH on BE arches Yauheni reported the following code do not work correctly on BE arches: ALU_ARSH_X: DST = (u64) (u32) ((*(s32 *) &DST) >> SRC); CONT; ALU_ARSH_K: DST = (u64) (u32) ((*(s32 *) &DST) >> IMM); CONT; and are causing failure of test_verifier test 'arsh32 on imm 2' on BE arches. The code is taking address and interpreting memory directly, so is not endianness neutral. We should instead perform standard C type casting on the variable. A u64 to s32 conversion will drop the high 32-bit and reserve the low 32-bit as signed integer, this is all we want. Fixes: 2dc6b100f928 ("bpf: interpreter support BPF_ALU | BPF_ARSH") Reported-by: Yauheni Kaliuta Reviewed-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Jiong Wang Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 080e2bb644ccd..f2148db91439e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1364,10 +1364,10 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) insn++; CONT; ALU_ARSH_X: - DST = (u64) (u32) ((*(s32 *) &DST) >> SRC); + DST = (u64) (u32) (((s32) DST) >> SRC); CONT; ALU_ARSH_K: - DST = (u64) (u32) ((*(s32 *) &DST) >> IMM); + DST = (u64) (u32) (((s32) DST) >> IMM); CONT; ALU64_ARSH_X: (*(s64 *) &DST) >>= SRC; From 0472301a28f6cf53a6bc5783e48a2d0bbff4682f Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Fri, 28 Jun 2019 07:08:45 +0300 Subject: [PATCH 08/14] bpf: fix uapi bpf_prog_info fields alignment Merge commit 1c8c5a9d38f60 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next") undid the fix from commit 36f9814a494 ("bpf: fix uapi hole for 32 bit compat applications") by taking the gpl_compatible 1-bit field definition from commit b85fab0e67b162 ("bpf: Add gpl_compatible flag to struct bpf_prog_info") as is. That breaks architectures with 16-bit alignment like m68k. Add 31-bit pad after gpl_compatible to restore alignment of following fields. Thanks to Dmitry V. Levin his analysis of this bug history. Signed-off-by: Baruch Siach Acked-by: Song Liu Cc: Jiri Olsa Cc: Daniel Borkmann Cc: Geert Uytterhoeven Cc: Linus Torvalds Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 1 + tools/include/uapi/linux/bpf.h | 1 + 2 files changed, 2 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a8b823c30b434..29a5bc3d5c666 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3143,6 +3143,7 @@ struct bpf_prog_info { char name[BPF_OBJ_NAME_LEN]; __u32 ifindex; __u32 gpl_compatible:1; + __u32 :31; /* alignment pad */ __u64 netns_dev; __u64 netns_ino; __u32 nr_jited_ksyms; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a8b823c30b434..29a5bc3d5c666 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3143,6 +3143,7 @@ struct bpf_prog_info { char name[BPF_OBJ_NAME_LEN]; __u32 ifindex; __u32 gpl_compatible:1; + __u32 :31; /* alignment pad */ __u64 netns_dev; __u64 netns_ino; __u32 nr_jited_ksyms; From 68a8357ec15bdce55266e9fba8b8b3b8143fa7d2 Mon Sep 17 00:00:00 2001 From: Luke Nelson Date: Fri, 28 Jun 2019 22:57:49 -0700 Subject: [PATCH 09/14] bpf, x32: Fix bug with ALU64 {LSH, RSH, ARSH} BPF_X shift by 0 The current x32 BPF JIT for shift operations is not correct when the shift amount in a register is 0. The expected behavior is a no-op, whereas the current implementation changes bits in the destination register. The following example demonstrates the bug. The expected result of this program is 1, but the current JITed code returns 2. r0 = 1 r1 = 1 r2 = 0 r1 <<= r2 if r1 == 1 goto end r0 = 2 end: exit The bug is caused by an incorrect assumption by the JIT that a shift by 32 clear the register. On x32 however, shifts use the lower 5 bits of the source, making a shift by 32 equivalent to a shift by 0. This patch fixes the bug using double-precision shifts, which also simplifies the code. Fixes: 03f5781be2c7 ("bpf, x86_32: add eBPF JIT compiler for ia32") Co-developed-by: Xi Wang Signed-off-by: Xi Wang Signed-off-by: Luke Nelson Signed-off-by: Daniel Borkmann --- arch/x86/net/bpf_jit_comp32.c | 221 ++++------------------------------ 1 file changed, 23 insertions(+), 198 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index b29e82f190c76..f34ef513f4f9a 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -724,9 +724,6 @@ static inline void emit_ia32_lsh_r64(const u8 dst[], const u8 src[], { u8 *prog = *pprog; int cnt = 0; - static int jmp_label1 = -1; - static int jmp_label2 = -1; - static int jmp_label3 = -1; u8 dreg_lo = dstk ? IA32_EAX : dst_lo; u8 dreg_hi = dstk ? IA32_EDX : dst_hi; @@ -745,79 +742,23 @@ static inline void emit_ia32_lsh_r64(const u8 dst[], const u8 src[], /* mov ecx,src_lo */ EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX)); - /* cmp ecx,32 */ - EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32); - /* Jumps when >= 32 */ - if (is_imm8(jmp_label(jmp_label1, 2))) - EMIT2(IA32_JAE, jmp_label(jmp_label1, 2)); - else - EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6)); - - /* < 32 */ - /* shl dreg_hi,cl */ - EMIT2(0xD3, add_1reg(0xE0, dreg_hi)); - /* mov ebx,dreg_lo */ - EMIT2(0x8B, add_2reg(0xC0, dreg_lo, IA32_EBX)); + /* shld dreg_hi,dreg_lo,cl */ + EMIT3(0x0F, 0xA5, add_2reg(0xC0, dreg_hi, dreg_lo)); /* shl dreg_lo,cl */ EMIT2(0xD3, add_1reg(0xE0, dreg_lo)); - /* IA32_ECX = -IA32_ECX + 32 */ - /* neg ecx */ - EMIT2(0xF7, add_1reg(0xD8, IA32_ECX)); - /* add ecx,32 */ - EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32); - - /* shr ebx,cl */ - EMIT2(0xD3, add_1reg(0xE8, IA32_EBX)); - /* or dreg_hi,ebx */ - EMIT2(0x09, add_2reg(0xC0, dreg_hi, IA32_EBX)); - - /* goto out; */ - if (is_imm8(jmp_label(jmp_label3, 2))) - EMIT2(0xEB, jmp_label(jmp_label3, 2)); - else - EMIT1_off32(0xE9, jmp_label(jmp_label3, 5)); - - /* >= 32 */ - if (jmp_label1 == -1) - jmp_label1 = cnt; + /* if ecx >= 32, mov dreg_lo into dreg_hi and clear dreg_lo */ - /* cmp ecx,64 */ - EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64); - /* Jumps when >= 64 */ - if (is_imm8(jmp_label(jmp_label2, 2))) - EMIT2(IA32_JAE, jmp_label(jmp_label2, 2)); - else - EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6)); + /* cmp ecx,32 */ + EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32); + /* skip the next two instructions (4 bytes) when < 32 */ + EMIT2(IA32_JB, 4); - /* >= 32 && < 64 */ - /* sub ecx,32 */ - EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32); - /* shl dreg_lo,cl */ - EMIT2(0xD3, add_1reg(0xE0, dreg_lo)); /* mov dreg_hi,dreg_lo */ EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo)); - /* xor dreg_lo,dreg_lo */ EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo)); - /* goto out; */ - if (is_imm8(jmp_label(jmp_label3, 2))) - EMIT2(0xEB, jmp_label(jmp_label3, 2)); - else - EMIT1_off32(0xE9, jmp_label(jmp_label3, 5)); - - /* >= 64 */ - if (jmp_label2 == -1) - jmp_label2 = cnt; - /* xor dreg_lo,dreg_lo */ - EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo)); - /* xor dreg_hi,dreg_hi */ - EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); - - if (jmp_label3 == -1) - jmp_label3 = cnt; - if (dstk) { /* mov dword ptr [ebp+off],dreg_lo */ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo), @@ -836,9 +777,6 @@ static inline void emit_ia32_arsh_r64(const u8 dst[], const u8 src[], { u8 *prog = *pprog; int cnt = 0; - static int jmp_label1 = -1; - static int jmp_label2 = -1; - static int jmp_label3 = -1; u8 dreg_lo = dstk ? IA32_EAX : dst_lo; u8 dreg_hi = dstk ? IA32_EDX : dst_hi; @@ -857,78 +795,22 @@ static inline void emit_ia32_arsh_r64(const u8 dst[], const u8 src[], /* mov ecx,src_lo */ EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX)); - /* cmp ecx,32 */ - EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32); - /* Jumps when >= 32 */ - if (is_imm8(jmp_label(jmp_label1, 2))) - EMIT2(IA32_JAE, jmp_label(jmp_label1, 2)); - else - EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6)); - - /* < 32 */ - /* lshr dreg_lo,cl */ - EMIT2(0xD3, add_1reg(0xE8, dreg_lo)); - /* mov ebx,dreg_hi */ - EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX)); - /* ashr dreg_hi,cl */ + /* shrd dreg_lo,dreg_hi,cl */ + EMIT3(0x0F, 0xAD, add_2reg(0xC0, dreg_lo, dreg_hi)); + /* sar dreg_hi,cl */ EMIT2(0xD3, add_1reg(0xF8, dreg_hi)); - /* IA32_ECX = -IA32_ECX + 32 */ - /* neg ecx */ - EMIT2(0xF7, add_1reg(0xD8, IA32_ECX)); - /* add ecx,32 */ - EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32); - - /* shl ebx,cl */ - EMIT2(0xD3, add_1reg(0xE0, IA32_EBX)); - /* or dreg_lo,ebx */ - EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX)); - - /* goto out; */ - if (is_imm8(jmp_label(jmp_label3, 2))) - EMIT2(0xEB, jmp_label(jmp_label3, 2)); - else - EMIT1_off32(0xE9, jmp_label(jmp_label3, 5)); - - /* >= 32 */ - if (jmp_label1 == -1) - jmp_label1 = cnt; + /* if ecx >= 32, mov dreg_hi to dreg_lo and set/clear dreg_hi depending on sign */ - /* cmp ecx,64 */ - EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64); - /* Jumps when >= 64 */ - if (is_imm8(jmp_label(jmp_label2, 2))) - EMIT2(IA32_JAE, jmp_label(jmp_label2, 2)); - else - EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6)); + /* cmp ecx,32 */ + EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32); + /* skip the next two instructions (5 bytes) when < 32 */ + EMIT2(IA32_JB, 5); - /* >= 32 && < 64 */ - /* sub ecx,32 */ - EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32); - /* ashr dreg_hi,cl */ - EMIT2(0xD3, add_1reg(0xF8, dreg_hi)); /* mov dreg_lo,dreg_hi */ EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi)); - - /* ashr dreg_hi,imm8 */ - EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31); - - /* goto out; */ - if (is_imm8(jmp_label(jmp_label3, 2))) - EMIT2(0xEB, jmp_label(jmp_label3, 2)); - else - EMIT1_off32(0xE9, jmp_label(jmp_label3, 5)); - - /* >= 64 */ - if (jmp_label2 == -1) - jmp_label2 = cnt; - /* ashr dreg_hi,imm8 */ + /* sar dreg_hi,31 */ EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31); - /* mov dreg_lo,dreg_hi */ - EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi)); - - if (jmp_label3 == -1) - jmp_label3 = cnt; if (dstk) { /* mov dword ptr [ebp+off],dreg_lo */ @@ -948,9 +830,6 @@ static inline void emit_ia32_rsh_r64(const u8 dst[], const u8 src[], bool dstk, { u8 *prog = *pprog; int cnt = 0; - static int jmp_label1 = -1; - static int jmp_label2 = -1; - static int jmp_label3 = -1; u8 dreg_lo = dstk ? IA32_EAX : dst_lo; u8 dreg_hi = dstk ? IA32_EDX : dst_hi; @@ -969,77 +848,23 @@ static inline void emit_ia32_rsh_r64(const u8 dst[], const u8 src[], bool dstk, /* mov ecx,src_lo */ EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX)); - /* cmp ecx,32 */ - EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32); - /* Jumps when >= 32 */ - if (is_imm8(jmp_label(jmp_label1, 2))) - EMIT2(IA32_JAE, jmp_label(jmp_label1, 2)); - else - EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6)); - - /* < 32 */ - /* lshr dreg_lo,cl */ - EMIT2(0xD3, add_1reg(0xE8, dreg_lo)); - /* mov ebx,dreg_hi */ - EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX)); + /* shrd dreg_lo,dreg_hi,cl */ + EMIT3(0x0F, 0xAD, add_2reg(0xC0, dreg_lo, dreg_hi)); /* shr dreg_hi,cl */ EMIT2(0xD3, add_1reg(0xE8, dreg_hi)); - /* IA32_ECX = -IA32_ECX + 32 */ - /* neg ecx */ - EMIT2(0xF7, add_1reg(0xD8, IA32_ECX)); - /* add ecx,32 */ - EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32); - - /* shl ebx,cl */ - EMIT2(0xD3, add_1reg(0xE0, IA32_EBX)); - /* or dreg_lo,ebx */ - EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX)); + /* if ecx >= 32, mov dreg_hi to dreg_lo and clear dreg_hi */ - /* goto out; */ - if (is_imm8(jmp_label(jmp_label3, 2))) - EMIT2(0xEB, jmp_label(jmp_label3, 2)); - else - EMIT1_off32(0xE9, jmp_label(jmp_label3, 5)); - - /* >= 32 */ - if (jmp_label1 == -1) - jmp_label1 = cnt; - /* cmp ecx,64 */ - EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64); - /* Jumps when >= 64 */ - if (is_imm8(jmp_label(jmp_label2, 2))) - EMIT2(IA32_JAE, jmp_label(jmp_label2, 2)); - else - EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6)); + /* cmp ecx,32 */ + EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32); + /* skip the next two instructions (4 bytes) when < 32 */ + EMIT2(IA32_JB, 4); - /* >= 32 && < 64 */ - /* sub ecx,32 */ - EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32); - /* shr dreg_hi,cl */ - EMIT2(0xD3, add_1reg(0xE8, dreg_hi)); /* mov dreg_lo,dreg_hi */ EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi)); /* xor dreg_hi,dreg_hi */ EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); - /* goto out; */ - if (is_imm8(jmp_label(jmp_label3, 2))) - EMIT2(0xEB, jmp_label(jmp_label3, 2)); - else - EMIT1_off32(0xE9, jmp_label(jmp_label3, 5)); - - /* >= 64 */ - if (jmp_label2 == -1) - jmp_label2 = cnt; - /* xor dreg_lo,dreg_lo */ - EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo)); - /* xor dreg_hi,dreg_hi */ - EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); - - if (jmp_label3 == -1) - jmp_label3 = cnt; - if (dstk) { /* mov dword ptr [ebp+off],dreg_lo */ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo), From 6fa632e719eec4d1b1ebf3ddc0b2d667997b057b Mon Sep 17 00:00:00 2001 From: Luke Nelson Date: Fri, 28 Jun 2019 22:57:50 -0700 Subject: [PATCH 10/14] bpf, x32: Fix bug with ALU64 {LSH, RSH, ARSH} BPF_K shift by 0 The current x32 BPF JIT does not correctly compile shift operations when the immediate shift amount is 0. The expected behavior is for this to be a no-op. The following program demonstrates the bug. The expexceted result is 1, but the current JITed code returns 2. r0 = 1 r1 = 1 r1 <<= 0 if r1 == 1 goto end r0 = 2 end: exit This patch simplifies the code and fixes the bug. Fixes: 03f5781be2c7 ("bpf, x86_32: add eBPF JIT compiler for ia32") Co-developed-by: Xi Wang Signed-off-by: Xi Wang Signed-off-by: Luke Nelson Signed-off-by: Daniel Borkmann --- arch/x86/net/bpf_jit_comp32.c | 63 ++++------------------------------- 1 file changed, 6 insertions(+), 57 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index f34ef513f4f9a..1d12d2174085c 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -894,27 +894,10 @@ static inline void emit_ia32_lsh_i64(const u8 dst[], const u32 val, } /* Do LSH operation */ if (val < 32) { - /* shl dreg_hi,imm8 */ - EMIT3(0xC1, add_1reg(0xE0, dreg_hi), val); - /* mov ebx,dreg_lo */ - EMIT2(0x8B, add_2reg(0xC0, dreg_lo, IA32_EBX)); + /* shld dreg_hi,dreg_lo,imm8 */ + EMIT4(0x0F, 0xA4, add_2reg(0xC0, dreg_hi, dreg_lo), val); /* shl dreg_lo,imm8 */ EMIT3(0xC1, add_1reg(0xE0, dreg_lo), val); - - /* IA32_ECX = 32 - val */ - /* mov ecx,val */ - EMIT2(0xB1, val); - /* movzx ecx,ecx */ - EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX)); - /* neg ecx */ - EMIT2(0xF7, add_1reg(0xD8, IA32_ECX)); - /* add ecx,32 */ - EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32); - - /* shr ebx,cl */ - EMIT2(0xD3, add_1reg(0xE8, IA32_EBX)); - /* or dreg_hi,ebx */ - EMIT2(0x09, add_2reg(0xC0, dreg_hi, IA32_EBX)); } else if (val >= 32 && val < 64) { u32 value = val - 32; @@ -960,27 +943,10 @@ static inline void emit_ia32_rsh_i64(const u8 dst[], const u32 val, /* Do RSH operation */ if (val < 32) { - /* shr dreg_lo,imm8 */ - EMIT3(0xC1, add_1reg(0xE8, dreg_lo), val); - /* mov ebx,dreg_hi */ - EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX)); + /* shrd dreg_lo,dreg_hi,imm8 */ + EMIT4(0x0F, 0xAC, add_2reg(0xC0, dreg_lo, dreg_hi), val); /* shr dreg_hi,imm8 */ EMIT3(0xC1, add_1reg(0xE8, dreg_hi), val); - - /* IA32_ECX = 32 - val */ - /* mov ecx,val */ - EMIT2(0xB1, val); - /* movzx ecx,ecx */ - EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX)); - /* neg ecx */ - EMIT2(0xF7, add_1reg(0xD8, IA32_ECX)); - /* add ecx,32 */ - EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32); - - /* shl ebx,cl */ - EMIT2(0xD3, add_1reg(0xE0, IA32_EBX)); - /* or dreg_lo,ebx */ - EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX)); } else if (val >= 32 && val < 64) { u32 value = val - 32; @@ -1025,27 +991,10 @@ static inline void emit_ia32_arsh_i64(const u8 dst[], const u32 val, } /* Do RSH operation */ if (val < 32) { - /* shr dreg_lo,imm8 */ - EMIT3(0xC1, add_1reg(0xE8, dreg_lo), val); - /* mov ebx,dreg_hi */ - EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX)); + /* shrd dreg_lo,dreg_hi,imm8 */ + EMIT4(0x0F, 0xAC, add_2reg(0xC0, dreg_lo, dreg_hi), val); /* ashr dreg_hi,imm8 */ EMIT3(0xC1, add_1reg(0xF8, dreg_hi), val); - - /* IA32_ECX = 32 - val */ - /* mov ecx,val */ - EMIT2(0xB1, val); - /* movzx ecx,ecx */ - EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX)); - /* neg ecx */ - EMIT2(0xF7, add_1reg(0xD8, IA32_ECX)); - /* add ecx,32 */ - EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32); - - /* shl ebx,cl */ - EMIT2(0xD3, add_1reg(0xE0, IA32_EBX)); - /* or dreg_lo,ebx */ - EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX)); } else if (val >= 32 && val < 64) { u32 value = val - 32; From ac8786c72eba67dfc8ae751a75c586289a1b9b1b Mon Sep 17 00:00:00 2001 From: Luke Nelson Date: Fri, 28 Jun 2019 22:57:51 -0700 Subject: [PATCH 11/14] selftests: bpf: add tests for shifts by zero There are currently no tests for ALU64 shift operations when the shift amount is 0. This adds 6 new tests to make sure they are equivalent to a no-op. The x32 JIT had such bugs that could have been caught by these tests. Cc: Xi Wang Signed-off-by: Luke Nelson Signed-off-by: Daniel Borkmann --- .../selftests/bpf/verifier/basic_instr.c | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/basic_instr.c b/tools/testing/selftests/bpf/verifier/basic_instr.c index ed91a7b9a4566..071dbc889e8c6 100644 --- a/tools/testing/selftests/bpf/verifier/basic_instr.c +++ b/tools/testing/selftests/bpf/verifier/basic_instr.c @@ -90,6 +90,91 @@ }, .result = ACCEPT, }, +{ + "lsh64 by 0 imm", + .insns = { + BPF_LD_IMM64(BPF_REG_0, 1), + BPF_LD_IMM64(BPF_REG_1, 1), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 1, 1), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 1, +}, +{ + "rsh64 by 0 imm", + .insns = { + BPF_LD_IMM64(BPF_REG_0, 1), + BPF_LD_IMM64(BPF_REG_1, 0x100000000LL), + BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 0), + BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 1, +}, +{ + "arsh64 by 0 imm", + .insns = { + BPF_LD_IMM64(BPF_REG_0, 1), + BPF_LD_IMM64(BPF_REG_1, 0x100000000LL), + BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_1, 0), + BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 1, +}, +{ + "lsh64 by 0 reg", + .insns = { + BPF_LD_IMM64(BPF_REG_0, 1), + BPF_LD_IMM64(BPF_REG_1, 1), + BPF_LD_IMM64(BPF_REG_2, 0), + BPF_ALU64_REG(BPF_LSH, BPF_REG_1, BPF_REG_2), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 1, 1), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 1, +}, +{ + "rsh64 by 0 reg", + .insns = { + BPF_LD_IMM64(BPF_REG_0, 1), + BPF_LD_IMM64(BPF_REG_1, 0x100000000LL), + BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_1), + BPF_LD_IMM64(BPF_REG_3, 0), + BPF_ALU64_REG(BPF_RSH, BPF_REG_1, BPF_REG_3), + BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 1, +}, +{ + "arsh64 by 0 reg", + .insns = { + BPF_LD_IMM64(BPF_REG_0, 1), + BPF_LD_IMM64(BPF_REG_1, 0x100000000LL), + BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_1), + BPF_LD_IMM64(BPF_REG_3, 0), + BPF_ALU64_REG(BPF_ARSH, BPF_REG_1, BPF_REG_3), + BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 1, +}, { "invalid 64-bit BPF_END", .insns = { From 11aca65ec4db09527d3e9b6b41a0615b7da4386b Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Tue, 2 Jul 2019 19:40:31 +0200 Subject: [PATCH 12/14] selftests: bpf: fix inlines in test_lwt_seg6local Selftests are reporting this failure in test_lwt_seg6local.sh: + ip netns exec ns2 ip -6 route add fb00::6 encap bpf in obj test_lwt_seg6local.o sec encap_srh dev veth2 Error fetching program/map! Failed to parse eBPF program: Operation not permitted The problem is __attribute__((always_inline)) alone is not enough to prevent clang from inserting those functions in .text. In that case, .text is not marked as relocateable. See the output of objdump -h test_lwt_seg6local.o: Idx Name Size VMA LMA File off Algn 0 .text 00003530 0000000000000000 0000000000000000 00000040 2**3 CONTENTS, ALLOC, LOAD, READONLY, CODE This causes the iproute bpf loader to fail in bpf_fetch_prog_sec: bpf_has_call_data returns true but bpf_fetch_prog_relo fails as there's no relocateable .text section in the file. To fix this, convert to 'static __always_inline'. v2: Use 'static __always_inline' instead of 'static inline __attribute__((always_inline))' Fixes: c99a84eac026 ("selftests/bpf: test for seg6local End.BPF action") Signed-off-by: Jiri Benc Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- .../testing/selftests/bpf/progs/test_lwt_seg6local.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_lwt_seg6local.c b/tools/testing/selftests/bpf/progs/test_lwt_seg6local.c index 0575751bc1bc4..e2f6ed0a583de 100644 --- a/tools/testing/selftests/bpf/progs/test_lwt_seg6local.c +++ b/tools/testing/selftests/bpf/progs/test_lwt_seg6local.c @@ -61,7 +61,7 @@ struct sr6_tlv_t { unsigned char value[0]; } BPF_PACKET_HEADER; -__attribute__((always_inline)) struct ip6_srh_t *get_srh(struct __sk_buff *skb) +static __always_inline struct ip6_srh_t *get_srh(struct __sk_buff *skb) { void *cursor, *data_end; struct ip6_srh_t *srh; @@ -95,7 +95,7 @@ __attribute__((always_inline)) struct ip6_srh_t *get_srh(struct __sk_buff *skb) return srh; } -__attribute__((always_inline)) +static __always_inline int update_tlv_pad(struct __sk_buff *skb, uint32_t new_pad, uint32_t old_pad, uint32_t pad_off) { @@ -125,7 +125,7 @@ int update_tlv_pad(struct __sk_buff *skb, uint32_t new_pad, return 0; } -__attribute__((always_inline)) +static __always_inline int is_valid_tlv_boundary(struct __sk_buff *skb, struct ip6_srh_t *srh, uint32_t *tlv_off, uint32_t *pad_size, uint32_t *pad_off) @@ -184,7 +184,7 @@ int is_valid_tlv_boundary(struct __sk_buff *skb, struct ip6_srh_t *srh, return 0; } -__attribute__((always_inline)) +static __always_inline int add_tlv(struct __sk_buff *skb, struct ip6_srh_t *srh, uint32_t tlv_off, struct sr6_tlv_t *itlv, uint8_t tlv_size) { @@ -228,7 +228,7 @@ int add_tlv(struct __sk_buff *skb, struct ip6_srh_t *srh, uint32_t tlv_off, return update_tlv_pad(skb, new_pad, pad_size, pad_off); } -__attribute__((always_inline)) +static __always_inline int delete_tlv(struct __sk_buff *skb, struct ip6_srh_t *srh, uint32_t tlv_off) { @@ -266,7 +266,7 @@ int delete_tlv(struct __sk_buff *skb, struct ip6_srh_t *srh, return update_tlv_pad(skb, new_pad, pad_size, pad_off); } -__attribute__((always_inline)) +static __always_inline int has_egr_tlv(struct __sk_buff *skb, struct ip6_srh_t *srh) { int tlv_offset = sizeof(struct ip6_t) + sizeof(struct ip6_srh_t) + From 162c820ed8965bf94d2685f97388aea5aee9e258 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 28 Jun 2019 11:04:06 +0300 Subject: [PATCH 13/14] xdp: hold device for umem regardless of zero-copy mode Device pointer stored in umem regardless of zero-copy mode, so we heed to hold the device in all cases. Fixes: c9b47cc1fabc ("xsk: fix bug when trying to use both copy and zero-copy on one queue id") Signed-off-by: Ilya Maximets Acked-by: Jonathan Lemon Signed-off-by: Daniel Borkmann --- net/xdp/xdp_umem.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 9c6de4f114f84..267b82a4cbcf8 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -105,6 +105,9 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, umem->dev = dev; umem->queue_id = queue_id; + + dev_hold(dev); + if (force_copy) /* For copy-mode, we are done. */ goto out_rtnl_unlock; @@ -124,7 +127,6 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, goto err_unreg_umem; rtnl_unlock(); - dev_hold(dev); umem->zc = true; return 0; @@ -163,10 +165,9 @@ static void xdp_umem_clear_dev(struct xdp_umem *umem) xdp_clear_umem_at_qid(umem->dev, umem->queue_id); rtnl_unlock(); - if (umem->zc) { - dev_put(umem->dev); - umem->zc = false; - } + dev_put(umem->dev); + umem->dev = NULL; + umem->zc = false; } static void xdp_umem_unpin_pages(struct xdp_umem *umem) From 455302d1c9ae9318660aaeb9748a01ff414c9741 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 28 Jun 2019 11:04:07 +0300 Subject: [PATCH 14/14] xdp: fix hang while unregistering device bound to xdp socket Device that bound to XDP socket will not have zero refcount until the userspace application will not close it. This leads to hang inside 'netdev_wait_allrefs()' if device unregistering requested: # ip link del p1 < hang on recvmsg on netlink socket > # ps -x | grep ip 5126 pts/0 D+ 0:00 ip link del p1 # journalctl -b Jun 05 07:19:16 kernel: unregister_netdevice: waiting for p1 to become free. Usage count = 1 Jun 05 07:19:27 kernel: unregister_netdevice: waiting for p1 to become free. Usage count = 1 ... Fix that by implementing NETDEV_UNREGISTER event notification handler to properly clean up all the resources and unref device. This should also allow socket killing via ss(8) utility. Fixes: 965a99098443 ("xsk: add support for bind for Rx") Signed-off-by: Ilya Maximets Acked-by: Jonathan Lemon Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 5 +++ net/xdp/xdp_umem.c | 10 ++--- net/xdp/xdp_umem.h | 1 + net/xdp/xsk.c | 87 ++++++++++++++++++++++++++++++++++++------ 4 files changed, 87 insertions(+), 16 deletions(-) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index d074b6d60f8af..7da155164947d 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -61,6 +61,11 @@ struct xdp_sock { struct xsk_queue *tx ____cacheline_aligned_in_smp; struct list_head list; bool zc; + enum { + XSK_READY = 0, + XSK_BOUND, + XSK_UNBOUND, + } state; /* Protects multiple processes in the control path */ struct mutex mutex; /* Mutual exclusion of NAPI TX thread and sendmsg error paths diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 267b82a4cbcf8..20c91f02d3d80 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -140,11 +140,13 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, return err; } -static void xdp_umem_clear_dev(struct xdp_umem *umem) +void xdp_umem_clear_dev(struct xdp_umem *umem) { struct netdev_bpf bpf; int err; + ASSERT_RTNL(); + if (!umem->dev) return; @@ -153,17 +155,13 @@ static void xdp_umem_clear_dev(struct xdp_umem *umem) bpf.xsk.umem = NULL; bpf.xsk.queue_id = umem->queue_id; - rtnl_lock(); err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf); - rtnl_unlock(); if (err) WARN(1, "failed to disable umem!\n"); } - rtnl_lock(); xdp_clear_umem_at_qid(umem->dev, umem->queue_id); - rtnl_unlock(); dev_put(umem->dev); umem->dev = NULL; @@ -195,7 +193,9 @@ static void xdp_umem_unaccount_pages(struct xdp_umem *umem) static void xdp_umem_release(struct xdp_umem *umem) { + rtnl_lock(); xdp_umem_clear_dev(umem); + rtnl_unlock(); ida_simple_remove(&umem_ida, umem->id); diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 27603227601be..a63a9fb251f50 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -10,6 +10,7 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, u16 queue_id, u16 flags); +void xdp_umem_clear_dev(struct xdp_umem *umem); bool xdp_umem_validate_queues(struct xdp_umem *umem); void xdp_get_umem(struct xdp_umem *umem); void xdp_put_umem(struct xdp_umem *umem); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index a14e8864e4fa4..f53a6ef7c155d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -335,6 +335,22 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue, return 0; } +static void xsk_unbind_dev(struct xdp_sock *xs) +{ + struct net_device *dev = xs->dev; + + if (!dev || xs->state != XSK_BOUND) + return; + + xs->state = XSK_UNBOUND; + + /* Wait for driver to stop using the xdp socket. */ + xdp_del_sk_umem(xs->umem, xs); + xs->dev = NULL; + synchronize_net(); + dev_put(dev); +} + static int xsk_release(struct socket *sock) { struct sock *sk = sock->sk; @@ -354,15 +370,7 @@ static int xsk_release(struct socket *sock) sock_prot_inuse_add(net, sk->sk_prot, -1); local_bh_enable(); - if (xs->dev) { - struct net_device *dev = xs->dev; - - /* Wait for driver to stop using the xdp socket. */ - xdp_del_sk_umem(xs->umem, xs); - xs->dev = NULL; - synchronize_net(); - dev_put(dev); - } + xsk_unbind_dev(xs); xskq_destroy(xs->rx); xskq_destroy(xs->tx); @@ -412,7 +420,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) return -EINVAL; mutex_lock(&xs->mutex); - if (xs->dev) { + if (xs->state != XSK_READY) { err = -EBUSY; goto out_release; } @@ -492,6 +500,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) out_unlock: if (err) dev_put(dev); + else + xs->state = XSK_BOUND; out_release: mutex_unlock(&xs->mutex); return err; @@ -520,6 +530,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -EFAULT; mutex_lock(&xs->mutex); + if (xs->state != XSK_READY) { + mutex_unlock(&xs->mutex); + return -EBUSY; + } q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx; err = xsk_init_queue(entries, q, false); mutex_unlock(&xs->mutex); @@ -534,7 +548,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -EFAULT; mutex_lock(&xs->mutex); - if (xs->umem) { + if (xs->state != XSK_READY || xs->umem) { mutex_unlock(&xs->mutex); return -EBUSY; } @@ -561,6 +575,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -EFAULT; mutex_lock(&xs->mutex); + if (xs->state != XSK_READY) { + mutex_unlock(&xs->mutex); + return -EBUSY; + } if (!xs->umem) { mutex_unlock(&xs->mutex); return -EINVAL; @@ -662,6 +680,9 @@ static int xsk_mmap(struct file *file, struct socket *sock, unsigned long pfn; struct page *qpg; + if (xs->state != XSK_READY) + return -EBUSY; + if (offset == XDP_PGOFF_RX_RING) { q = READ_ONCE(xs->rx); } else if (offset == XDP_PGOFF_TX_RING) { @@ -693,6 +714,38 @@ static int xsk_mmap(struct file *file, struct socket *sock, size, vma->vm_page_prot); } +static int xsk_notifier(struct notifier_block *this, + unsigned long msg, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + struct sock *sk; + + switch (msg) { + case NETDEV_UNREGISTER: + mutex_lock(&net->xdp.lock); + sk_for_each(sk, &net->xdp.list) { + struct xdp_sock *xs = xdp_sk(sk); + + mutex_lock(&xs->mutex); + if (xs->dev == dev) { + sk->sk_err = ENETDOWN; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + + xsk_unbind_dev(xs); + + /* Clear device references in umem. */ + xdp_umem_clear_dev(xs->umem); + } + mutex_unlock(&xs->mutex); + } + mutex_unlock(&net->xdp.lock); + break; + } + return NOTIFY_DONE; +} + static struct proto xsk_proto = { .name = "XDP", .owner = THIS_MODULE, @@ -764,6 +817,7 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, sock_set_flag(sk, SOCK_RCU_FREE); xs = xdp_sk(sk); + xs->state = XSK_READY; mutex_init(&xs->mutex); spin_lock_init(&xs->tx_completion_lock); @@ -784,6 +838,10 @@ static const struct net_proto_family xsk_family_ops = { .owner = THIS_MODULE, }; +static struct notifier_block xsk_netdev_notifier = { + .notifier_call = xsk_notifier, +}; + static int __net_init xsk_net_init(struct net *net) { mutex_init(&net->xdp.lock); @@ -816,8 +874,15 @@ static int __init xsk_init(void) err = register_pernet_subsys(&xsk_net_ops); if (err) goto out_sk; + + err = register_netdevice_notifier(&xsk_netdev_notifier); + if (err) + goto out_pernet; + return 0; +out_pernet: + unregister_pernet_subsys(&xsk_net_ops); out_sk: sock_unregister(PF_XDP); out_proto: