From 752bcf80f5549c9901b2e8bc77b2138de55b1026 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Fri, 18 Jan 2019 13:58:17 +0100
Subject: [PATCH 01/13] bpftool: Fix prog dump by tag

Lance reported an issue with bpftool not being able to
dump program if there are more programs loaded and you
want to dump any but the first program, like:

  # bpftool prog
  28: kprobe  name trace_req_start  tag 1dfc28ba8b3dd597  gpl
  	loaded_at 2019-01-18T17:02:40+1100  uid 0
  	xlated 112B  jited 109B  memlock 4096B  map_ids 13
  29: kprobe  name trace_req_compl  tag 5b6a5ecc6030a683  gpl
  	loaded_at 2019-01-18T17:02:40+1100  uid 0
  	xlated 928B  jited 575B  memlock 4096B  map_ids 13,14
  #  bpftool prog dum jited tag 1dfc28ba8b3dd597
   0:	push   %rbp
   1:	mov    %rsp,%rbp
  ...

  #  bpftool prog dum jited tag 5b6a5ecc6030a683
  Error: can't get prog info (29): Bad address

The problem is in the prog_fd_by_tag function not cleaning
the struct bpf_prog_info before another request, so the
previous program length is still in there and kernel assumes
it needs to dump the program, which fails because there's no
user pointer set.

Moving the struct bpf_prog_info declaration into the loop,
so it gets cleaned before each query.

Fixes: 71bb428fe2c1 ("tools: bpf: add bpftool")
Reported-by: Lance Digby <ldigby@redhat.com>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/prog.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 2d1bb7d6ff511..b54ed82b9589d 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -78,13 +78,14 @@ static void print_boot_time(__u64 nsecs, char *buf, unsigned int size)
 
 static int prog_fd_by_tag(unsigned char *tag)
 {
-	struct bpf_prog_info info = {};
-	__u32 len = sizeof(info);
 	unsigned int id = 0;
 	int err;
 	int fd;
 
 	while (true) {
+		struct bpf_prog_info info = {};
+		__u32 len = sizeof(info);
+
 		err = bpf_prog_get_next_id(id, &id);
 		if (err) {
 			p_err("%s", strerror(errno));

From b0ca5ecb8e2279d706261f525f1bd0ba9e3fe800 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Mon, 21 Jan 2019 12:36:12 +0100
Subject: [PATCH 02/13] bpftool: fix percpu maps updating

When updating a percpu map, bpftool currently copies the provided
value only into the first per CPU copy of the specified value,
all others instances are left zeroed.

This change explicitly copies the user-provided bytes to all the
per CPU instances, keeping the sub-command syntax unchanged.

v2 -> v3:
 - drop unused argument, as per Quentin's suggestion
v1 -> v2:
 - rename the helper as per Quentin's suggestion

Fixes: 71bb428fe2c1 ("tools: bpf: add bpftool")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/map.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index 2037e3dc864ba..29a3468c6cf60 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -347,6 +347,20 @@ static char **parse_bytes(char **argv, const char *name, unsigned char *val,
 	return argv + i;
 }
 
+/* on per cpu maps we must copy the provided value on all value instances */
+static void fill_per_cpu_value(struct bpf_map_info *info, void *value)
+{
+	unsigned int i, n, step;
+
+	if (!map_is_per_cpu(info->type))
+		return;
+
+	n = get_possible_cpus();
+	step = round_up(info->value_size, 8);
+	for (i = 1; i < n; i++)
+		memcpy(value + i * step, value, info->value_size);
+}
+
 static int parse_elem(char **argv, struct bpf_map_info *info,
 		      void *key, void *value, __u32 key_size, __u32 value_size,
 		      __u32 *flags, __u32 **value_fd)
@@ -426,6 +440,8 @@ static int parse_elem(char **argv, struct bpf_map_info *info,
 			argv = parse_bytes(argv, "value", value, value_size);
 			if (!argv)
 				return -1;
+
+			fill_per_cpu_value(info, value);
 		}
 
 		return parse_elem(argv, info, key, NULL, key_size, value_size,

From c9e4576743eeda8d24dedc164d65b78877f9a98c Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Wed, 23 Jan 2019 12:37:19 +0800
Subject: [PATCH 03/13] bpf: sock recvbuff must be limited by rmem_max in
 bpf_setsockopt()

When sock recvbuff is set by bpf_setsockopt(), the value must by
limited by rmem_max. It is the same with sendbuff.

Fixes: 8c4b4c7e9ff0 ("bpf: Add setsockopt helper function to bpf")
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/core/filter.c b/net/core/filter.c
index 7559d6835ecb7..7a54dc11ac2d3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4112,10 +4112,12 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 		/* Only some socketops are supported */
 		switch (optname) {
 		case SO_RCVBUF:
+			val = min_t(u32, val, sysctl_rmem_max);
 			sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 			sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 			break;
 		case SO_SNDBUF:
+			val = min_t(u32, val, sysctl_wmem_max);
 			sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 			sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 			break;

From 8c79b35693380d856dcbbc21629682a90f26ca62 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 28 Jan 2019 10:01:21 -0800
Subject: [PATCH 04/13] tools: bpftool: fix crash with un-owned prog arrays

Prog arrays don't have 'owner_prog_type' and 'owner_jited'
fields in their fdinfo when they are created.  Those fields
are set and reported when first program is checked for
compatibility by bpf_prog_array_compatible().

This means that bpftool cannot expect the fields to always
be there.  Currently trying to show maps on a system with
an un-owned prog array leads to a crash:

$ bpftool map show
389: prog_array  name tail_call_map  flags 0x0
Error: key 'owner_prog_type' not found in fdinfo
Error: key 'owner_jited' not found in fdinfo
       key 4B  value 4B  max_entries 4  memlock 4096B
       Segmentation fault (core dumped)

We pass a NULL pointer to atoi().

Remove the assumption that fdinfo keys are always present.
Add missing validations and remove the p_err() calls which
may lead to broken JSON output as caller will not propagate
the failure.

Fixes: 99a44bef5870 ("tools: bpftool: add owner_prog_type and owner_jited to bpftool output")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/common.c |  6 +-----
 tools/bpf/bpftool/map.c    | 17 ++++++++---------
 2 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index 897483457bf03..f7261fad45c19 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -297,10 +297,8 @@ char *get_fdinfo(int fd, const char *key)
 	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd);
 
 	fdi = fopen(path, "r");
-	if (!fdi) {
-		p_err("can't open fdinfo: %s", strerror(errno));
+	if (!fdi)
 		return NULL;
-	}
 
 	while ((n = getline(&line, &line_n, fdi)) > 0) {
 		char *value;
@@ -313,7 +311,6 @@ char *get_fdinfo(int fd, const char *key)
 
 		value = strchr(line, '\t');
 		if (!value || !value[1]) {
-			p_err("malformed fdinfo!?");
 			free(line);
 			return NULL;
 		}
@@ -326,7 +323,6 @@ char *get_fdinfo(int fd, const char *key)
 		return line;
 	}
 
-	p_err("key '%s' not found in fdinfo", key);
 	free(line);
 	fclose(fdi);
 	return NULL;
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index 29a3468c6cf60..1ef1ee2280a28 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -513,10 +513,9 @@ static int show_map_close_json(int fd, struct bpf_map_info *info)
 				jsonw_uint_field(json_wtr, "owner_prog_type",
 						 prog_type);
 		}
-		if (atoi(owner_jited))
-			jsonw_bool_field(json_wtr, "owner_jited", true);
-		else
-			jsonw_bool_field(json_wtr, "owner_jited", false);
+		if (owner_jited)
+			jsonw_bool_field(json_wtr, "owner_jited",
+					 !!atoi(owner_jited));
 
 		free(owner_prog_type);
 		free(owner_jited);
@@ -569,7 +568,8 @@ static int show_map_close_plain(int fd, struct bpf_map_info *info)
 		char *owner_prog_type = get_fdinfo(fd, "owner_prog_type");
 		char *owner_jited = get_fdinfo(fd, "owner_jited");
 
-		printf("\n\t");
+		if (owner_prog_type || owner_jited)
+			printf("\n\t");
 		if (owner_prog_type) {
 			unsigned int prog_type = atoi(owner_prog_type);
 
@@ -579,10 +579,9 @@ static int show_map_close_plain(int fd, struct bpf_map_info *info)
 			else
 				printf("owner_prog_type %d  ", prog_type);
 		}
-		if (atoi(owner_jited))
-			printf("owner jited");
-		else
-			printf("owner not jited");
+		if (owner_jited)
+			printf("owner%s jited",
+			       atoi(owner_jited) ? "" : " not");
 
 		free(owner_prog_type);
 		free(owner_jited);

From 1d79895aef18fa05789995d86d523c9b2ee58a02 Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki <jakub@cloudflare.com>
Date: Mon, 28 Jan 2019 10:13:35 +0100
Subject: [PATCH 05/13] sk_msg: Always cancel strp work before freeing the
 psock

Despite having stopped the parser, we still need to deinitialize it
by calling strp_done so that it cancels its work. Otherwise the worker
thread can run after we have freed the parser, and attempt to access
its workqueue resulting in a use-after-free:

==================================================================
BUG: KASAN: use-after-free in pwq_activate_delayed_work+0x1b/0x1d0
Read of size 8 at addr ffff888069975240 by task kworker/u2:2/93

CPU: 0 PID: 93 Comm: kworker/u2:2 Not tainted 5.0.0-rc2-00335-g28f9d1a3d4fe-dirty #14
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-2.fc27 04/01/2014
Workqueue:            (null) (kstrp)
Call Trace:
 print_address_description+0x6e/0x2b0
 ? pwq_activate_delayed_work+0x1b/0x1d0
 kasan_report+0xfd/0x177
 ? pwq_activate_delayed_work+0x1b/0x1d0
 ? pwq_activate_delayed_work+0x1b/0x1d0
 pwq_activate_delayed_work+0x1b/0x1d0
 ? process_one_work+0x4aa/0x660
 pwq_dec_nr_in_flight+0x9b/0x100
 worker_thread+0x82/0x680
 ? process_one_work+0x660/0x660
 kthread+0x1b9/0x1e0
 ? __kthread_create_on_node+0x250/0x250
 ret_from_fork+0x1f/0x30

Allocated by task 111:
 sk_psock_init+0x3c/0x1b0
 sock_map_link.isra.2+0x103/0x4b0
 sock_map_update_common+0x94/0x270
 sock_map_update_elem+0x145/0x160
 __se_sys_bpf+0x152e/0x1e10
 do_syscall_64+0xb2/0x3e0
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Freed by task 112:
 kfree+0x7f/0x140
 process_one_work+0x40b/0x660
 worker_thread+0x82/0x680
 kthread+0x1b9/0x1e0
 ret_from_fork+0x1f/0x30

The buggy address belongs to the object at ffff888069975180
 which belongs to the cache kmalloc-512 of size 512
The buggy address is located 192 bytes inside of
 512-byte region [ffff888069975180, ffff888069975380)
The buggy address belongs to the page:
page:ffffea0001a65d00 count:1 mapcount:0 mapping:ffff88806d401280 index:0x0 compound_mapcount: 0
flags: 0x4000000000010200(slab|head)
raw: 4000000000010200 dead000000000100 dead000000000200 ffff88806d401280
raw: 0000000000000000 00000000800c000c 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff888069975100: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
 ffff888069975180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>ffff888069975200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                                           ^
 ffff888069975280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff888069975300: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
==================================================================

Reported-by: Marek Majkowski <marek@cloudflare.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/netdev/CAJPywTLwgXNEZ2dZVoa=udiZmtrWJ0q5SuBW64aYs0Y1khXX3A@mail.gmail.com
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/skmsg.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index d6d5c20d7044c..8c826603bf363 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -545,8 +545,7 @@ static void sk_psock_destroy_deferred(struct work_struct *gc)
 	struct sk_psock *psock = container_of(gc, struct sk_psock, gc);
 
 	/* No sk_callback_lock since already detached. */
-	if (psock->parser.enabled)
-		strp_done(&psock->parser.strp);
+	strp_done(&psock->parser.strp);
 
 	cancel_work_sync(&psock->work);
 

From 2fa53f8924223e81cd85e19f0c2caf110f086752 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 28 Jan 2019 23:55:26 +0100
Subject: [PATCH 06/13] bpf, doc: add reviewers to maintainers entry

In order to better scale BPF development on netdev, we've adopted a
reviewer rotation for all BPF patches among the five of us for some
time now. Lets give credit where credit is due, and add Martin, Song
and Yonghong as official BPF reviewers to MAINTAINERS file. Also
while at it, add regex matching for BPF such that we get properly
Cc'ed for files not listed here.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 MAINTAINERS | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 51029a425dbec..6e13378f97460 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2848,6 +2848,9 @@ F:	include/uapi/linux/if_bonding.h
 BPF (Safe dynamic programs and tools)
 M:	Alexei Starovoitov <ast@kernel.org>
 M:	Daniel Borkmann <daniel@iogearbox.net>
+R:	Martin KaFai Lau <kafai@fb.com>
+R:	Song Liu <songliubraving@fb.com>
+R:	Yonghong Song <yhs@fb.com>
 L:	netdev@vger.kernel.org
 L:	linux-kernel@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git
@@ -2873,6 +2876,8 @@ F:	samples/bpf/
 F:	tools/bpf/
 F:	tools/lib/bpf/
 F:	tools/testing/selftests/bpf/
+K:	bpf
+N:	bpf
 
 BPF JIT for ARM
 M:	Shubham Bansal <illusionist.neo@gmail.com>

From 81f5c6f5db37bf2360b64c304b27b8f499b48367 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Tue, 29 Jan 2019 16:38:16 -0800
Subject: [PATCH 07/13] bpf: btf: allow typedef func_proto

Current implementation does not allow typedef func_proto.
But it is actually allowed.
  -bash-4.4$ cat t.c
  typedef int (f) (int);
  f *g;
  -bash-4.4$ clang -O2 -g -c -target bpf t.c -Xclang -target-feature -Xclang +dwarfris
  -bash-4.4$ pahole -JV t.o
  File t.o:
  [1] PTR (anon) type_id=2
  [2] TYPEDEF f type_id=3
  [3] FUNC_PROTO (anon) return=4 args=(4 (anon))
  [4] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED
  -bash-4.4$

This patch related btf verifier to allow such (typedef func_proto)
patterns.

Fixes: 2667a2626f4d ("bpf: btf: Add BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO")
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index befe570be5ba2..c57bd10340ed8 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1459,7 +1459,8 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
 
 		/* "typedef void new_void", "const void"...etc */
 		if (!btf_type_is_void(next_type) &&
-		    !btf_type_is_fwd(next_type)) {
+		    !btf_type_is_fwd(next_type) &&
+		    !btf_type_is_func_proto(next_type)) {
 			btf_verifier_log_type(env, v->t, "Invalid type_id");
 			return -EINVAL;
 		}

From 32b750b925040565bbeff1fe2f7510e3dc71ce2e Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Tue, 29 Jan 2019 16:38:17 -0800
Subject: [PATCH 08/13] tools/bpf: fix test_btf for typedef func_proto case

Fixed one test_btf raw test such that typedef func_proto
is permitted now.

Fixes: 78a2540e8945 ("tools/bpf: Add tests for BTF_KIND_FUNC_PROTO and BTF_KIND_FUNC")
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/test_btf.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c
index a0bd04befe87f..91420fa83b088 100644
--- a/tools/testing/selftests/bpf/test_btf.c
+++ b/tools/testing/selftests/bpf/test_btf.c
@@ -1881,13 +1881,12 @@ static struct btf_raw_test raw_tests[] = {
 },
 
 {
-	.descr = "func proto (CONST=>TYPEDEF=>FUNC_PROTO)",
+	.descr = "func proto (TYPEDEF=>FUNC_PROTO)",
 	.raw_types = {
 		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),	/* [1] */
 		BTF_TYPE_INT_ENC(0, 0, 0, 32, 4),		/* [2] */
-		BTF_CONST_ENC(4),				/* [3] */
-		BTF_TYPEDEF_ENC(NAME_TBD, 5),			/* [4] */
-		BTF_FUNC_PROTO_ENC(0, 2),			/* [5] */
+		BTF_TYPEDEF_ENC(NAME_TBD, 4),			/* [3] */
+		BTF_FUNC_PROTO_ENC(0, 2),			/* [4] */
 			BTF_FUNC_PROTO_ARG_ENC(0, 1),
 			BTF_FUNC_PROTO_ARG_ENC(0, 2),
 		BTF_END_RAW,
@@ -1901,8 +1900,6 @@ static struct btf_raw_test raw_tests[] = {
 	.key_type_id = 1,
 	.value_type_id = 1,
 	.max_entries = 4,
-	.btf_load_err = true,
-	.err_str = "Invalid type_id",
 },
 
 {

From 1bb54c4071f585ebef56ce8fdfe6026fa2cbcddd Mon Sep 17 00:00:00 2001
From: Martynas Pumputis <m@lambda.lt>
Date: Thu, 31 Jan 2019 10:19:33 +0100
Subject: [PATCH 09/13] bpf, selftests: fix handling of sparse CPU allocations

Previously, bpf_num_possible_cpus() had a bug when calculating a
number of possible CPUs in the case of sparse CPU allocations, as
it was considering only the first range or element of
/sys/devices/system/cpu/possible.

E.g. in the case of "0,2-3" (CPU 1 is not available), the function
returned 1 instead of 3.

This patch fixes the function by making it parse all CPU ranges and
elements.

Signed-off-by: Martynas Pumputis <m@lambda.lt>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/bpf_util.h | 30 +++++++++++++++++---------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/tools/testing/selftests/bpf/bpf_util.h b/tools/testing/selftests/bpf/bpf_util.h
index 315a44fa32af3..84fd6f1bf33e7 100644
--- a/tools/testing/selftests/bpf/bpf_util.h
+++ b/tools/testing/selftests/bpf/bpf_util.h
@@ -13,7 +13,7 @@ static inline unsigned int bpf_num_possible_cpus(void)
 	unsigned int start, end, possible_cpus = 0;
 	char buff[128];
 	FILE *fp;
-	int n;
+	int len, n, i, j = 0;
 
 	fp = fopen(fcpu, "r");
 	if (!fp) {
@@ -21,17 +21,27 @@ static inline unsigned int bpf_num_possible_cpus(void)
 		exit(1);
 	}
 
-	while (fgets(buff, sizeof(buff), fp)) {
-		n = sscanf(buff, "%u-%u", &start, &end);
-		if (n == 0) {
-			printf("Failed to retrieve # possible CPUs!\n");
-			exit(1);
-		} else if (n == 1) {
-			end = start;
+	if (!fgets(buff, sizeof(buff), fp)) {
+		printf("Failed to read %s!\n", fcpu);
+		exit(1);
+	}
+
+	len = strlen(buff);
+	for (i = 0; i <= len; i++) {
+		if (buff[i] == ',' || buff[i] == '\0') {
+			buff[i] = '\0';
+			n = sscanf(&buff[j], "%u-%u", &start, &end);
+			if (n <= 0) {
+				printf("Failed to retrieve # possible CPUs!\n");
+				exit(1);
+			} else if (n == 1) {
+				end = start;
+			}
+			possible_cpus += end - start + 1;
+			j = i + 1;
 		}
-		possible_cpus = start == 0 ? end + 1 : 0;
-		break;
 	}
+
 	fclose(fp);
 
 	return possible_cpus;

From 6cab5e90ab2bd323c9f3811b6c70a4687df51e27 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 28 Jan 2019 18:43:34 -0800
Subject: [PATCH 10/13] bpf: run bpf programs with preemption disabled

Disabled preemption is necessary for proper access to per-cpu maps
from BPF programs.

But the sender side of socket filters didn't have preemption disabled:
unix_dgram_sendmsg->sk_filter->sk_filter_trim_cap->bpf_prog_run_save_cb->BPF_PROG_RUN

and a combination of af_packet with tun device didn't disable either:
tpacket_snd->packet_direct_xmit->packet_pick_tx_queue->ndo_select_queue->
  tun_select_queue->tun_ebpf_select_queue->bpf_prog_run_clear_cb->BPF_PROG_RUN

Disable preemption before executing BPF programs (both classic and extended).

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/filter.h | 21 ++++++++++++++++++---
 kernel/bpf/cgroup.c    |  2 +-
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index ad106d845b229..e532fcc6e4b5a 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -591,8 +591,8 @@ static inline u8 *bpf_skb_cb(struct sk_buff *skb)
 	return qdisc_skb_cb(skb)->data;
 }
 
-static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
-				       struct sk_buff *skb)
+static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog,
+					 struct sk_buff *skb)
 {
 	u8 *cb_data = bpf_skb_cb(skb);
 	u8 cb_saved[BPF_SKB_CB_LEN];
@@ -611,15 +611,30 @@ static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
 	return res;
 }
 
+static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
+				       struct sk_buff *skb)
+{
+	u32 res;
+
+	preempt_disable();
+	res = __bpf_prog_run_save_cb(prog, skb);
+	preempt_enable();
+	return res;
+}
+
 static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
 					struct sk_buff *skb)
 {
 	u8 *cb_data = bpf_skb_cb(skb);
+	u32 res;
 
 	if (unlikely(prog->cb_access))
 		memset(cb_data, 0, BPF_SKB_CB_LEN);
 
-	return BPF_PROG_RUN(prog, skb);
+	preempt_disable();
+	res = BPF_PROG_RUN(prog, skb);
+	preempt_enable();
+	return res;
 }
 
 static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ab612fe9862f0..d17d05570a3fd 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -572,7 +572,7 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 	bpf_compute_and_save_data_end(skb, &saved_data_end);
 
 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
-				 bpf_prog_run_save_cb);
+				 __bpf_prog_run_save_cb);
 	bpf_restore_data_end(skb, saved_data_end);
 	__skb_pull(skb, offset);
 	skb->sk = save_sk;

From a89fac57b5d080771efd4d71feaae19877cf68f0 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 30 Jan 2019 18:12:43 -0800
Subject: [PATCH 11/13] bpf: fix lockdep false positive in percpu_freelist

Lockdep warns about false positive:
[   12.492084] 00000000e6b28347 (&head->lock){+...}, at: pcpu_freelist_push+0x2a/0x40
[   12.492696] but this lock was taken by another, HARDIRQ-safe lock in the past:
[   12.493275]  (&rq->lock){-.-.}
[   12.493276]
[   12.493276]
[   12.493276] and interrupts could create inverse lock ordering between them.
[   12.493276]
[   12.494435]
[   12.494435] other info that might help us debug this:
[   12.494979]  Possible interrupt unsafe locking scenario:
[   12.494979]
[   12.495518]        CPU0                    CPU1
[   12.495879]        ----                    ----
[   12.496243]   lock(&head->lock);
[   12.496502]                                local_irq_disable();
[   12.496969]                                lock(&rq->lock);
[   12.497431]                                lock(&head->lock);
[   12.497890]   <Interrupt>
[   12.498104]     lock(&rq->lock);
[   12.498368]
[   12.498368]  *** DEADLOCK ***
[   12.498368]
[   12.498837] 1 lock held by dd/276:
[   12.499110]  #0: 00000000c58cb2ee (rcu_read_lock){....}, at: trace_call_bpf+0x5e/0x240
[   12.499747]
[   12.499747] the shortest dependencies between 2nd lock and 1st lock:
[   12.500389]  -> (&rq->lock){-.-.} {
[   12.500669]     IN-HARDIRQ-W at:
[   12.500934]                       _raw_spin_lock+0x2f/0x40
[   12.501373]                       scheduler_tick+0x4c/0xf0
[   12.501812]                       update_process_times+0x40/0x50
[   12.502294]                       tick_periodic+0x27/0xb0
[   12.502723]                       tick_handle_periodic+0x1f/0x60
[   12.503203]                       timer_interrupt+0x11/0x20
[   12.503651]                       __handle_irq_event_percpu+0x43/0x2c0
[   12.504167]                       handle_irq_event_percpu+0x20/0x50
[   12.504674]                       handle_irq_event+0x37/0x60
[   12.505139]                       handle_level_irq+0xa7/0x120
[   12.505601]                       handle_irq+0xa1/0x150
[   12.506018]                       do_IRQ+0x77/0x140
[   12.506411]                       ret_from_intr+0x0/0x1d
[   12.506834]                       _raw_spin_unlock_irqrestore+0x53/0x60
[   12.507362]                       __setup_irq+0x481/0x730
[   12.507789]                       setup_irq+0x49/0x80
[   12.508195]                       hpet_time_init+0x21/0x32
[   12.508644]                       x86_late_time_init+0xb/0x16
[   12.509106]                       start_kernel+0x390/0x42a
[   12.509554]                       secondary_startup_64+0xa4/0xb0
[   12.510034]     IN-SOFTIRQ-W at:
[   12.510305]                       _raw_spin_lock+0x2f/0x40
[   12.510772]                       try_to_wake_up+0x1c7/0x4e0
[   12.511220]                       swake_up_locked+0x20/0x40
[   12.511657]                       swake_up_one+0x1a/0x30
[   12.512070]                       rcu_process_callbacks+0xc5/0x650
[   12.512553]                       __do_softirq+0xe6/0x47b
[   12.512978]                       irq_exit+0xc3/0xd0
[   12.513372]                       smp_apic_timer_interrupt+0xa9/0x250
[   12.513876]                       apic_timer_interrupt+0xf/0x20
[   12.514343]                       default_idle+0x1c/0x170
[   12.514765]                       do_idle+0x199/0x240
[   12.515159]                       cpu_startup_entry+0x19/0x20
[   12.515614]                       start_kernel+0x422/0x42a
[   12.516045]                       secondary_startup_64+0xa4/0xb0
[   12.516521]     INITIAL USE at:
[   12.516774]                      _raw_spin_lock_irqsave+0x38/0x50
[   12.517258]                      rq_attach_root+0x16/0xd0
[   12.517685]                      sched_init+0x2f2/0x3eb
[   12.518096]                      start_kernel+0x1fb/0x42a
[   12.518525]                      secondary_startup_64+0xa4/0xb0
[   12.518986]   }
[   12.519132]   ... key      at: [<ffffffff82b7bc28>] __key.71384+0x0/0x8
[   12.519649]   ... acquired at:
[   12.519892]    pcpu_freelist_pop+0x7b/0xd0
[   12.520221]    bpf_get_stackid+0x1d2/0x4d0
[   12.520563]    ___bpf_prog_run+0x8b4/0x11a0
[   12.520887]
[   12.521008] -> (&head->lock){+...} {
[   12.521292]    HARDIRQ-ON-W at:
[   12.521539]                     _raw_spin_lock+0x2f/0x40
[   12.521950]                     pcpu_freelist_push+0x2a/0x40
[   12.522396]                     bpf_get_stackid+0x494/0x4d0
[   12.522828]                     ___bpf_prog_run+0x8b4/0x11a0
[   12.523296]    INITIAL USE at:
[   12.523537]                    _raw_spin_lock+0x2f/0x40
[   12.523944]                    pcpu_freelist_populate+0xc0/0x120
[   12.524417]                    htab_map_alloc+0x405/0x500
[   12.524835]                    __do_sys_bpf+0x1a3/0x1a90
[   12.525253]                    do_syscall_64+0x4a/0x180
[   12.525659]                    entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   12.526167]  }
[   12.526311]  ... key      at: [<ffffffff838f7668>] __key.13130+0x0/0x8
[   12.526812]  ... acquired at:
[   12.527047]    __lock_acquire+0x521/0x1350
[   12.527371]    lock_acquire+0x98/0x190
[   12.527680]    _raw_spin_lock+0x2f/0x40
[   12.527994]    pcpu_freelist_push+0x2a/0x40
[   12.528325]    bpf_get_stackid+0x494/0x4d0
[   12.528645]    ___bpf_prog_run+0x8b4/0x11a0
[   12.528970]
[   12.529092]
[   12.529092] stack backtrace:
[   12.529444] CPU: 0 PID: 276 Comm: dd Not tainted 5.0.0-rc3-00018-g2fa53f892422 #475
[   12.530043] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014
[   12.530750] Call Trace:
[   12.530948]  dump_stack+0x5f/0x8b
[   12.531248]  check_usage_backwards+0x10c/0x120
[   12.531598]  ? ___bpf_prog_run+0x8b4/0x11a0
[   12.531935]  ? mark_lock+0x382/0x560
[   12.532229]  mark_lock+0x382/0x560
[   12.532496]  ? print_shortest_lock_dependencies+0x180/0x180
[   12.532928]  __lock_acquire+0x521/0x1350
[   12.533271]  ? find_get_entry+0x17f/0x2e0
[   12.533586]  ? find_get_entry+0x19c/0x2e0
[   12.533902]  ? lock_acquire+0x98/0x190
[   12.534196]  lock_acquire+0x98/0x190
[   12.534482]  ? pcpu_freelist_push+0x2a/0x40
[   12.534810]  _raw_spin_lock+0x2f/0x40
[   12.535099]  ? pcpu_freelist_push+0x2a/0x40
[   12.535432]  pcpu_freelist_push+0x2a/0x40
[   12.535750]  bpf_get_stackid+0x494/0x4d0
[   12.536062]  ___bpf_prog_run+0x8b4/0x11a0

It has been explained that is a false positive here:
https://lkml.org/lkml/2018/7/25/756
Recap:
- stackmap uses pcpu_freelist
- The lock in pcpu_freelist is a percpu lock
- stackmap is only used by tracing bpf_prog
- A tracing bpf_prog cannot be run if another bpf_prog
  has already been running (ensured by the percpu bpf_prog_active counter).

Eric pointed out that this lockdep splats stops other
legit lockdep splats in selftests/bpf/test_progs.c.

Fix this by calling local_irq_save/restore for stackmap.

Another false positive had also been worked around by calling
local_irq_save in commit 89ad2fa3f043 ("bpf: fix lockdep splat").
That commit added unnecessary irq_save/restore to fast path of
bpf hash map. irqs are already disabled at that point, since htab
is holding per bucket spin_lock with irqsave.

Let's reduce overhead for htab by introducing __pcpu_freelist_push/pop
function w/o irqsave and convert pcpu_freelist_push/pop to irqsave
to be used elsewhere (right now only in stackmap).
It stops lockdep false positive in stackmap with a bit of acceptable overhead.

Fixes: 557c0c6e7df8 ("bpf: convert stackmap to pre-allocation")
Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/hashtab.c         |  4 ++--
 kernel/bpf/percpu_freelist.c | 41 +++++++++++++++++++++++++-----------
 kernel/bpf/percpu_freelist.h |  4 ++++
 3 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 4b7c76765d9d6..f9274114c88d3 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -686,7 +686,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 	}
 
 	if (htab_is_prealloc(htab)) {
-		pcpu_freelist_push(&htab->freelist, &l->fnode);
+		__pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
 		atomic_dec(&htab->count);
 		l->htab = htab;
@@ -748,7 +748,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 		} else {
 			struct pcpu_freelist_node *l;
 
-			l = pcpu_freelist_pop(&htab->freelist);
+			l = __pcpu_freelist_pop(&htab->freelist);
 			if (!l)
 				return ERR_PTR(-E2BIG);
 			l_new = container_of(l, struct htab_elem, fnode);
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 673fa6fe2d73c..0c1b4ba9e90e7 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -28,8 +28,8 @@ void pcpu_freelist_destroy(struct pcpu_freelist *s)
 	free_percpu(s->freelist);
 }
 
-static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head,
-					struct pcpu_freelist_node *node)
+static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
+					 struct pcpu_freelist_node *node)
 {
 	raw_spin_lock(&head->lock);
 	node->next = head->first;
@@ -37,12 +37,22 @@ static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head,
 	raw_spin_unlock(&head->lock);
 }
 
-void pcpu_freelist_push(struct pcpu_freelist *s,
+void __pcpu_freelist_push(struct pcpu_freelist *s,
 			struct pcpu_freelist_node *node)
 {
 	struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist);
 
-	__pcpu_freelist_push(head, node);
+	___pcpu_freelist_push(head, node);
+}
+
+void pcpu_freelist_push(struct pcpu_freelist *s,
+			struct pcpu_freelist_node *node)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__pcpu_freelist_push(s, node);
+	local_irq_restore(flags);
 }
 
 void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
@@ -63,7 +73,7 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
 	for_each_possible_cpu(cpu) {
 again:
 		head = per_cpu_ptr(s->freelist, cpu);
-		__pcpu_freelist_push(head, buf);
+		___pcpu_freelist_push(head, buf);
 		i++;
 		buf += elem_size;
 		if (i == nr_elems)
@@ -74,14 +84,12 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
 	local_irq_restore(flags);
 }
 
-struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
+struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
 {
 	struct pcpu_freelist_head *head;
 	struct pcpu_freelist_node *node;
-	unsigned long flags;
 	int orig_cpu, cpu;
 
-	local_irq_save(flags);
 	orig_cpu = cpu = raw_smp_processor_id();
 	while (1) {
 		head = per_cpu_ptr(s->freelist, cpu);
@@ -89,16 +97,25 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
 		node = head->first;
 		if (node) {
 			head->first = node->next;
-			raw_spin_unlock_irqrestore(&head->lock, flags);
+			raw_spin_unlock(&head->lock);
 			return node;
 		}
 		raw_spin_unlock(&head->lock);
 		cpu = cpumask_next(cpu, cpu_possible_mask);
 		if (cpu >= nr_cpu_ids)
 			cpu = 0;
-		if (cpu == orig_cpu) {
-			local_irq_restore(flags);
+		if (cpu == orig_cpu)
 			return NULL;
-		}
 	}
 }
+
+struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
+{
+	struct pcpu_freelist_node *ret;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	ret = __pcpu_freelist_pop(s);
+	local_irq_restore(flags);
+	return ret;
+}
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
index 3049aae8ea1e2..c3960118e6178 100644
--- a/kernel/bpf/percpu_freelist.h
+++ b/kernel/bpf/percpu_freelist.h
@@ -22,8 +22,12 @@ struct pcpu_freelist_node {
 	struct pcpu_freelist_node *next;
 };
 
+/* pcpu_freelist_* do spin_lock_irqsave. */
 void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *);
 struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *);
+/* __pcpu_freelist_* do spin_lock only. caller must disable irqs. */
+void __pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *);
+struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *);
 void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
 			    u32 nr_elems);
 int pcpu_freelist_init(struct pcpu_freelist *);

From e16ec34039c701594d55d08a5aa49ee3e1abc821 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 30 Jan 2019 18:12:44 -0800
Subject: [PATCH 12/13] bpf: fix potential deadlock in bpf_prog_register

Lockdep found a potential deadlock between cpu_hotplug_lock, bpf_event_mutex, and cpuctx_mutex:
[   13.007000] WARNING: possible circular locking dependency detected
[   13.007587] 5.0.0-rc3-00018-g2fa53f892422-dirty #477 Not tainted
[   13.008124] ------------------------------------------------------
[   13.008624] test_progs/246 is trying to acquire lock:
[   13.009030] 0000000094160d1d (tracepoints_mutex){+.+.}, at: tracepoint_probe_register_prio+0x2d/0x300
[   13.009770]
[   13.009770] but task is already holding lock:
[   13.010239] 00000000d663ef86 (bpf_event_mutex){+.+.}, at: bpf_probe_register+0x1d/0x60
[   13.010877]
[   13.010877] which lock already depends on the new lock.
[   13.010877]
[   13.011532]
[   13.011532] the existing dependency chain (in reverse order) is:
[   13.012129]
[   13.012129] -> #4 (bpf_event_mutex){+.+.}:
[   13.012582]        perf_event_query_prog_array+0x9b/0x130
[   13.013016]        _perf_ioctl+0x3aa/0x830
[   13.013354]        perf_ioctl+0x2e/0x50
[   13.013668]        do_vfs_ioctl+0x8f/0x6a0
[   13.014003]        ksys_ioctl+0x70/0x80
[   13.014320]        __x64_sys_ioctl+0x16/0x20
[   13.014668]        do_syscall_64+0x4a/0x180
[   13.015007]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   13.015469]
[   13.015469] -> #3 (&cpuctx_mutex){+.+.}:
[   13.015910]        perf_event_init_cpu+0x5a/0x90
[   13.016291]        perf_event_init+0x1b2/0x1de
[   13.016654]        start_kernel+0x2b8/0x42a
[   13.016995]        secondary_startup_64+0xa4/0xb0
[   13.017382]
[   13.017382] -> #2 (pmus_lock){+.+.}:
[   13.017794]        perf_event_init_cpu+0x21/0x90
[   13.018172]        cpuhp_invoke_callback+0xb3/0x960
[   13.018573]        _cpu_up+0xa7/0x140
[   13.018871]        do_cpu_up+0xa4/0xc0
[   13.019178]        smp_init+0xcd/0xd2
[   13.019483]        kernel_init_freeable+0x123/0x24f
[   13.019878]        kernel_init+0xa/0x110
[   13.020201]        ret_from_fork+0x24/0x30
[   13.020541]
[   13.020541] -> #1 (cpu_hotplug_lock.rw_sem){++++}:
[   13.021051]        static_key_slow_inc+0xe/0x20
[   13.021424]        tracepoint_probe_register_prio+0x28c/0x300
[   13.021891]        perf_trace_event_init+0x11f/0x250
[   13.022297]        perf_trace_init+0x6b/0xa0
[   13.022644]        perf_tp_event_init+0x25/0x40
[   13.023011]        perf_try_init_event+0x6b/0x90
[   13.023386]        perf_event_alloc+0x9a8/0xc40
[   13.023754]        __do_sys_perf_event_open+0x1dd/0xd30
[   13.024173]        do_syscall_64+0x4a/0x180
[   13.024519]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   13.024968]
[   13.024968] -> #0 (tracepoints_mutex){+.+.}:
[   13.025434]        __mutex_lock+0x86/0x970
[   13.025764]        tracepoint_probe_register_prio+0x2d/0x300
[   13.026215]        bpf_probe_register+0x40/0x60
[   13.026584]        bpf_raw_tracepoint_open.isra.34+0xa4/0x130
[   13.027042]        __do_sys_bpf+0x94f/0x1a90
[   13.027389]        do_syscall_64+0x4a/0x180
[   13.027727]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   13.028171]
[   13.028171] other info that might help us debug this:
[   13.028171]
[   13.028807] Chain exists of:
[   13.028807]   tracepoints_mutex --> &cpuctx_mutex --> bpf_event_mutex
[   13.028807]
[   13.029666]  Possible unsafe locking scenario:
[   13.029666]
[   13.030140]        CPU0                    CPU1
[   13.030510]        ----                    ----
[   13.030875]   lock(bpf_event_mutex);
[   13.031166]                                lock(&cpuctx_mutex);
[   13.031645]                                lock(bpf_event_mutex);
[   13.032135]   lock(tracepoints_mutex);
[   13.032441]
[   13.032441]  *** DEADLOCK ***
[   13.032441]
[   13.032911] 1 lock held by test_progs/246:
[   13.033239]  #0: 00000000d663ef86 (bpf_event_mutex){+.+.}, at: bpf_probe_register+0x1d/0x60
[   13.033909]
[   13.033909] stack backtrace:
[   13.034258] CPU: 1 PID: 246 Comm: test_progs Not tainted 5.0.0-rc3-00018-g2fa53f892422-dirty #477
[   13.034964] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014
[   13.035657] Call Trace:
[   13.035859]  dump_stack+0x5f/0x8b
[   13.036130]  print_circular_bug.isra.37+0x1ce/0x1db
[   13.036526]  __lock_acquire+0x1158/0x1350
[   13.036852]  ? lock_acquire+0x98/0x190
[   13.037154]  lock_acquire+0x98/0x190
[   13.037447]  ? tracepoint_probe_register_prio+0x2d/0x300
[   13.037876]  __mutex_lock+0x86/0x970
[   13.038167]  ? tracepoint_probe_register_prio+0x2d/0x300
[   13.038600]  ? tracepoint_probe_register_prio+0x2d/0x300
[   13.039028]  ? __mutex_lock+0x86/0x970
[   13.039337]  ? __mutex_lock+0x24a/0x970
[   13.039649]  ? bpf_probe_register+0x1d/0x60
[   13.039992]  ? __bpf_trace_sched_wake_idle_without_ipi+0x10/0x10
[   13.040478]  ? tracepoint_probe_register_prio+0x2d/0x300
[   13.040906]  tracepoint_probe_register_prio+0x2d/0x300
[   13.041325]  bpf_probe_register+0x40/0x60
[   13.041649]  bpf_raw_tracepoint_open.isra.34+0xa4/0x130
[   13.042068]  ? __might_fault+0x3e/0x90
[   13.042374]  __do_sys_bpf+0x94f/0x1a90
[   13.042678]  do_syscall_64+0x4a/0x180
[   13.042975]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   13.043382] RIP: 0033:0x7f23b10a07f9
[   13.045155] RSP: 002b:00007ffdef42fdd8 EFLAGS: 00000202 ORIG_RAX: 0000000000000141
[   13.045759] RAX: ffffffffffffffda RBX: 00007ffdef42ff70 RCX: 00007f23b10a07f9
[   13.046326] RDX: 0000000000000070 RSI: 00007ffdef42fe10 RDI: 0000000000000011
[   13.046893] RBP: 00007ffdef42fdf0 R08: 0000000000000038 R09: 00007ffdef42fe10
[   13.047462] R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000000
[   13.048029] R13: 0000000000000016 R14: 00007f23b1db4690 R15: 0000000000000000

Since tracepoints_mutex will be taken in tracepoint_probe_register/unregister()
there is no need to take bpf_event_mutex too.
bpf_event_mutex is protecting modifications to prog array used in kprobe/perf bpf progs.
bpf_raw_tracepoints don't need to take this mutex.

Fixes: c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT")
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/trace/bpf_trace.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 8b068adb9da1c..f1a86a0d881dd 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1204,22 +1204,12 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *
 
 int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
 {
-	int err;
-
-	mutex_lock(&bpf_event_mutex);
-	err = __bpf_probe_register(btp, prog);
-	mutex_unlock(&bpf_event_mutex);
-	return err;
+	return __bpf_probe_register(btp, prog);
 }
 
 int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
 {
-	int err;
-
-	mutex_lock(&bpf_event_mutex);
-	err = tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog);
-	mutex_unlock(&bpf_event_mutex);
-	return err;
+	return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog);
 }
 
 int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,

From 7c4cd051add3d00bbff008a133c936c515eaa8fe Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 30 Jan 2019 18:12:45 -0800
Subject: [PATCH 13/13] bpf: Fix syscall's stackmap lookup potential deadlock

The map_lookup_elem used to not acquiring spinlock
in order to optimize the reader.

It was true until commit 557c0c6e7df8 ("bpf: convert stackmap to pre-allocation")
The syscall's map_lookup_elem(stackmap) calls bpf_stackmap_copy().
bpf_stackmap_copy() may find the elem no longer needed after the copy is done.
If that is the case, pcpu_freelist_push() saves this elem for reuse later.
This push requires a spinlock.

If a tracing bpf_prog got run in the middle of the syscall's
map_lookup_elem(stackmap) and this tracing bpf_prog is calling
bpf_get_stackid(stackmap) which also requires the same pcpu_freelist's
spinlock, it may end up with a dead lock situation as reported by
Eric Dumazet in https://patchwork.ozlabs.org/patch/1030266/

The situation is the same as the syscall's map_update_elem() which
needs to acquire the pcpu_freelist's spinlock and could race
with tracing bpf_prog.  Hence, this patch fixes it by protecting
bpf_stackmap_copy() with this_cpu_inc(bpf_prog_active)
to prevent tracing bpf_prog from running.

A later syscall's map_lookup_elem commit f1a2e44a3aec ("bpf: add queue and stack maps")
also acquires a spinlock and races with tracing bpf_prog similarly.
Hence, this patch is forward looking and protects the majority
of the map lookups.  bpf_map_offload_lookup_elem() is the exception
since it is for network bpf_prog only (i.e. never called by tracing
bpf_prog).

Fixes: 557c0c6e7df8 ("bpf: convert stackmap to pre-allocation")
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/syscall.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b155cd17c1bd7..8577bb7f8be67 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -713,8 +713,13 @@ static int map_lookup_elem(union bpf_attr *attr)
 
 	if (bpf_map_is_dev_bound(map)) {
 		err = bpf_map_offload_lookup_elem(map, key, value);
-	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
-		   map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+		goto done;
+	}
+
+	preempt_disable();
+	this_cpu_inc(bpf_prog_active);
+	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
 		err = bpf_percpu_hash_copy(map, key, value);
 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		err = bpf_percpu_array_copy(map, key, value);
@@ -744,7 +749,10 @@ static int map_lookup_elem(union bpf_attr *attr)
 		}
 		rcu_read_unlock();
 	}
+	this_cpu_dec(bpf_prog_active);
+	preempt_enable();
 
+done:
 	if (err)
 		goto free_value;