From cf094baa3e0f19f1f80ceaf205c80402b024386c Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 18 Sep 2023 23:02:57 -0700 Subject: [PATCH 01/11] s390/bpf: Let arch_prepare_bpf_trampoline return program size arch_prepare_bpf_trampoline() for s390 currently returns 0 on success. This is not a problem for regular trampoline. However, struct_ops relies on the return value to advance "image" pointer: bpf_struct_ops_map_update_elem() { ... for_each_member(i, t, member) { ... err = bpf_struct_ops_prepare_trampoline(); ... image += err; } } When arch_prepare_bpf_trampoline returns 0 on success, all members of the struct_ops will point to the same trampoline (the last one). Fix this by returning the program size in arch_prepare_bpf_trampoline (on success). This is the same behavior as other architectures. Signed-off-by: Song Liu Fixes: 528eb2cb87bc ("s390/bpf: Implement arch_prepare_bpf_trampoline()") Reviewed-by: Ilya Leoshkevich Link: https://lore.kernel.org/r/20230919060258.3237176-2-song@kernel.org Signed-off-by: Alexei Starovoitov --- arch/s390/net/bpf_jit_comp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index de2fb12120d2e..2861e3360affc 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -2513,7 +2513,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, return -E2BIG; } - return ret; + return tjit.common.prg; } bool bpf_jit_supports_subprog_tailcalls(void) From 48f5e7d3f7300ff679dc50bfb7a7451de6f29e4c Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 18 Sep 2023 23:02:58 -0700 Subject: [PATCH 02/11] selftests/bpf: Check bpf_cubic_acked() is called via struct_ops Test bpf_tcp_ca (in test_progs) checks multiple tcp_congestion_ops. However, there isn't a test that verifies functions in the tcp_congestion_ops is actually called. Add a check to verify that bpf_cubic_acked is actually called during the test. 
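The new check follows the common selftest pattern of exposing a flag in the BPF object's .bss and reading it back through the generated skeleton. A minimal sketch of that pattern, using hypothetical names (my_ops_acked, my_ops_acked_called, skel) rather than the actual bpf_cubic symbols:

  /* BPF program side: a global flag set from the struct_ops callback */
  int my_ops_acked_called = 0;

  void BPF_STRUCT_OPS(my_ops_acked, struct sock *sk,
                      const struct ack_sample *sample)
  {
          my_ops_acked_called = 1;
  }

  /* userspace side: after exercising the connection, read the flag
   * back through the skeleton's view of .bss
   */
  ASSERT_EQ(skel->bss->my_ops_acked_called, 1, "acked called");
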
Suggested-by: Martin KaFai Lau Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Link: https://lore.kernel.org/r/20230919060258.3237176-3-song@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 2 ++ tools/testing/selftests/bpf/progs/bpf_cubic.c | 3 +++ 2 files changed, 5 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index a53c254c60580..4aabeaa525d47 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -185,6 +185,8 @@ static void test_cubic(void) do_test("bpf_cubic", NULL); + ASSERT_EQ(cubic_skel->bss->bpf_cubic_acked_called, 1, "pkts_acked called"); + bpf_link__destroy(link); bpf_cubic__destroy(cubic_skel); } diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c index d9660e7200e2d..c997e3e3d3fb4 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c @@ -490,6 +490,8 @@ static __always_inline void hystart_update(struct sock *sk, __u32 delay) } } +int bpf_cubic_acked_called = 0; + void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk, const struct ack_sample *sample) { @@ -497,6 +499,7 @@ void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk, struct bictcp *ca = inet_csk_ca(sk); __u32 delay; + bpf_cubic_acked_called = 1; /* Some calls are for duplicates without timetamps */ if (sample->rtt_us < 0) return; From b724a6418f1f853bcb39c8923bf14a50c7bdbd07 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sun, 17 Sep 2023 23:38:46 +0800 Subject: [PATCH 03/11] bpf: Fix tr dereferencing Fix 'tr' dereferencing bug when CONFIG_BPF_JIT is turned off. When CONFIG_BPF_JIT is turned off, 'bpf_trampoline_get()' returns NULL, which is same as the cases when CONFIG_BPF_JIT is turned on. Closes: https://lore.kernel.org/r/202309131936.5Nc8eUD0-lkp@intel.com/ Fixes: f7b12b6fea00 ("bpf: verifier: refactor check_attach_btf_id()") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Leon Hwang Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20230917153846.88732-1-hffilwlqm@gmail.com --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 024e8b28c34b8..49f8b691496c4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1307,7 +1307,7 @@ static inline int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, static inline struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info) { - return ERR_PTR(-EOPNOTSUPP); + return NULL; } static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {} #define DEFINE_BPF_DISPATCHER(name) From 81335f90e8a88b81932df011105c46e708744f44 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 18 Sep 2023 14:01:10 -0700 Subject: [PATCH 04/11] bpf: unconditionally reset backtrack_state masks on global func exit In mark_chain_precision() logic, when we reach the entry to a global func, it is expected that R1-R5 might be still requested to be marked precise. This would correspond to some integer input arguments being tracked as precise. This is all expected and handled as a special case. What's not expected is that we'll leave backtrack_state structure with some register bits set. 
This is because for subsequent precision propagations backtrack_state is reused without clearing masks, as all code paths are carefully written in a way to leave empty backtrack_state with zeroed out masks, for speed. The fix is trivial, we always clear register bit in the register mask, and then, optionally, set reg->precise if register is SCALAR_VALUE type. Reported-by: Chris Mason Fixes: be2ef8161572 ("bpf: allow precision tracking for programs with subprogs") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230918210110.2241458-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bb78212fa5b27..c0c7d137066ac 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4047,11 +4047,9 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) bitmap_from_u64(mask, bt_reg_mask(bt)); for_each_set_bit(i, mask, 32) { reg = &st->frame[0]->regs[i]; - if (reg->type != SCALAR_VALUE) { - bt_clear_reg(bt, i); - continue; - } - reg->precise = true; + bt_clear_reg(bt, i); + if (reg->type == SCALAR_VALUE) + reg->precise = true; } return 0; } From 9b7177b1df64b8d7f85700027c324aadd6aded00 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 25 Sep 2023 20:52:58 -0700 Subject: [PATCH 05/11] bpf: tcp_read_skb needs to pop skb regardless of seq Before fix e5c6de5fa0258 tcp_read_skb() would increment the tp->copied-seq value. This (as described in the commit) would cause an error for apps because once that is incremented the application might believe there is no data to be read. Then some apps would stall or abort believing no data is available. However, the fix is incomplete because it introduces another issue in the skb dequeue. The loop does tcp_recv_skb() in a while loop to consume as many skbs as possible. The problem is the call is ... tcp_recv_skb(sk, seq, &offset) ... where 'seq' is: u32 seq = tp->copied_seq; Now we can hit a case where we've yet incremented copied_seq from BPF side, but then tcp_recv_skb() fails this test ... if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) ... so that instead of returning the skb we call tcp_eat_recv_skb() which frees the skb. This is because the routine believes the SKB has been collapsed per comment: /* This looks weird, but this can happen if TCP collapsing * splitted a fat GRO packet, while we released socket lock * in skb_splice_bits() */ This can't happen here we've unlinked the full SKB and orphaned it. Anyways it would confuse any BPF programs if the data were suddenly moved underneath it. To fix this situation do simpler operation and just skb_peek() the data of the queue followed by the unlink. It shouldn't need to check this condition and tcp_read_skb() reads entire skbs so there is no need to handle the 'offset!=0' case as we would see in tcp_read_sock(). 
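To make the failure mode concrete, a small worked example with hypothetical sequence numbers (illustrative only, not taken from a real trace):

  /* Head of sk_receive_queue: TCP_SKB_CB(skb)->seq = 1000, skb->len = 100
   * tp->copied_seq           = 1100 (already advanced by the BPF recvmsg path)
   *
   * tcp_recv_skb(sk, seq = 1100, &offset) computes offset = 1100 - 1000 = 100:
   *
   *     offset < skb->len                        ->  100 < 100  ->  false
   *     TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN  ->  false
   *
   * so the skb is freed via tcp_eat_recv_skb() instead of being returned,
   * even though its payload was never handed to the recv_actor. Peeking and
   * unlinking the head skb directly avoids this seq/offset bookkeeping
   * entirely.
   */
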
Fixes: e5c6de5fa0258 ("bpf, sockmap: Incorrectly handling copied_seq") Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20230926035300.135096-2-john.fastabend@gmail.com --- net/ipv4/tcp.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0c3040a63ebdb..3f66cdeef7dec 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1621,16 +1621,13 @@ EXPORT_SYMBOL(tcp_read_sock); int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { - struct tcp_sock *tp = tcp_sk(sk); - u32 seq = tp->copied_seq; struct sk_buff *skb; int copied = 0; - u32 offset; if (sk->sk_state == TCP_LISTEN) return -ENOTCONN; - while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { + while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { u8 tcp_flags; int used; @@ -1643,13 +1640,10 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) copied = used; break; } - seq += used; copied += used; - if (tcp_flags & TCPHDR_FIN) { - ++seq; + if (tcp_flags & TCPHDR_FIN) break; - } } return copied; } From da9e915eaf5dadb1963b7738cdfa42ed55212445 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 25 Sep 2023 20:52:59 -0700 Subject: [PATCH 06/11] bpf, sockmap: Do not inc copied_seq when PEEK flag set When data is peek'd off the receive queue we shouldn't considered it copied from tcp_sock side. When we increment copied_seq this will confuse tcp_data_ready() because copied_seq can be arbitrarily increased. From application side it results in poll() operations not waking up when expected. Notice tcp stack without BPF recvmsg programs also does not increment copied_seq. We broke this when we moved copied_seq into recvmsg to only update when actual copy was happening. But, it wasn't working correctly either before because the tcp_data_ready() tried to use the copied_seq value to see if data was read by user yet. See fixes tags. Fixes: e5c6de5fa0258 ("bpf, sockmap: Incorrectly handling copied_seq") Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20230926035300.135096-3-john.fastabend@gmail.com --- net/ipv4/tcp_bpf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 81f0dff69e0b6..3272682030015 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -222,6 +222,7 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, int *addr_len) { struct tcp_sock *tcp = tcp_sk(sk); + int peek = flags & MSG_PEEK; u32 seq = tcp->copied_seq; struct sk_psock *psock; int copied = 0; @@ -311,7 +312,8 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, copied = -EAGAIN; } out: - WRITE_ONCE(tcp->copied_seq, seq); + if (!peek) + WRITE_ONCE(tcp->copied_seq, seq); tcp_rcv_space_adjust(sk); if (copied > 0) __tcp_cleanup_rbuf(sk, copied); From 5f405c0c0c4651b991c109cf9be33bb996af098e Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 25 Sep 2023 20:53:00 -0700 Subject: [PATCH 07/11] bpf, sockmap: Add tests for MSG_F_PEEK Test that we can read with MSG_F_PEEK and then still get correct number of available bytes through FIONREAD. The recv() (without PEEK) then returns the bytes as expected. The recv() always worked though because it was just the available byte reporting that was broke before latest fixes. 
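The expected behaviour can be condensed into a short sketch of the test added below (buffer sizes are illustrative):

  sent  = xsend(p1, snd, sizeof(snd), 0);       /* send N bytes            */
  recvd = recv(c1, rcv, sizeof(rcv), MSG_PEEK); /* peek: data stays queued */
  ioctl(c1, FIONREAD, &avail);                  /* expect avail == N       */
  recvd = recv(c1, rcv, sizeof(rcv), 0);        /* consume the data        */
  ioctl(c1, FIONREAD, &avail);                  /* expect avail == 0       */
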
Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20230926035300.135096-4-john.fastabend@gmail.com --- .../selftests/bpf/prog_tests/sockmap_basic.c | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 064cc5e8d9ade..dda7060e86a09 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -475,6 +475,55 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog) test_sockmap_drop_prog__destroy(drop); } +static void test_sockmap_skb_verdict_peek(void) +{ + int err, map, verdict, s, c1, p1, zero = 0, sent, recvd, avail; + struct test_sockmap_pass_prog *pass; + char snd[256] = "0123456789"; + char rcv[256] = "0"; + + pass = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(pass, "open_and_load")) + return; + verdict = bpf_program__fd(pass->progs.prog_skb_verdict); + map = bpf_map__fd(pass->maps.sock_map_rx); + + err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach")) + goto out; + + s = socket_loopback(AF_INET, SOCK_STREAM); + if (!ASSERT_GT(s, -1, "socket_loopback(s)")) + goto out; + + err = create_pair(s, AF_INET, SOCK_STREAM, &c1, &p1); + if (!ASSERT_OK(err, "create_pairs(s)")) + goto out; + + err = bpf_map_update_elem(map, &zero, &c1, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem(c1)")) + goto out_close; + + sent = xsend(p1, snd, sizeof(snd), 0); + ASSERT_EQ(sent, sizeof(snd), "xsend(p1)"); + recvd = recv(c1, rcv, sizeof(rcv), MSG_PEEK); + ASSERT_EQ(recvd, sizeof(rcv), "recv(c1)"); + err = ioctl(c1, FIONREAD, &avail); + ASSERT_OK(err, "ioctl(FIONREAD) error"); + ASSERT_EQ(avail, sizeof(snd), "after peek ioctl(FIONREAD)"); + recvd = recv(c1, rcv, sizeof(rcv), 0); + ASSERT_EQ(recvd, sizeof(rcv), "recv(p0)"); + err = ioctl(c1, FIONREAD, &avail); + ASSERT_OK(err, "ioctl(FIONREAD) error"); + ASSERT_EQ(avail, 0, "after read ioctl(FIONREAD)"); + +out_close: + close(c1); + close(p1); +out: + test_sockmap_pass_prog__destroy(pass); +} + void test_sockmap_basic(void) { if (test__start_subtest("sockmap create_update_free")) @@ -515,4 +564,6 @@ void test_sockmap_basic(void) test_sockmap_skb_verdict_fionread(true); if (test__start_subtest("sockmap skb_verdict fionread on drop")) test_sockmap_skb_verdict_fionread(false); + if (test__start_subtest("sockmap skb_verdict msg_f_peek")) + test_sockmap_skb_verdict_peek(); } From b80e31baa43614e086a9d29dc1151932b1bd7fc5 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 20 Sep 2023 12:20:55 +0200 Subject: [PATCH 08/11] bpf, sockmap: Reject sk_msg egress redirects to non-TCP sockets With a SOCKMAP/SOCKHASH map and an sk_msg program user can steer messages sent from one TCP socket (s1) to actually egress from another TCP socket (s2): tcp_bpf_sendmsg(s1) // = sk_prot->sendmsg tcp_bpf_send_verdict(s1) // __SK_REDIRECT case tcp_bpf_sendmsg_redir(s2) tcp_bpf_push_locked(s2) tcp_bpf_push(s2) tcp_rate_check_app_limited(s2) // expects tcp_sock tcp_sendmsg_locked(s2) // ditto There is a hard-coded assumption in the call-chain, that the egress socket (s2) is a TCP socket. However in commit 122e6c79efe1 ("sock_map: Update sock type checks for UDP") we have enabled redirects to non-TCP sockets. This was done for the sake of BPF sk_skb programs. There was no indention to support sk_msg send-to-egress use case. 
As a result, attempts to send-to-egress through a non-TCP socket lead to a crash due to invalid downcast from sock to tcp_sock: BUG: kernel NULL pointer dereference, address: 000000000000002f ... Call Trace: ? show_regs+0x60/0x70 ? __die+0x1f/0x70 ? page_fault_oops+0x80/0x160 ? do_user_addr_fault+0x2d7/0x800 ? rcu_is_watching+0x11/0x50 ? exc_page_fault+0x70/0x1c0 ? asm_exc_page_fault+0x27/0x30 ? tcp_tso_segs+0x14/0xa0 tcp_write_xmit+0x67/0xce0 __tcp_push_pending_frames+0x32/0xf0 tcp_push+0x107/0x140 tcp_sendmsg_locked+0x99f/0xbb0 tcp_bpf_push+0x19d/0x3a0 tcp_bpf_sendmsg_redir+0x55/0xd0 tcp_bpf_send_verdict+0x407/0x550 tcp_bpf_sendmsg+0x1a1/0x390 inet_sendmsg+0x6a/0x70 sock_sendmsg+0x9d/0xc0 ? sockfd_lookup_light+0x12/0x80 __sys_sendto+0x10e/0x160 ? syscall_enter_from_user_mode+0x20/0x60 ? __this_cpu_preempt_check+0x13/0x20 ? lockdep_hardirqs_on+0x82/0x110 __x64_sys_sendto+0x1f/0x30 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd Reject selecting a non-TCP sockets as redirect target from a BPF sk_msg program to prevent the crash. When attempted, user will receive an EACCES error from send/sendto/sendmsg() syscall. Fixes: 122e6c79efe1 ("sock_map: Update sock type checks for UDP") Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20230920102055.42662-1-jakub@cloudflare.com --- net/core/sock_map.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index cb11750b1df5f..4292c2ed18286 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -668,6 +668,8 @@ BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg, sk = __sock_map_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; + if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) + return SK_DROP; msg->flags = flags; msg->sk_redir = sk; @@ -1267,6 +1269,8 @@ BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg, sk = __sock_hash_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; + if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) + return SK_DROP; msg->flags = flags; msg->sk_redir = sk; From f9b0e1088bbf35933e25c839b75094039059b3be Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 29 Sep 2023 22:41:20 +0200 Subject: [PATCH 09/11] bpf, mprog: Fix maximum program check on mprog attachment After Paul's recent improvement to syzkaller to improve coverage for bpf_mprog and tcx, it hit a splat that the program limit was surpassed. What happened is that the maximum number of progs got added, followed by another prog add request which adds with BPF_F_BEFORE flag relative to the last program in the array. The idx >= bpf_mprog_max() check in bpf_mprog_attach() still passes because the index is below the maximum but the maximum will be surpassed. We need to add a check upfront for insertions to catch this situation. 
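A hypothetical walk-through of the overflow, assuming bpf_mprog_max() is 63 (the limit the selftest in the next patch is written against):

  /* bpf_mprog_total(entry) == 63     -> the array is already full
   * attach one more program with BPF_F_BEFORE, relative to the last one
   *     -> the resolved index is 62, so the existing
   *        "idx >= bpf_mprog_max()" check does not fire (62 < 63)
   *     -> completing the insertion would raise the total to 64
   *
   * The new upfront bpf_mprog_total(entry) == bpf_mprog_max() test
   * rejects this with -ERANGE before any insertion position is computed.
   */
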
Fixes: 053c8e1f235d ("bpf: Add generic attach/detach/query API for multi-progs") Reported-by: syzbot+baa44e3dbbe48e05c1ad@syzkaller.appspotmail.com Reported-by: syzbot+b97d20ed568ce0951a06@syzkaller.appspotmail.com Reported-by: syzbot+2558ca3567a77b7af4e3@syzkaller.appspotmail.com Co-developed-by: Nikolay Aleksandrov Signed-off-by: Nikolay Aleksandrov Signed-off-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Tested-by: syzbot+baa44e3dbbe48e05c1ad@syzkaller.appspotmail.com Tested-by: syzbot+b97d20ed568ce0951a06@syzkaller.appspotmail.com Link: https://github.com/google/syzkaller/pull/4207 Link: https://lore.kernel.org/bpf/20230929204121.20305-1-daniel@iogearbox.net --- kernel/bpf/mprog.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/mprog.c b/kernel/bpf/mprog.c index 32d2c4829eb8d..007d98c799e2d 100644 --- a/kernel/bpf/mprog.c +++ b/kernel/bpf/mprog.c @@ -253,6 +253,9 @@ int bpf_mprog_attach(struct bpf_mprog_entry *entry, goto out; } idx = tidx; + } else if (bpf_mprog_total(entry) == bpf_mprog_max()) { + ret = -ERANGE; + goto out; } if (flags & BPF_F_BEFORE) { tidx = bpf_mprog_pos_before(entry, &rtuple); From d1a783daa443d34e2f39811573ac9cbb5e5b78f3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 29 Sep 2023 22:41:21 +0200 Subject: [PATCH 10/11] selftest/bpf: Add various selftests for program limits Add various tests to check maximum number of supported programs being attached: # ./vmtest.sh -- ./test_progs -t tc_opts [...] ./test_progs -t tc_opts [ 1.185325] bpf_testmod: loading out-of-tree module taints kernel. [ 1.186826] bpf_testmod: module verification failed: signature and/or required key missing - tainting kernel [ 1.270123] tsc: Refined TSC clocksource calibration: 3407.988 MHz [ 1.272428] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x311fc932722, max_idle_ns: 440795381586 ns [ 1.276408] clocksource: Switched to clocksource tsc #252 tc_opts_after:OK #253 tc_opts_append:OK #254 tc_opts_basic:OK #255 tc_opts_before:OK #256 tc_opts_chain_classic:OK #257 tc_opts_chain_mixed:OK #258 tc_opts_delete_empty:OK #259 tc_opts_demixed:OK #260 tc_opts_detach:OK #261 tc_opts_detach_after:OK #262 tc_opts_detach_before:OK #263 tc_opts_dev_cleanup:OK #264 tc_opts_invalid:OK #265 tc_opts_max:OK <--- (new test) #266 tc_opts_mixed:OK #267 tc_opts_prepend:OK #268 tc_opts_replace:OK #269 tc_opts_revision:OK Summary: 18/0 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20230929204121.20305-2-daniel@iogearbox.net --- .../selftests/bpf/prog_tests/tc_opts.c | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/tc_opts.c b/tools/testing/selftests/bpf/prog_tests/tc_opts.c index 7a2ecd4eca5dd..99af79ea21a9c 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_opts.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_opts.c @@ -2378,3 +2378,87 @@ void serial_test_tc_opts_chain_mixed(void) test_tc_chain_mixed(BPF_TCX_INGRESS); test_tc_chain_mixed(BPF_TCX_EGRESS); } + +static int generate_dummy_prog(void) +{ + const struct bpf_insn prog_insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + const size_t prog_insn_cnt = sizeof(prog_insns) / sizeof(struct bpf_insn); + LIBBPF_OPTS(bpf_prog_load_opts, opts); + const size_t log_buf_sz = 256; + char *log_buf; + int fd = -1; + + log_buf = malloc(log_buf_sz); + if (!ASSERT_OK_PTR(log_buf, "log_buf_alloc")) + return fd; + opts.log_buf = log_buf; + opts.log_size = log_buf_sz; + + log_buf[0] 
= '\0'; + opts.log_level = 0; + fd = bpf_prog_load(BPF_PROG_TYPE_SCHED_CLS, "tcx_prog", "GPL", + prog_insns, prog_insn_cnt, &opts); + ASSERT_STREQ(log_buf, "", "log_0"); + ASSERT_GE(fd, 0, "prog_fd"); + free(log_buf); + return fd; +} + +static void test_tc_opts_max_target(int target, int flags, bool relative) +{ + int err, ifindex, i, prog_fd, last_fd = -1; + LIBBPF_OPTS(bpf_prog_attach_opts, opta); + const int max_progs = 63; + + ASSERT_OK(system("ip link add dev tcx_opts1 type veth peer name tcx_opts2"), "add veth"); + ifindex = if_nametoindex("tcx_opts1"); + ASSERT_NEQ(ifindex, 0, "non_zero_ifindex"); + + assert_mprog_count_ifindex(ifindex, target, 0); + + for (i = 0; i < max_progs; i++) { + prog_fd = generate_dummy_prog(); + if (!ASSERT_GE(prog_fd, 0, "dummy_prog")) + goto cleanup; + err = bpf_prog_attach_opts(prog_fd, ifindex, target, &opta); + if (!ASSERT_EQ(err, 0, "prog_attach")) + goto cleanup; + assert_mprog_count_ifindex(ifindex, target, i + 1); + if (i == max_progs - 1 && relative) + last_fd = prog_fd; + else + close(prog_fd); + } + + prog_fd = generate_dummy_prog(); + if (!ASSERT_GE(prog_fd, 0, "dummy_prog")) + goto cleanup; + opta.flags = flags; + if (last_fd > 0) + opta.relative_fd = last_fd; + err = bpf_prog_attach_opts(prog_fd, ifindex, target, &opta); + ASSERT_EQ(err, -ERANGE, "prog_64_attach"); + assert_mprog_count_ifindex(ifindex, target, max_progs); + close(prog_fd); +cleanup: + if (last_fd > 0) + close(last_fd); + ASSERT_OK(system("ip link del dev tcx_opts1"), "del veth"); + ASSERT_EQ(if_nametoindex("tcx_opts1"), 0, "dev1_removed"); + ASSERT_EQ(if_nametoindex("tcx_opts2"), 0, "dev2_removed"); +} + +void serial_test_tc_opts_max(void) +{ + test_tc_opts_max_target(BPF_TCX_INGRESS, 0, false); + test_tc_opts_max_target(BPF_TCX_EGRESS, 0, false); + + test_tc_opts_max_target(BPF_TCX_INGRESS, BPF_F_BEFORE, false); + test_tc_opts_max_target(BPF_TCX_EGRESS, BPF_F_BEFORE, true); + + test_tc_opts_max_target(BPF_TCX_INGRESS, BPF_F_AFTER, true); + test_tc_opts_max_target(BPF_TCX_EGRESS, BPF_F_AFTER, false); +} From 9077fc228f09c9f975c498c55f5d2e882cd0da59 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Thu, 28 Sep 2023 18:15:58 +0800 Subject: [PATCH 11/11] bpf: Use kmalloc_size_roundup() to adjust size_index Commit d52b59315bf5 ("bpf: Adjust size_index according to the value of KMALLOC_MIN_SIZE") uses KMALLOC_MIN_SIZE to adjust size_index, but as reported by Nathan, the adjustment is not enough, because __kmalloc_minalign() also decides the minimal alignment of slab object as shown in new_kmalloc_cache() and its value may be greater than KMALLOC_MIN_SIZE (e.g., 64 bytes vs 8 bytes under a riscv QEMU VM). Instead of invoking __kmalloc_minalign() in bpf subsystem to find the maximal alignment, just using kmalloc_size_roundup() directly to get the corresponding slab object size for each allocation size. If these two sizes are unmatched, adjust size_index to select a bpf_mem_cache with unit_size equal to the object_size of the underlying slab cache for the allocation size. 
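As a hypothetical worked example of the adjustment, assume an arch where the minimal kmalloc alignment is 64 bytes (as in the riscv report above), so there is no 96-byte slab cache:

  /* size = 96
   * kmalloc_size_roundup(96) == 128    -> differs from 96, so adjust
   * index = size_index[(128 - 1) / 8]  -> index of the 128-byte cache
   * size_index[(96 - 1) / 8] = index
   *
   * A 96-byte bpf_mem_alloc() request then selects the bpf_mem_cache whose
   * unit_size (128) matches the object_size of the slab cache that actually
   * backs the allocation.
   */
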
Fixes: 822fb26bdb55 ("bpf: Add a hint to allocated objects.") Reported-by: Nathan Chancellor Closes: https://lore.kernel.org/bpf/20230914181407.GA1000274@dev-arch.thelio-3990X/ Signed-off-by: Hou Tao Tested-by: Emil Renner Berthing Link: https://lore.kernel.org/r/20230928101558.2594068-1-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 44 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index cf19415166437..d93ddac283d40 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -965,37 +965,31 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) return !ret ? NULL : ret + LLIST_NODE_SZ; } -/* Most of the logic is taken from setup_kmalloc_cache_index_table() */ static __init int bpf_mem_cache_adjust_size(void) { - unsigned int size, index; + unsigned int size; - /* Normally KMALLOC_MIN_SIZE is 8-bytes, but it can be - * up-to 256-bytes. + /* Adjusting the indexes in size_index() according to the object_size + * of underlying slab cache, so bpf_mem_alloc() will select a + * bpf_mem_cache with unit_size equal to the object_size of + * the underlying slab cache. + * + * The maximal value of KMALLOC_MIN_SIZE and __kmalloc_minalign() is + * 256-bytes, so only do adjustment for [8-bytes, 192-bytes]. */ - size = KMALLOC_MIN_SIZE; - if (size <= 192) - index = size_index[(size - 1) / 8]; - else - index = fls(size - 1) - 1; - for (size = 8; size < KMALLOC_MIN_SIZE && size <= 192; size += 8) - size_index[(size - 1) / 8] = index; + for (size = 192; size >= 8; size -= 8) { + unsigned int kmalloc_size, index; - /* The minimal alignment is 64-bytes, so disable 96-bytes cache and - * use 128-bytes cache instead. - */ - if (KMALLOC_MIN_SIZE >= 64) { - index = size_index[(128 - 1) / 8]; - for (size = 64 + 8; size <= 96; size += 8) - size_index[(size - 1) / 8] = index; - } + kmalloc_size = kmalloc_size_roundup(size); + if (kmalloc_size == size) + continue; - /* The minimal alignment is 128-bytes, so disable 192-bytes cache and - * use 256-bytes cache instead. - */ - if (KMALLOC_MIN_SIZE >= 128) { - index = fls(256 - 1) - 1; - for (size = 128 + 8; size <= 192; size += 8) + if (kmalloc_size <= 192) + index = size_index[(kmalloc_size - 1) / 8]; + else + index = fls(kmalloc_size - 1) - 1; + /* Only overwrite if necessary */ + if (size_index[(size - 1) / 8] != index) size_index[(size - 1) / 8] = index; }