From 1fd6e5675336daf4747940b4285e84b0c114ae32 Mon Sep 17 00:00:00 2001
From: Johan Almbladh <johan.almbladh@anyfinetworks.com>
Date: Tue, 5 Jul 2022 10:23:45 +0200
Subject: [PATCH 1/3] xdp: Fix spurious packet loss in generic XDP TX path

The byte queue limits (BQL) mechanism is intended to move queuing from
the driver to the network stack in order to reduce latency caused by
excessive queuing in hardware. However, when transmitting or redirecting
a packet using generic XDP, the qdisc layer is bypassed and there are no
additional queues. Since netif_xmit_stopped() also takes BQL limits into
account, but without having any alternative queuing, packets are
silently dropped.

This patch modifies the drop condition to only consider cases when the
driver itself cannot accept any more packets. This is analogous to the
condition in __dev_direct_xmit(). Dropped packets are also counted on
the device.

Bypassing the qdisc layer in the generic XDP TX path means that XDP
packets are able to starve other packets going through a qdisc, and
DDOS attacks will be more effective. In-driver-XDP use dedicated TX
queues, so they do not have this starvation issue.

Signed-off-by: Johan Almbladh <johan.almbladh@anyfinetworks.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220705082345.2494312-1-johan.almbladh@anyfinetworks.com
---
 net/core/dev.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 8e6f229612066..30a1603a7225c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4863,7 +4863,10 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 }
 
 /* When doing generic XDP we have to bypass the qdisc layer and the
- * network taps in order to match in-driver-XDP behavior.
+ * network taps in order to match in-driver-XDP behavior. This also means
+ * that XDP packets are able to starve other packets going through a qdisc,
+ * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX
+ * queues, so they do not have this starvation issue.
  */
 void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
 {
@@ -4875,7 +4878,7 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
 	txq = netdev_core_pick_tx(dev, skb, NULL);
 	cpu = smp_processor_id();
 	HARD_TX_LOCK(dev, txq, cpu);
-	if (!netif_xmit_stopped(txq)) {
+	if (!netif_xmit_frozen_or_drv_stopped(txq)) {
 		rc = netdev_start_xmit(skb, dev, txq, 0);
 		if (dev_xmit_complete(rc))
 			free_skb = false;
@@ -4883,6 +4886,7 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
 	HARD_TX_UNLOCK(dev, txq);
 	if (free_skb) {
 		trace_xdp_exception(dev, xdp_prog, XDP_TX);
+		dev_core_stats_tx_dropped_inc(dev);
 		kfree_skb(skb);
 	}
 }

From 0326195f523a549e0a9d7fd44c70b26fd7265090 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 7 Jul 2022 12:39:00 +0000
Subject: [PATCH 2/3] bpf: Make sure mac_header was set before using it

Classic BPF has a way to load bytes starting from the mac header.

Some skbs do not have a mac header, and skb_mac_header()
in this case is returning a pointer that 65535 bytes after
skb->head.

Existing range check in bpf_internal_load_pointer_neg_helper()
was properly kicking and no illegal access was happening.

New sanity check in skb_mac_header() is firing, so we need
to avoid it.

WARNING: CPU: 1 PID: 28990 at include/linux/skbuff.h:2785 skb_mac_header include/linux/skbuff.h:2785 [inline]
WARNING: CPU: 1 PID: 28990 at include/linux/skbuff.h:2785 bpf_internal_load_pointer_neg_helper+0x1b1/0x1c0 kernel/bpf/core.c:74
Modules linked in:
CPU: 1 PID: 28990 Comm: syz-executor.0 Not tainted 5.19.0-rc4-syzkaller-00865-g4874fb9484be #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 06/29/2022
RIP: 0010:skb_mac_header include/linux/skbuff.h:2785 [inline]
RIP: 0010:bpf_internal_load_pointer_neg_helper+0x1b1/0x1c0 kernel/bpf/core.c:74
Code: ff ff 45 31 f6 e9 5a ff ff ff e8 aa 27 40 00 e9 3b ff ff ff e8 90 27 40 00 e9 df fe ff ff e8 86 27 40 00 eb 9e e8 2f 2c f3 ff <0f> 0b eb b1 e8 96 27 40 00 e9 79 fe ff ff 90 41 57 41 56 41 55 41
RSP: 0018:ffffc9000309f668 EFLAGS: 00010216
RAX: 0000000000000118 RBX: ffffffffffeff00c RCX: ffffc9000e417000
RDX: 0000000000040000 RSI: ffffffff81873f21 RDI: 0000000000000003
RBP: ffff8880842878c0 R08: 0000000000000003 R09: 000000000000ffff
R10: 000000000000ffff R11: 0000000000000001 R12: 0000000000000004
R13: ffff88803ac56c00 R14: 000000000000ffff R15: dffffc0000000000
FS: 00007f5c88a16700(0000) GS:ffff8880b9b00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fdaa9f6c058 CR3: 000000003a82c000 CR4: 00000000003506e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
<TASK>
____bpf_skb_load_helper_32 net/core/filter.c:276 [inline]
bpf_skb_load_helper_32+0x191/0x220 net/core/filter.c:264

Fixes: f9aefd6b2aa3 ("net: warn if mac header was not set")
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220707123900.945305-1-edumazet@google.com
---
 kernel/bpf/core.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 5f6f3f829b368..e7961508a47d9 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -68,11 +68,13 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
 {
 	u8 *ptr = NULL;
 
-	if (k >= SKF_NET_OFF)
+	if (k >= SKF_NET_OFF) {
 		ptr = skb_network_header(skb) + k - SKF_NET_OFF;
-	else if (k >= SKF_LL_OFF)
+	} else if (k >= SKF_LL_OFF) {
+		if (unlikely(!skb_mac_header_was_set(skb)))
+			return NULL;
 		ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
-
+	}
 	if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
 		return ptr;
 

From f8d3da4ef8faf027261e06b7864583930dd7c7b9 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Wed, 6 Jul 2022 16:25:47 -0700
Subject: [PATCH 3/3] bpf: Add flags arg to bpf_dynptr_read and
 bpf_dynptr_write APIs

Commit 13bbbfbea759 ("bpf: Add bpf_dynptr_read and bpf_dynptr_write")
added the bpf_dynptr_write() and bpf_dynptr_read() APIs.

However, it will be needed for some dynptr types to pass in flags as
well (e.g. when writing to a skb, the user may like to invalidate the
hash or recompute the checksum).

This patch adds a "u64 flags" arg to the bpf_dynptr_read() and
bpf_dynptr_write() APIs before their UAPI signature freezes where
we then cannot change them anymore with a 5.19.x released kernel.

Fixes: 13bbbfbea759 ("bpf: Add bpf_dynptr_read and bpf_dynptr_write")
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/r/20220706232547.4016651-1-joannelkoong@gmail.com
---
 include/uapi/linux/bpf.h                           | 11 +++++++----
 kernel/bpf/helpers.c                               | 12 ++++++++----
 tools/include/uapi/linux/bpf.h                     | 11 +++++++----
 tools/testing/selftests/bpf/progs/dynptr_fail.c    | 10 +++++-----
 tools/testing/selftests/bpf/progs/dynptr_success.c |  4 ++--
 5 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f4009dbdf62da..ef78e0e1a7549 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5222,22 +5222,25 @@ union bpf_attr {
  *	Return
  *		Nothing. Always succeeds.
  *
- * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset)
+ * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags)
  *	Description
  *		Read *len* bytes from *src* into *dst*, starting from *offset*
  *		into *src*.
+ *		*flags* is currently unused.
  *	Return
  *		0 on success, -E2BIG if *offset* + *len* exceeds the length
- *		of *src*'s data, -EINVAL if *src* is an invalid dynptr.
+ *		of *src*'s data, -EINVAL if *src* is an invalid dynptr or if
+ *		*flags* is not 0.
  *
- * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len)
+ * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags)
  *	Description
  *		Write *len* bytes from *src* into *dst*, starting from *offset*
  *		into *dst*.
+ *		*flags* is currently unused.
  *	Return
  *		0 on success, -E2BIG if *offset* + *len* exceeds the length
  *		of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst*
- *		is a read-only dynptr.
+ *		is a read-only dynptr or if *flags* is not 0.
  *
  * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len)
  *	Description
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 225806a02efbe..bb1254f076672 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1497,11 +1497,12 @@ const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
 	.arg4_type	= ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT,
 };
 
-BPF_CALL_4(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, u32, offset)
+BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src,
+	   u32, offset, u64, flags)
 {
 	int err;
 
-	if (!src->data)
+	if (!src->data || flags)
 		return -EINVAL;
 
 	err = bpf_dynptr_check_off_len(src, offset, len);
@@ -1521,13 +1522,15 @@ const struct bpf_func_proto bpf_dynptr_read_proto = {
 	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
 	.arg3_type	= ARG_PTR_TO_DYNPTR,
 	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
 };
 
-BPF_CALL_4(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len)
+BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
+	   u32, len, u64, flags)
 {
 	int err;
 
-	if (!dst->data || bpf_dynptr_is_rdonly(dst))
+	if (!dst->data || flags || bpf_dynptr_is_rdonly(dst))
 		return -EINVAL;
 
 	err = bpf_dynptr_check_off_len(dst, offset, len);
@@ -1547,6 +1550,7 @@ const struct bpf_func_proto bpf_dynptr_write_proto = {
 	.arg2_type	= ARG_ANYTHING,
 	.arg3_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
 	.arg4_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg5_type	= ARG_ANYTHING,
 };
 
 BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f4009dbdf62da..ef78e0e1a7549 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5222,22 +5222,25 @@ union bpf_attr {
  *	Return
  *		Nothing. Always succeeds.
  *
- * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset)
+ * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags)
  *	Description
  *		Read *len* bytes from *src* into *dst*, starting from *offset*
  *		into *src*.
+ *		*flags* is currently unused.
  *	Return
  *		0 on success, -E2BIG if *offset* + *len* exceeds the length
- *		of *src*'s data, -EINVAL if *src* is an invalid dynptr.
+ *		of *src*'s data, -EINVAL if *src* is an invalid dynptr or if
+ *		*flags* is not 0.
  *
- * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len)
+ * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags)
  *	Description
  *		Write *len* bytes from *src* into *dst*, starting from *offset*
  *		into *dst*.
+ *		*flags* is currently unused.
  *	Return
  *		0 on success, -E2BIG if *offset* + *len* exceeds the length
  *		of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst*
- *		is a read-only dynptr.
+ *		is a read-only dynptr or if *flags* is not 0.
  *
  * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len)
  *	Description
diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c
index d811cff735972..0a26c243e6e9c 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c
@@ -140,12 +140,12 @@ int use_after_invalid(void *ctx)
 
 	bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(read_data), 0, &ptr);
 
-	bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0);
+	bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0, 0);
 
 	bpf_ringbuf_submit_dynptr(&ptr, 0);
 
 	/* this should fail */
-	bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0);
+	bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0, 0);
 
 	return 0;
 }
@@ -338,7 +338,7 @@ int invalid_helper2(void *ctx)
 	get_map_val_dynptr(&ptr);
 
 	/* this should fail */
-	bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 8, 0);
+	bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 8, 0, 0);
 
 	return 0;
 }
@@ -377,7 +377,7 @@ int invalid_write2(void *ctx)
 	memcpy((void *)&ptr + 8, &x, sizeof(x));
 
 	/* this should fail */
-	bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0);
+	bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0, 0);
 
 	bpf_ringbuf_submit_dynptr(&ptr, 0);
 
@@ -473,7 +473,7 @@ int invalid_read2(void *ctx)
 	get_map_val_dynptr(&ptr);
 
 	/* this should fail */
-	bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 1, 0);
+	bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 1, 0, 0);
 
 	return 0;
 }
diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c
index d67be48df4b2b..a3a6103c85694 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_success.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_success.c
@@ -43,10 +43,10 @@ int test_read_write(void *ctx)
 	bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(write_data), 0, &ptr);
 
 	/* Write data into the dynptr */
-	err = err ?: bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data));
+	err = bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data), 0);
 
 	/* Read the data that was written into the dynptr */
-	err = err ?: bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0);
+	err = err ?: bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0, 0);
 
 	/* Ensure the data we read matches the data we wrote */
 	for (i = 0; i < sizeof(read_data); i++) {