From 2768b2e2f7d25ae8984ebdcde8ec1014b6fdcd89 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean
Date: Thu, 17 Apr 2025 15:00:03 +0300
Subject: [PATCH 1/3] net: enetc: register XDP RX queues with frag_size

At the time when bpf_xdp_adjust_tail() gained support for non-linear
buffers, ENETC was already generating this kind of geometry on RX, due
to its use of 2K half page buffers. Frames larger than 1472 bytes
(without FCS) are stored as multi-buffer, so multi-buffer support needs
to work properly even in standard MTU circumstances.

Allow bpf_xdp_frags_increase_tail() to know the allocation size of
paged data, so it can safely permit growing the tailroom of the buffer
from XDP programs.

Fixes: bf25146a5595 ("bpf: add frags support to the bpf_xdp_adjust_tail() API")
Signed-off-by: Vladimir Oltean
Reviewed-by: Wei Fang
Link: https://patch.msgid.link/20250417120005.3288549-2-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski
---
 drivers/net/ethernet/freescale/enetc/enetc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c
index 2106861463e4..9b333254c73e 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc.c
@@ -3362,7 +3362,8 @@ static int enetc_int_vector_init(struct enetc_ndev_priv *priv, int i,
 		bdr->buffer_offset = ENETC_RXB_PAD;
 		priv->rx_ring[i] = bdr;
 
-		err = xdp_rxq_info_reg(&bdr->xdp.rxq, priv->ndev, i, 0);
+		err = __xdp_rxq_info_reg(&bdr->xdp.rxq, priv->ndev, i, 0,
+					 ENETC_RXB_DMA_SIZE_XDP);
 		if (err)
 			goto free_vector;

From 1d587faa5be7e9785b682cc5f58ba8f4100c13ea Mon Sep 17 00:00:00 2001
From: Vladimir Oltean
Date: Thu, 17 Apr 2025 15:00:04 +0300
Subject: [PATCH 2/3] net: enetc: refactor bulk flipping of RX buffers
 to separate function

This small snippet of code ensures that we do something with the array
of RX software buffer descriptor elements after passing the skb to the
stack. In this case, we see if the other half of the page is reusable,
and if so, we "turn around" the buffers, making them directly usable by
enetc_refill_rx_ring() without going to enetc_new_page().

We will need to perform this kind of buffer flipping from a new code
path, i.e. from XDP_PASS. Currently, enetc_build_skb() does it there,
buffer by buffer, but in a subsequent change we will stop using
enetc_build_skb() for XDP_PASS.
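To make the "flip" concrete: each 4K page backs two 2K RX buffers, and
turning a buffer around means pointing the software BD at the other
half of the page and taking a page reference for it. A simplified
sketch of the concept (illustrative names only - the real
enetc_flip_rx_buff() also checks whether the page is reusable and
resyncs it for DMA):

  /* Toy model of the half-page flip; not the driver code. */
  struct toy_rx_swbd {
          struct page *page;
          u16 page_offset;        /* 0 or 2048 within the page */
  };

  static void toy_flip_rx_buff(struct toy_rx_swbd *swbd)
  {
          /* Point the software BD at the other 2K half... */
          swbd->page_offset ^= 2048;
          /* ...and pin that half for the ring, so the stack freeing
           * the skb that owns the first half does not free the page.
           */
          page_ref_inc(swbd->page);
  }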
Signed-off-by: Vladimir Oltean
Reviewed-by: Wei Fang
Link: https://patch.msgid.link/20250417120005.3288549-3-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski
---
 drivers/net/ethernet/freescale/enetc/enetc.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c
index 9b333254c73e..74721995cb1f 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc.c
@@ -1850,6 +1850,16 @@ static void enetc_xdp_drop(struct enetc_bdr *rx_ring, int rx_ring_first,
 	}
 }
 
+static void enetc_bulk_flip_buff(struct enetc_bdr *rx_ring, int rx_ring_first,
+				 int rx_ring_last)
+{
+	while (rx_ring_first != rx_ring_last) {
+		enetc_flip_rx_buff(rx_ring,
+				   &rx_ring->rx_swbd[rx_ring_first]);
+		enetc_bdr_idx_inc(rx_ring, &rx_ring_first);
+	}
+}
+
 static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring,
 				   struct napi_struct *napi, int work_limit,
 				   struct bpf_prog *prog)
@@ -1965,11 +1975,7 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring,
 			enetc_xdp_drop(rx_ring, orig_i, i);
 			rx_ring->stats.xdp_redirect_failures++;
 		} else {
-			while (orig_i != i) {
-				enetc_flip_rx_buff(rx_ring,
-						   &rx_ring->rx_swbd[orig_i]);
-				enetc_bdr_idx_inc(rx_ring, &orig_i);
-			}
+			enetc_bulk_flip_buff(rx_ring, orig_i, i);
 			xdp_redirect_frm_cnt++;
 			rx_ring->stats.xdp_redirect++;
 		}

From 020f0c8b3d396ec8190948f86063e1c45133f839 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean
Date: Thu, 17 Apr 2025 15:00:05 +0300
Subject: [PATCH 3/3] net: enetc: fix frame corruption on
 bpf_xdp_adjust_head/tail() and XDP_PASS

Vlatko Markovikj reported that XDP programs attached to ENETC do not
work well if they use bpf_xdp_adjust_head() or bpf_xdp_adjust_tail(),
combined with the XDP_PASS verdict. A typical use case is to add or
remove a VLAN tag.

The resulting sk_buff passed to the stack is corrupted, because the
algorithm used by the driver for XDP_PASS is to unwind the current
buffer pointer in the RX ring and to re-process the current frame with
enetc_build_skb() as if XDP hadn't run. That is incorrect because XDP
may have modified the geometry of the buffer, which we are then
completely unaware of. We are looking at a modified buffer with the
original geometry.

The initial reaction, both from me and from Vlatko, was to shop around
the kernel for code to steal that would calculate a delta between the
old and the new XDP buffer geometry, and apply that to the sk_buff too.
We noticed that veth and generic xdp have such code. The headroom
adjustment is pretty uncontroversial, but what turned out to be
severely problematic is the tailroom.

veth has this snippet:

  __skb_put(skb, off); /* positive on grow, negative on shrink */

which at first sight looks decent enough, except __skb_put() takes an
"unsigned int" as its second argument, and the arithmetic only seems to
work correctly by coincidence. A second issue is that __skb_put()
contains a SKB_LINEAR_ASSERT(), which is not a great pattern to make
more widespread: the skb may still be nonlinear at that point - it only
becomes linear later, when resetting skb->data_len to zero.

To avoid the above, bpf_prog_run_generic_xdp() does this instead:

  skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
  skb->len += off; /* positive on grow, negative on shrink */

which is more open-coded, uses lower-level functions and is in general
a bit too much to spread around in driver code.
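For reference, the full delta-based fixup in bpf_prog_run_generic_xdp()
looks roughly like this (condensed from net/core/dev.c; orig_data and
orig_data_end are snapshots taken before running the program):

  /* Headroom: mirror the movement of xdp->data into the skb */
  off = xdp->data - orig_data;
  if (off > 0)
          __skb_pull(skb, off);
  else if (off < 0)
          __skb_push(skb, -off);

  skb_reset_mac_header(skb);

  /* Tailroom: detect whether bpf_xdp_adjust_tail() was used */
  off = xdp->data_end - orig_data_end;
  if (off != 0) {
          skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
          skb->len += off; /* positive on grow, negative on shrink */
  }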
Then there is the snippet:

  if (xdp_buff_has_frags(xdp))
          skb->data_len = skb_shinfo(skb)->xdp_frags_size;
  else
          skb->data_len = 0;

One would have expected __pskb_trim() to be the function of choice for
this task. But it's not used in veth/xdpgeneric because the extraneous
fragments were _already_ freed by bpf_xdp_adjust_tail() ->
bpf_xdp_frags_shrink_tail() -> ... -> __xdp_return() - the backing
memory for the skb frags and the xdp frags is the same, but they don't
keep individual references.

In fact, that is the biggest reason why this snippet cannot be reused
as-is: ENETC temporarily constructs an skb with the original len and
the original number of frags. Because the extraneous frags have already
been freed by bpf_xdp_adjust_tail() and returned to the page allocator,
the entire approach of using enetc_build_skb() is questionable for
XDP_PASS. To avoid that, one would need to elevate the page refcount of
all frags before calling bpf_prog_run_xdp() and drop it after XDP_PASS
(see the sketch after this patch).

There are other things missing from ENETC's handling of XDP_PASS as
well, for example updating skb_shinfo(skb)->meta_len. These are all
handled correctly and cleanly by commit 539c1fba1ac7 ("xdp: add generic
xdp_build_skb_from_buff()"), added to net-next in Dec 2024, which in
addition might even be quicker. I have a very strong preference towards
backporting that commit for "stable", and that is what this patch uses
to fix the handling bugs. It is way too messy to go this deep into the
guts of an sk_buff from the code of a device driver.

Fixes: d1b15102dd16 ("net: enetc: add support for XDP_DROP and XDP_PASS")
Reported-by: Vlatko Markovikj
Signed-off-by: Vladimir Oltean
Reviewed-by: Wei Fang
Link: https://patch.msgid.link/20250417120005.3288549-4-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski
---
 drivers/net/ethernet/freescale/enetc/enetc.c | 26 +++++++++++---------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c
index 74721995cb1f..3ee52f4b1166 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc.c
@@ -1878,11 +1878,10 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring,
 
 	while (likely(rx_frm_cnt < work_limit)) {
 		union enetc_rx_bd *rxbd, *orig_rxbd;
-		int orig_i, orig_cleaned_cnt;
 		struct xdp_buff xdp_buff;
 		struct sk_buff *skb;
+		int orig_i, err;
 		u32 bd_status;
-		int err;
 
 		rxbd = enetc_rxbd(rx_ring, i);
 		bd_status = le32_to_cpu(rxbd->r.lstatus);
@@ -1897,7 +1896,6 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring,
 			break;
 
 		orig_rxbd = rxbd;
-		orig_cleaned_cnt = cleaned_cnt;
 		orig_i = i;
 
 		enetc_build_xdp_buff(rx_ring, bd_status, &rxbd, &i,
@@ -1925,15 +1923,21 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring,
 			rx_ring->stats.xdp_drops++;
 			break;
 		case XDP_PASS:
-			rxbd = orig_rxbd;
-			cleaned_cnt = orig_cleaned_cnt;
-			i = orig_i;
-
-			skb = enetc_build_skb(rx_ring, bd_status, &rxbd,
-					      &i, &cleaned_cnt,
-					      ENETC_RXB_DMA_SIZE_XDP);
-			if (unlikely(!skb))
+			skb = xdp_build_skb_from_buff(&xdp_buff);
+			/* Probably under memory pressure, stop NAPI */
+			if (unlikely(!skb)) {
+				enetc_xdp_drop(rx_ring, orig_i, i);
+				rx_ring->stats.xdp_drops++;
 				goto out;
+			}
+
+			enetc_get_offloads(rx_ring, orig_rxbd, skb);
+
+			/* These buffers are about to be owned by the stack.
+			 * Update our buffer cache (the rx_swbd array elements)
+			 * with their other page halves.
+			 */
+			enetc_bulk_flip_buff(rx_ring, orig_i, i);
 			napi_gro_receive(napi, skb);
 			break;
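As a footnote to the commit message above: keeping enetc_build_skb()
for XDP_PASS would have required pinning every frag page before running
the program, roughly along these lines (a hypothetical sketch that was
never implemented; enetc_pin_xdp_frags() is an invented name):

  /* Hypothetical: pin frag pages so that bpf_xdp_adjust_tail()
   * freeing a frag cannot leave enetc_build_skb() looking at
   * memory already returned to the page allocator. A matching
   * unpin would be needed after the XDP_PASS skb is built.
   */
  static void enetc_pin_xdp_frags(struct xdp_buff *xdp)
  {
          struct skb_shared_info *sinfo;
          int i;

          if (!xdp_buff_has_frags(xdp))
                  return;

          sinfo = xdp_get_shared_info_from_buff(xdp);
          for (i = 0; i < sinfo->nr_frags; i++)
                  page_ref_inc(skb_frag_page(&sinfo->frags[i]));
  }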