From bc4db83470034b4644a8bf164a984bdb68b61622 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 10 Feb 2023 18:06:13 +0100 Subject: [PATCH 1/6] ice: fix ice_tx_ring:: Xdp_tx_active underflow xdp_tx_active is used to indicate whether an XDP ring has any %XDP_TX frames queued to shortcut processing Tx cleaning for XSk-enabled queues. When !XSk, it simply indicates whether the ring has any queued frames in general. It gets increased on each frame placed onto the ring and counts the whole frame, not each frag. However, currently it gets decremented in ice_clean_xdp_tx_buf(), which is called per each buffer, i.e. per each frag. Thus, on completing multi-frag frames, an underflow happens. Move the decrement to the outer function and do it once per frame, not buf. Also, do that on the stack and update the ring counter after the loop is done to save several cycles. XSk rings are fine since there are no frags at the moment. Fixes: 3246a10752a7 ("ice: Add support for XDP multi-buffer on Tx side") Signed-off-by: Alexander Lobakin Signed-off-by: Daniel Borkmann Acked-by: Maciej Fijalkowski Link: https://lore.kernel.org/bpf/20230210170618.1973430-2-alexandr.lobakin@intel.com --- drivers/net/ethernet/intel/ice/ice_txrx_lib.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c index 9bbed3f14e427..d1a7171e618ba 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c @@ -231,7 +231,6 @@ ice_clean_xdp_tx_buf(struct ice_tx_ring *xdp_ring, struct ice_tx_buf *tx_buf) dma_unmap_single(xdp_ring->dev, dma_unmap_addr(tx_buf, dma), dma_unmap_len(tx_buf, len), DMA_TO_DEVICE); dma_unmap_len_set(tx_buf, len, 0); - xdp_ring->xdp_tx_active--; page_frag_free(tx_buf->raw_buf); tx_buf->raw_buf = NULL; } @@ -246,8 +245,8 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring) u32 ntc = xdp_ring->next_to_clean; struct ice_tx_desc *tx_desc; u32 cnt = xdp_ring->count; + u32 frags, xdp_tx = 0; u32 ready_frames = 0; - u32 frags; u32 idx; u32 ret; @@ -274,6 +273,7 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring) total_pkts++; /* count head + frags */ ready_frames -= frags + 1; + xdp_tx++; if (xdp_ring->xsk_pool) xsk_buff_free(tx_buf->xdp); @@ -295,6 +295,7 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring) tx_desc->cmd_type_offset_bsz = 0; xdp_ring->next_to_clean = ntc; + xdp_ring->xdp_tx_active -= xdp_tx; ice_update_tx_ring_stats(xdp_ring, total_pkts, total_bytes); return ret; From 0bd939b60ceaa6cf2894678f40b96f61b2501658 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 10 Feb 2023 18:06:14 +0100 Subject: [PATCH 2/6] ice: Fix XDP Tx ring overrun Sometimes, under heavy XDP Tx traffic, e.g. when using XDP traffic generator (%BPF_F_TEST_XDP_LIVE_FRAMES), the machine can catch OOM due to the driver not freeing all of the pages passed to it by .ndo_xdp_xmit(). Turned out that during the development of the tagged commit, the check, which ensures that we have a free descriptor to queue a frame, moved into the branch happening only when a buffer has frags. Otherwise, we only run a cleaning cycle, but don't check anything. ATST, there can be situations when the driver gets new frames to send, but there are no buffers that can be cleaned/completed and the ring has no free slots. It's very rare, but still possible (> 6.5 Mpps per ring). The driver then fills the next buffer/descriptor, effectively overwriting the data, which still needs to be freed. Restore the check after the cleaning routine to make sure there is a slot to queue a new frame. When there are frags, there still will be a separate check that we can place all of them, but if the ring is full, there's no point in wasting any more time. (minor: make `!ready_frames` unlikely since it happens ~1-2 times per billion of frames) Fixes: 3246a10752a7 ("ice: Add support for XDP multi-buffer on Tx side") Signed-off-by: Alexander Lobakin Signed-off-by: Daniel Borkmann Acked-by: Maciej Fijalkowski Link: https://lore.kernel.org/bpf/20230210170618.1973430-3-alexandr.lobakin@intel.com --- drivers/net/ethernet/intel/ice/ice_txrx_lib.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c index d1a7171e618ba..784f2f9ebb2d2 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c @@ -260,7 +260,7 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring) ready_frames = idx + cnt - ntc + 1; } - if (!ready_frames) + if (unlikely(!ready_frames)) return 0; ret = ready_frames; @@ -322,17 +322,17 @@ int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring) u32 frag = 0; free_space = ICE_DESC_UNUSED(xdp_ring); - - if (ICE_DESC_UNUSED(xdp_ring) < ICE_RING_QUARTER(xdp_ring)) + if (free_space < ICE_RING_QUARTER(xdp_ring)) free_space += ice_clean_xdp_irq(xdp_ring); + if (unlikely(!free_space)) + goto busy; + if (unlikely(xdp_buff_has_frags(xdp))) { sinfo = xdp_get_shared_info_from_buff(xdp); nr_frags = sinfo->nr_frags; - if (free_space < nr_frags + 1) { - xdp_ring->ring_stats->tx_stats.tx_busy++; - return ICE_XDP_CONSUMED; - } + if (free_space < nr_frags + 1) + goto busy; } tx_desc = ICE_TX_DESC(xdp_ring, ntu); @@ -396,6 +396,11 @@ int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring) ntu--; } return ICE_XDP_CONSUMED; + +busy: + xdp_ring->ring_stats->tx_stats.tx_busy++; + + return ICE_XDP_CONSUMED; } /** From 923096b5cec3a68cc4b3816b1b9a50139df62ac7 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 10 Feb 2023 18:06:15 +0100 Subject: [PATCH 3/6] ice: Remove two impossible branches on XDP Tx cleaning The tagged commit started sending %XDP_TX frames from XSk Rx ring directly without converting it to an &xdp_frame. However, when XSk is enabled on a queue pair, it has its separate Tx cleaning functions, so neither ice_clean_xdp_irq() nor ice_unmap_and_free_tx_buf() ever happens there. Remove impossible branches in order to reduce the diffstat of the upcoming change. Fixes: a24b4c6e9aab ("ice: xsk: Do not convert to buff to frame for XDP_TX") Suggested-by: Maciej Fijalkowski Signed-off-by: Alexander Lobakin Signed-off-by: Daniel Borkmann Acked-by: Maciej Fijalkowski Link: https://lore.kernel.org/bpf/20230210170618.1973430-4-alexandr.lobakin@intel.com --- drivers/net/ethernet/intel/ice/ice_txrx.c | 5 +---- drivers/net/ethernet/intel/ice/ice_txrx_lib.c | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 466113c86e6fd..6b99adb695e7e 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -116,10 +116,7 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct ice_tx_buf *tx_buf) if (tx_buf->tx_flags & ICE_TX_FLAGS_DUMMY_PKT) { devm_kfree(ring->dev, tx_buf->raw_buf); } else if (ice_ring_is_xdp(ring)) { - if (ring->xsk_pool) - xsk_buff_free(tx_buf->xdp); - else - page_frag_free(tx_buf->raw_buf); + page_frag_free(tx_buf->raw_buf); } else { dev_kfree_skb_any(tx_buf->skb); } diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c index 784f2f9ebb2d2..6371acb0deb08 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c @@ -275,10 +275,7 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring) ready_frames -= frags + 1; xdp_tx++; - if (xdp_ring->xsk_pool) - xsk_buff_free(tx_buf->xdp); - else - ice_clean_xdp_tx_buf(xdp_ring, tx_buf); + ice_clean_xdp_tx_buf(xdp_ring, tx_buf); ntc++; if (ntc == cnt) ntc = 0; From aa1d3faf71a6a46f9b859daa8ffa5b86fa07217c Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 10 Feb 2023 18:06:16 +0100 Subject: [PATCH 4/6] ice: Robustify cleaning/completing XDP Tx buffers When queueing frames from a Page Pool for redirecting to a device backed by the ice driver, `perf top` shows heavy load on page_alloc() and page_frag_free(), despite that on a properly working system it must be fully or at least almost zero-alloc. The problem is in fact a bit deeper and raises from how ice cleans up completed Tx buffers. The story so far: when cleaning/freeing the resources related to a particular completed Tx frame (skbs, DMA mappings etc.), ice uses some heuristics only without setting any type explicitly (except for dummy Flow Director packets, which are marked via ice_tx_buf::tx_flags). This kinda works, but only up to some point. For example, currently ice assumes that each frame coming to __ice_xmit_xdp_ring(), is backed by either plain order-0 page or plain page frag, while it may also be backed by Page Pool or any other possible memory models introduced in future. This means any &xdp_frame must be freed properly via xdp_return_frame() family with no assumptions. In order to do that, the whole heuristics must be replaced with setting the Tx buffer/frame type explicitly, just how it's always been done via an enum. Let us reuse 16 bits from ::tx_flags -- 1 bit-and instr won't hurt much -- especially given that sometimes there was a check for %ICE_TX_FLAGS_DUMMY_PKT, which is now turned from a flag to an enum member. The rest of the changes is straightforward and most of it is just a conversion to rely now on the type set in &ice_tx_buf rather than to some secondary properties. For now, no functional changes intended, the change only prepares the ground for starting freeing XDP frames properly next step. And it must be done atomically/synchronously to not break stuff. Signed-off-by: Alexander Lobakin Signed-off-by: Daniel Borkmann Acked-by: Maciej Fijalkowski Link: https://lore.kernel.org/bpf/20230210170618.1973430-5-alexandr.lobakin@intel.com --- drivers/net/ethernet/intel/ice/ice_txrx.c | 38 +++++++++---------- drivers/net/ethernet/intel/ice/ice_txrx.h | 34 ++++++++++++----- drivers/net/ethernet/intel/ice/ice_txrx_lib.c | 15 ++++++-- drivers/net/ethernet/intel/ice/ice_xsk.c | 12 +++--- 4 files changed, 63 insertions(+), 36 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 6b99adb695e7e..d7e8a3f81e20a 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -85,7 +85,7 @@ ice_prgm_fdir_fltr(struct ice_vsi *vsi, struct ice_fltr_desc *fdir_desc, td_cmd = ICE_TXD_LAST_DESC_CMD | ICE_TX_DESC_CMD_DUMMY | ICE_TX_DESC_CMD_RE; - tx_buf->tx_flags = ICE_TX_FLAGS_DUMMY_PKT; + tx_buf->type = ICE_TX_BUF_DUMMY; tx_buf->raw_buf = raw_packet; tx_desc->cmd_type_offset_bsz = @@ -112,28 +112,26 @@ ice_prgm_fdir_fltr(struct ice_vsi *vsi, struct ice_fltr_desc *fdir_desc, static void ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct ice_tx_buf *tx_buf) { - if (tx_buf->skb) { - if (tx_buf->tx_flags & ICE_TX_FLAGS_DUMMY_PKT) { - devm_kfree(ring->dev, tx_buf->raw_buf); - } else if (ice_ring_is_xdp(ring)) { - page_frag_free(tx_buf->raw_buf); - } else { - dev_kfree_skb_any(tx_buf->skb); - } - if (dma_unmap_len(tx_buf, len)) - dma_unmap_single(ring->dev, - dma_unmap_addr(tx_buf, dma), - dma_unmap_len(tx_buf, len), - DMA_TO_DEVICE); - } else if (dma_unmap_len(tx_buf, len)) { + if (dma_unmap_len(tx_buf, len)) dma_unmap_page(ring->dev, dma_unmap_addr(tx_buf, dma), dma_unmap_len(tx_buf, len), DMA_TO_DEVICE); + + switch (tx_buf->type) { + case ICE_TX_BUF_DUMMY: + devm_kfree(ring->dev, tx_buf->raw_buf); + break; + case ICE_TX_BUF_SKB: + dev_kfree_skb_any(tx_buf->skb); + break; + case ICE_TX_BUF_XDP_TX: + page_frag_free(tx_buf->raw_buf); + break; } tx_buf->next_to_watch = NULL; - tx_buf->skb = NULL; + tx_buf->type = ICE_TX_BUF_EMPTY; dma_unmap_len_set(tx_buf, len, 0); /* tx_buf must be completely set up in the transmit path */ } @@ -266,7 +264,7 @@ static bool ice_clean_tx_irq(struct ice_tx_ring *tx_ring, int napi_budget) DMA_TO_DEVICE); /* clear tx_buf data */ - tx_buf->skb = NULL; + tx_buf->type = ICE_TX_BUF_EMPTY; dma_unmap_len_set(tx_buf, len, 0); /* unmap remaining buffers */ @@ -1709,6 +1707,7 @@ ice_tx_map(struct ice_tx_ring *tx_ring, struct ice_tx_buf *first, DMA_TO_DEVICE); tx_buf = &tx_ring->tx_buf[i]; + tx_buf->type = ICE_TX_BUF_FRAG; } /* record SW timestamp if HW timestamp is not available */ @@ -2352,6 +2351,7 @@ ice_xmit_frame_ring(struct sk_buff *skb, struct ice_tx_ring *tx_ring) /* record the location of the first descriptor for this packet */ first = &tx_ring->tx_buf[tx_ring->next_to_use]; first->skb = skb; + first->type = ICE_TX_BUF_SKB; first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN); first->gso_segs = 1; first->tx_flags = 0; @@ -2524,11 +2524,11 @@ void ice_clean_ctrl_tx_irq(struct ice_tx_ring *tx_ring) dma_unmap_addr(tx_buf, dma), dma_unmap_len(tx_buf, len), DMA_TO_DEVICE); - if (tx_buf->tx_flags & ICE_TX_FLAGS_DUMMY_PKT) + if (tx_buf->type == ICE_TX_BUF_DUMMY) devm_kfree(tx_ring->dev, tx_buf->raw_buf); /* clear next_to_watch to prevent false hangs */ - tx_buf->raw_buf = NULL; + tx_buf->type = ICE_TX_BUF_EMPTY; tx_buf->tx_flags = 0; tx_buf->next_to_watch = NULL; dma_unmap_len_set(tx_buf, len, 0); diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h index efa3d378f19e2..18d8ba0396e89 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx.h @@ -121,10 +121,7 @@ static inline int ice_skb_pad(void) #define ICE_TX_FLAGS_TSO BIT(0) #define ICE_TX_FLAGS_HW_VLAN BIT(1) #define ICE_TX_FLAGS_SW_VLAN BIT(2) -/* ICE_TX_FLAGS_DUMMY_PKT is used to mark dummy packets that should be - * freed instead of returned like skb packets. - */ -#define ICE_TX_FLAGS_DUMMY_PKT BIT(3) +/* Free, was ICE_TX_FLAGS_DUMMY_PKT */ #define ICE_TX_FLAGS_TSYN BIT(4) #define ICE_TX_FLAGS_IPV4 BIT(5) #define ICE_TX_FLAGS_IPV6 BIT(6) @@ -149,22 +146,41 @@ static inline int ice_skb_pad(void) #define ICE_TXD_LAST_DESC_CMD (ICE_TX_DESC_CMD_EOP | ICE_TX_DESC_CMD_RS) +/** + * enum ice_tx_buf_type - type of &ice_tx_buf to act on Tx completion + * @ICE_TX_BUF_EMPTY: unused OR XSk frame, no action required + * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree() + * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA + * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats + * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats + * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats + */ +enum ice_tx_buf_type { + ICE_TX_BUF_EMPTY = 0U, + ICE_TX_BUF_DUMMY, + ICE_TX_BUF_FRAG, + ICE_TX_BUF_SKB, + ICE_TX_BUF_XDP_TX, + ICE_TX_BUF_XSK_TX, +}; + struct ice_tx_buf { union { struct ice_tx_desc *next_to_watch; u32 rs_idx; }; union { - struct sk_buff *skb; - void *raw_buf; /* used for XDP */ - struct xdp_buff *xdp; /* used for XDP_TX ZC */ + void *raw_buf; /* used for XDP_TX and FDir rules */ + struct sk_buff *skb; /* used for .ndo_start_xmit() */ + struct xdp_buff *xdp; /* used for XDP_TX ZC */ }; unsigned int bytecount; union { unsigned int gso_segs; - unsigned int nr_frags; /* used for mbuf XDP */ + unsigned int nr_frags; /* used for mbuf XDP */ }; - u32 tx_flags; + u32 type:16; /* &ice_tx_buf_type */ + u32 tx_flags:16; DEFINE_DMA_UNMAP_LEN(len); DEFINE_DMA_UNMAP_ADDR(dma); }; diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c index 6371acb0deb08..23ac4824e9747 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c @@ -231,8 +231,14 @@ ice_clean_xdp_tx_buf(struct ice_tx_ring *xdp_ring, struct ice_tx_buf *tx_buf) dma_unmap_single(xdp_ring->dev, dma_unmap_addr(tx_buf, dma), dma_unmap_len(tx_buf, len), DMA_TO_DEVICE); dma_unmap_len_set(tx_buf, len, 0); - page_frag_free(tx_buf->raw_buf); - tx_buf->raw_buf = NULL; + + switch (tx_buf->type) { + case ICE_TX_BUF_XDP_TX: + page_frag_free(tx_buf->raw_buf); + break; + } + + tx_buf->type = ICE_TX_BUF_EMPTY; } /** @@ -266,6 +272,7 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring) while (ready_frames) { struct ice_tx_buf *tx_buf = &xdp_ring->tx_buf[ntc]; + struct ice_tx_buf *head = tx_buf; /* bytecount holds size of head + frags */ total_bytes += tx_buf->bytecount; @@ -275,7 +282,6 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring) ready_frames -= frags + 1; xdp_tx++; - ice_clean_xdp_tx_buf(xdp_ring, tx_buf); ntc++; if (ntc == cnt) ntc = 0; @@ -288,6 +294,8 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring) if (ntc == cnt) ntc = 0; } + + ice_clean_xdp_tx_buf(xdp_ring, head); } tx_desc->cmd_type_offset_bsz = 0; @@ -349,6 +357,7 @@ int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring) tx_desc->buf_addr = cpu_to_le64(dma); tx_desc->cmd_type_offset_bsz = ice_build_ctob(0, 0, size, 0); + tx_buf->type = ICE_TX_BUF_XDP_TX; tx_buf->raw_buf = data; ntu++; diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index a25a68c69f222..917c75e530caa 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -631,7 +631,8 @@ static void ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring) for (i = 0; i < xsk_frames; i++) { tx_buf = &xdp_ring->tx_buf[ntc]; - if (tx_buf->xdp) { + if (tx_buf->type == ICE_TX_BUF_XSK_TX) { + tx_buf->type = ICE_TX_BUF_EMPTY; xsk_buff_free(tx_buf->xdp); xdp_ring->xdp_tx_active--; } else { @@ -685,6 +686,7 @@ static int ice_xmit_xdp_tx_zc(struct xdp_buff *xdp, tx_buf = &xdp_ring->tx_buf[ntu]; tx_buf->xdp = xdp; + tx_buf->type = ICE_TX_BUF_XSK_TX; tx_desc = ICE_TX_DESC(xdp_ring, ntu); tx_desc->buf_addr = cpu_to_le64(dma); tx_desc->cmd_type_offset_bsz = ice_build_ctob(ICE_TX_DESC_CMD_EOP, @@ -1083,12 +1085,12 @@ void ice_xsk_clean_xdp_ring(struct ice_tx_ring *xdp_ring) while (ntc != ntu) { struct ice_tx_buf *tx_buf = &xdp_ring->tx_buf[ntc]; - if (tx_buf->xdp) + if (tx_buf->type == ICE_TX_BUF_XSK_TX) { + tx_buf->type = ICE_TX_BUF_EMPTY; xsk_buff_free(tx_buf->xdp); - else + } else { xsk_frames++; - - tx_buf->raw_buf = NULL; + } ntc++; if (ntc >= xdp_ring->count) From 055d0920685e53ed8c2ad914888724d69dca17c6 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 10 Feb 2023 18:06:17 +0100 Subject: [PATCH 5/6] ice: Fix freeing XDP frames backed by Page Pool As already mentioned, freeing any &xdp_frame via page_frag_free() is wrong, as it assumes the frame is backed by either an order-0 page or a page with no "patrons" behind them, while in fact frames backed by Page Pool can be redirected to a device, which's driver doesn't use it. Keep storing a pointer to the raw buffer and then freeing it unconditionally via page_frag_free() for %XDP_TX frames, but introduce a separate type in the enum for frames coming through .ndo_xdp_xmit(), and free them via xdp_return_frame_bulk(). Note that saving xdpf as xdp_buff->data_hard_start is intentional and is always true when everything is configured properly. After this change, %XDP_REDIRECT from a Page Pool based driver to ice becomes zero-alloc as it should be and horrendous 3.3 Mpps / queue turn into 6.6, hehe. Let it go with no "Fixes:" tag as it spans across good 5+ commits and can't be trivially backported. Signed-off-by: Alexander Lobakin Signed-off-by: Daniel Borkmann Acked-by: Maciej Fijalkowski Link: https://lore.kernel.org/bpf/20230210170618.1973430-6-alexandr.lobakin@intel.com --- drivers/net/ethernet/intel/ice/ice_txrx.c | 5 ++- drivers/net/ethernet/intel/ice/ice_txrx.h | 3 ++ drivers/net/ethernet/intel/ice/ice_txrx_lib.c | 43 +++++++++++++++---- drivers/net/ethernet/intel/ice/ice_txrx_lib.h | 3 +- 4 files changed, 43 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index d7e8a3f81e20a..e451276a37b6e 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -128,6 +128,9 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct ice_tx_buf *tx_buf) case ICE_TX_BUF_XDP_TX: page_frag_free(tx_buf->raw_buf); break; + case ICE_TX_BUF_XDP_XMIT: + xdp_return_frame(tx_buf->xdpf); + break; } tx_buf->next_to_watch = NULL; @@ -575,7 +578,7 @@ ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, case XDP_TX: if (static_branch_unlikely(&ice_xdp_locking_key)) spin_lock(&xdp_ring->tx_lock); - ret = __ice_xmit_xdp_ring(xdp, xdp_ring); + ret = __ice_xmit_xdp_ring(xdp, xdp_ring, false); if (static_branch_unlikely(&ice_xdp_locking_key)) spin_unlock(&xdp_ring->tx_lock); if (ret == ICE_XDP_CONSUMED) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h index 18d8ba0396e89..fff0efe28373a 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx.h @@ -153,6 +153,7 @@ static inline int ice_skb_pad(void) * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats + * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats */ enum ice_tx_buf_type { @@ -161,6 +162,7 @@ enum ice_tx_buf_type { ICE_TX_BUF_FRAG, ICE_TX_BUF_SKB, ICE_TX_BUF_XDP_TX, + ICE_TX_BUF_XDP_XMIT, ICE_TX_BUF_XSK_TX, }; @@ -172,6 +174,7 @@ struct ice_tx_buf { union { void *raw_buf; /* used for XDP_TX and FDir rules */ struct sk_buff *skb; /* used for .ndo_start_xmit() */ + struct xdp_frame *xdpf; /* used for .ndo_xdp_xmit() */ struct xdp_buff *xdp; /* used for XDP_TX ZC */ }; unsigned int bytecount; diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c index 23ac4824e9747..6d98c34d99fc8 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c @@ -222,13 +222,15 @@ ice_receive_skb(struct ice_rx_ring *rx_ring, struct sk_buff *skb, u16 vlan_tag) /** * ice_clean_xdp_tx_buf - Free and unmap XDP Tx buffer - * @xdp_ring: XDP Tx ring + * @dev: device for DMA mapping * @tx_buf: Tx buffer to clean + * @bq: XDP bulk flush struct */ static void -ice_clean_xdp_tx_buf(struct ice_tx_ring *xdp_ring, struct ice_tx_buf *tx_buf) +ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf *tx_buf, + struct xdp_frame_bulk *bq) { - dma_unmap_single(xdp_ring->dev, dma_unmap_addr(tx_buf, dma), + dma_unmap_single(dev, dma_unmap_addr(tx_buf, dma), dma_unmap_len(tx_buf, len), DMA_TO_DEVICE); dma_unmap_len_set(tx_buf, len, 0); @@ -236,6 +238,9 @@ ice_clean_xdp_tx_buf(struct ice_tx_ring *xdp_ring, struct ice_tx_buf *tx_buf) case ICE_TX_BUF_XDP_TX: page_frag_free(tx_buf->raw_buf); break; + case ICE_TX_BUF_XDP_XMIT: + xdp_return_frame_bulk(tx_buf->xdpf, bq); + break; } tx_buf->type = ICE_TX_BUF_EMPTY; @@ -248,9 +253,11 @@ ice_clean_xdp_tx_buf(struct ice_tx_ring *xdp_ring, struct ice_tx_buf *tx_buf) static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring) { int total_bytes = 0, total_pkts = 0; + struct device *dev = xdp_ring->dev; u32 ntc = xdp_ring->next_to_clean; struct ice_tx_desc *tx_desc; u32 cnt = xdp_ring->count; + struct xdp_frame_bulk bq; u32 frags, xdp_tx = 0; u32 ready_frames = 0; u32 idx; @@ -270,6 +277,9 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring) return 0; ret = ready_frames; + xdp_frame_bulk_init(&bq); + rcu_read_lock(); /* xdp_return_frame_bulk() */ + while (ready_frames) { struct ice_tx_buf *tx_buf = &xdp_ring->tx_buf[ntc]; struct ice_tx_buf *head = tx_buf; @@ -289,15 +299,18 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring) for (int i = 0; i < frags; i++) { tx_buf = &xdp_ring->tx_buf[ntc]; - ice_clean_xdp_tx_buf(xdp_ring, tx_buf); + ice_clean_xdp_tx_buf(dev, tx_buf, &bq); ntc++; if (ntc == cnt) ntc = 0; } - ice_clean_xdp_tx_buf(xdp_ring, head); + ice_clean_xdp_tx_buf(dev, head, &bq); } + xdp_flush_frame_bulk(&bq); + rcu_read_unlock(); + tx_desc->cmd_type_offset_bsz = 0; xdp_ring->next_to_clean = ntc; xdp_ring->xdp_tx_active -= xdp_tx; @@ -310,8 +323,10 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring) * __ice_xmit_xdp_ring - submit frame to XDP ring for transmission * @xdp: XDP buffer to be placed onto Tx descriptors * @xdp_ring: XDP ring for transmission + * @frame: whether this comes from .ndo_xdp_xmit() */ -int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring) +int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring, + bool frame) { struct skb_shared_info *sinfo = NULL; u32 size = xdp->data_end - xdp->data; @@ -355,10 +370,15 @@ int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring) dma_unmap_len_set(tx_buf, len, size); dma_unmap_addr_set(tx_buf, dma, dma); + if (frame) { + tx_buf->type = ICE_TX_BUF_FRAG; + } else { + tx_buf->type = ICE_TX_BUF_XDP_TX; + tx_buf->raw_buf = data; + } + tx_desc->buf_addr = cpu_to_le64(dma); tx_desc->cmd_type_offset_bsz = ice_build_ctob(0, 0, size, 0); - tx_buf->type = ICE_TX_BUF_XDP_TX; - tx_buf->raw_buf = data; ntu++; if (ntu == cnt) @@ -379,6 +399,11 @@ int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring) tx_head->bytecount = xdp_get_buff_len(xdp); tx_head->nr_frags = nr_frags; + if (frame) { + tx_head->type = ICE_TX_BUF_XDP_XMIT; + tx_head->xdpf = xdp->data_hard_start; + } + /* update last descriptor from a frame with EOP */ tx_desc->cmd_type_offset_bsz |= cpu_to_le64(ICE_TX_DESC_CMD_EOP << ICE_TXD_QW1_CMD_S); @@ -419,7 +444,7 @@ int ice_xmit_xdp_ring(struct xdp_frame *xdpf, struct ice_tx_ring *xdp_ring) struct xdp_buff xdp; xdp_convert_frame_to_buff(xdpf, &xdp); - return __ice_xmit_xdp_ring(&xdp, xdp_ring); + return __ice_xmit_xdp_ring(&xdp, xdp_ring, true); } /** diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h index ea977f283c22c..79efc20c46d9e 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h @@ -143,7 +143,8 @@ static inline u32 ice_set_rs_bit(const struct ice_tx_ring *xdp_ring) void ice_finalize_xdp_rx(struct ice_tx_ring *xdp_ring, unsigned int xdp_res, u32 first_idx); int ice_xmit_xdp_buff(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring); int ice_xmit_xdp_ring(struct xdp_frame *xdpf, struct ice_tx_ring *xdp_ring); -int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring); +int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring, + bool frame); void ice_release_rx_desc(struct ice_rx_ring *rx_ring, u16 val); void ice_process_skb_fields(struct ice_rx_ring *rx_ring, From ad07f29b9c9a29eba04b19116c6db51387a638d7 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 10 Feb 2023 18:06:18 +0100 Subject: [PATCH 6/6] ice: Micro-optimize .ndo_xdp_xmit() path After the recent mbuf changes, ice_xmit_xdp_ring() became a 3-liner. It makes no sense to keep it global in a different file than its caller. Move it just next to the sole call site and mark static. Also, it doesn't need a full xdp_convert_frame_to_buff(). Save several cycles and fill only the fields used by __ice_xmit_xdp_ring() later on. Finally, since it doesn't modify @xdpf anyhow, mark the argument const to save some more (whole -11 bytes of .text! :D). Thanks to 1 jump less and less calcs as well, this yields as many as 6.7 Mpps per queue. `xdp.data_hard_start = xdpf` is fully intentional again (see xdp_convert_buff_to_frame()) and just works when there are no source device's driver issues. Signed-off-by: Alexander Lobakin Signed-off-by: Daniel Borkmann Acked-by: Maciej Fijalkowski Link: https://lore.kernel.org/bpf/20230210170618.1973430-7-alexandr.lobakin@intel.com --- drivers/net/ethernet/intel/ice/ice_txrx.c | 21 ++++++++++++++++++- drivers/net/ethernet/intel/ice/ice_txrx_lib.c | 13 ------------ drivers/net/ethernet/intel/ice/ice_txrx_lib.h | 1 - 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index e451276a37b6e..aaf313a953684 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -605,6 +605,25 @@ ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, ice_set_rx_bufs_act(xdp, rx_ring, ret); } +/** + * ice_xmit_xdp_ring - submit frame to XDP ring for transmission + * @xdpf: XDP frame that will be converted to XDP buff + * @xdp_ring: XDP ring for transmission + */ +static int ice_xmit_xdp_ring(const struct xdp_frame *xdpf, + struct ice_tx_ring *xdp_ring) +{ + struct xdp_buff xdp; + + xdp.data_hard_start = (void *)xdpf; + xdp.data = xdpf->data; + xdp.data_end = xdp.data + xdpf->len; + xdp.frame_sz = xdpf->frame_sz; + xdp.flags = xdpf->flags; + + return __ice_xmit_xdp_ring(&xdp, xdp_ring, true); +} + /** * ice_xdp_xmit - submit packets to XDP ring for transmission * @dev: netdev @@ -650,7 +669,7 @@ ice_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, tx_buf = &xdp_ring->tx_buf[xdp_ring->next_to_use]; for (i = 0; i < n; i++) { - struct xdp_frame *xdpf = frames[i]; + const struct xdp_frame *xdpf = frames[i]; int err; err = ice_xmit_xdp_ring(xdpf, xdp_ring); diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c index 6d98c34d99fc8..7bc5aa340c7df 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c @@ -434,19 +434,6 @@ int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring, return ICE_XDP_CONSUMED; } -/** - * ice_xmit_xdp_ring - submit frame to XDP ring for transmission - * @xdpf: XDP frame that will be converted to XDP buff - * @xdp_ring: XDP ring for transmission - */ -int ice_xmit_xdp_ring(struct xdp_frame *xdpf, struct ice_tx_ring *xdp_ring) -{ - struct xdp_buff xdp; - - xdp_convert_frame_to_buff(xdpf, &xdp); - return __ice_xmit_xdp_ring(&xdp, xdp_ring, true); -} - /** * ice_finalize_xdp_rx - Bump XDP Tx tail and/or flush redirect map * @xdp_ring: XDP ring diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h index 79efc20c46d9e..115969ecdf7b9 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h @@ -142,7 +142,6 @@ static inline u32 ice_set_rs_bit(const struct ice_tx_ring *xdp_ring) void ice_finalize_xdp_rx(struct ice_tx_ring *xdp_ring, unsigned int xdp_res, u32 first_idx); int ice_xmit_xdp_buff(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring); -int ice_xmit_xdp_ring(struct xdp_frame *xdpf, struct ice_tx_ring *xdp_ring); int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring, bool frame); void ice_release_rx_desc(struct ice_rx_ring *rx_ring, u16 val);