From e9e49ae88ec86c370aae56700c9ff7421dcf91fe Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 17 Apr 2021 00:22:16 +0300 Subject: [PATCH 01/10] net: enetc: remove redundant clearing of skb/xdp_frame pointer in TX conf path Later in enetc_clean_tx_ring we have: /* Scrub the swbd here so we don't have to do that * when we reuse it during xmit */ memset(tx_swbd, 0, sizeof(*tx_swbd)); So these assignments are unnecessary. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 9a726085841d2..c7f3c6e691a17 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -544,7 +544,6 @@ static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget) if (xdp_frame) { xdp_return_frame(xdp_frame); - tx_swbd->xdp_frame = NULL; } else if (skb) { if (unlikely(tx_swbd->skb->cb[0] & ENETC_F_TX_ONESTEP_SYNC_TSTAMP)) { @@ -558,7 +557,6 @@ static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget) do_twostep_tstamp = false; } napi_consume_skb(skb, napi_budget); - tx_swbd->skb = NULL; } tx_byte_cnt += tx_swbd->len; From 6b04830d5e0d7cbe137310527e9ad114686edef7 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 17 Apr 2021 00:22:17 +0300 Subject: [PATCH 02/10] net: enetc: rename the buffer reuse helpers enetc_put_xdp_buff has nothing to do with XDP, frankly, it is just a helper to populate the recycle end of the shadow RX BD ring (next_to_alloc) with a given buffer. On the other hand, enetc_put_rx_buff plays more tricks than its name would suggest. So let's rename enetc_put_rx_buff into enetc_flip_rx_buff to reflect the half-page buffer reuse tricks that it employs, and enetc_put_xdp_buff into enetc_put_rx_buff which suggests a more garden-variety operation. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 54 +++++++++----------- 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index c7f3c6e691a17..c4ff090f29ec6 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -751,27 +751,35 @@ static struct enetc_rx_swbd *enetc_get_rx_buff(struct enetc_bdr *rx_ring, return rx_swbd; } +/* Reuse the current page without performing half-page buffer flipping */ static void enetc_put_rx_buff(struct enetc_bdr *rx_ring, struct enetc_rx_swbd *rx_swbd) { - if (likely(enetc_page_reusable(rx_swbd->page))) { - size_t buffer_size = ENETC_RXB_TRUESIZE - rx_ring->buffer_offset; + size_t buffer_size = ENETC_RXB_TRUESIZE - rx_ring->buffer_offset; + + enetc_reuse_page(rx_ring, rx_swbd); + dma_sync_single_range_for_device(rx_ring->dev, rx_swbd->dma, + rx_swbd->page_offset, + buffer_size, rx_swbd->dir); + + rx_swbd->page = NULL; +} + +/* Reuse the current page by performing half-page buffer flipping */ +static void enetc_flip_rx_buff(struct enetc_bdr *rx_ring, + struct enetc_rx_swbd *rx_swbd) +{ + if (likely(enetc_page_reusable(rx_swbd->page))) { rx_swbd->page_offset ^= ENETC_RXB_TRUESIZE; page_ref_inc(rx_swbd->page); - enetc_reuse_page(rx_ring, rx_swbd); - - /* sync for use by the device */ - dma_sync_single_range_for_device(rx_ring->dev, rx_swbd->dma, - rx_swbd->page_offset, - buffer_size, rx_swbd->dir); + enetc_put_rx_buff(rx_ring, rx_swbd); } else { dma_unmap_page(rx_ring->dev, rx_swbd->dma, PAGE_SIZE, rx_swbd->dir); + rx_swbd->page = NULL; } - - rx_swbd->page = NULL; } static struct sk_buff *enetc_map_rx_buff_to_skb(struct enetc_bdr *rx_ring, @@ -791,7 +799,7 @@ static struct sk_buff *enetc_map_rx_buff_to_skb(struct enetc_bdr *rx_ring, skb_reserve(skb, rx_ring->buffer_offset); __skb_put(skb, size); - enetc_put_rx_buff(rx_ring, rx_swbd); + enetc_flip_rx_buff(rx_ring, rx_swbd); return skb; } @@ -804,7 +812,7 @@ static void enetc_add_rx_buff_to_skb(struct enetc_bdr *rx_ring, int i, skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_swbd->page, rx_swbd->page_offset, size, ENETC_RXB_TRUESIZE); - enetc_put_rx_buff(rx_ring, rx_swbd); + enetc_flip_rx_buff(rx_ring, rx_swbd); } static bool enetc_check_bd_errors_and_consume(struct enetc_bdr *rx_ring, @@ -1142,20 +1150,6 @@ static void enetc_build_xdp_buff(struct enetc_bdr *rx_ring, u32 bd_status, } } -/* Reuse the current page without performing half-page buffer flipping */ -static void enetc_put_xdp_buff(struct enetc_bdr *rx_ring, - struct enetc_rx_swbd *rx_swbd) -{ - enetc_reuse_page(rx_ring, rx_swbd); - - dma_sync_single_range_for_device(rx_ring->dev, rx_swbd->dma, - rx_swbd->page_offset, - ENETC_RXB_DMA_SIZE_XDP, - rx_swbd->dir); - - rx_swbd->page = NULL; -} - /* Convert RX buffer descriptors to TX buffer descriptors. These will be * recycled back into the RX ring in enetc_clean_tx_ring. We need to scrub the * RX software BDs because the ownership of the buffer no longer belongs to the @@ -1194,8 +1188,8 @@ static void enetc_xdp_drop(struct enetc_bdr *rx_ring, int rx_ring_first, int rx_ring_last) { while (rx_ring_first != rx_ring_last) { - enetc_put_xdp_buff(rx_ring, - &rx_ring->rx_swbd[rx_ring_first]); + enetc_put_rx_buff(rx_ring, + &rx_ring->rx_swbd[rx_ring_first]); enetc_bdr_idx_inc(rx_ring, &rx_ring_first); } rx_ring->stats.xdp_drops++; @@ -1316,8 +1310,8 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, tmp_orig_i = orig_i; while (orig_i != i) { - enetc_put_rx_buff(rx_ring, - &rx_ring->rx_swbd[orig_i]); + enetc_flip_rx_buff(rx_ring, + &rx_ring->rx_swbd[orig_i]); enetc_bdr_idx_inc(rx_ring, &orig_i); } From 672f9a21989e56c8233d1d8daab3e5eecf76c59e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 17 Apr 2021 00:22:18 +0300 Subject: [PATCH 03/10] net: enetc: recycle buffers for frames with RX errors When receiving a frame with errors, currently we do nothing with it (we don't construct an skb or an xdp_buff), we just exit the NAPI poll loop. Let's put the buffer back into the RX ring (similar to XDP_DROP). Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index c4ff090f29ec6..c6f9844733375 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -822,12 +822,14 @@ static bool enetc_check_bd_errors_and_consume(struct enetc_bdr *rx_ring, if (likely(!(bd_status & ENETC_RXBD_LSTATUS(ENETC_RXBD_ERR_MASK)))) return false; + enetc_put_rx_buff(rx_ring, &rx_ring->rx_swbd[*i]); enetc_rxbd_next(rx_ring, rxbd, i); while (!(bd_status & ENETC_RXBD_LSTATUS_F)) { dma_rmb(); bd_status = le32_to_cpu((*rxbd)->r.lstatus); + enetc_put_rx_buff(rx_ring, &rx_ring->rx_swbd[*i]); enetc_rxbd_next(rx_ring, rxbd, i); } From 8f50d8bb3f1c173492d1d224bce99486fd6ccd32 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 17 Apr 2021 00:22:19 +0300 Subject: [PATCH 04/10] net: enetc: stop XDP NAPI processing when build_skb() fails When the code path below fails: enetc_clean_rx_ring_xdp // XDP_PASS -> enetc_build_skb -> enetc_map_rx_buff_to_skb -> build_skb enetc_clean_rx_ring_xdp will 'break', but that 'break' instruction isn't strong enough to actually break the NAPI poll loop, just the switch/case statement for XDP actions. So we increment rx_frm_cnt and go to the next frames minding our own business. Instead let's do what the skb NAPI poll function does, and break the loop now, waiting for the memory pressure to go away. Otherwise the next calls to build_skb() are likely to fail too. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index c6f9844733375..469170076efa1 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -1275,8 +1275,7 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, &i, &cleaned_cnt, ENETC_RXB_DMA_SIZE_XDP); if (unlikely(!skb)) - /* Exit the switch/case, not the loop */ - break; + goto out; napi_gro_receive(napi, skb); break; @@ -1338,6 +1337,7 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, rx_frm_cnt++; } +out: rx_ring->next_to_clean = i; rx_ring->stats.packets += rx_frm_cnt; From a6369fe6e07d7e45aa5e73eccc6d426e92525e5c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 17 Apr 2021 00:22:20 +0300 Subject: [PATCH 05/10] net: enetc: remove unneeded xdp_do_flush_map() xdp_do_redirect already contains: -> dev_map_enqueue -> __xdp_enqueue -> bq_enqueue -> bq_xmit_all // if we have more than 16 frames So the logic from enetc will never be hit, because ENETC_DEFAULT_TX_WORK is 128. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 469170076efa1..c7b940979314a 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -1324,11 +1324,6 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, rx_ring->stats.xdp_redirect++; } - if (unlikely(xdp_redirect_frm_cnt > ENETC_DEFAULT_TX_WORK)) { - xdp_do_flush_map(); - xdp_redirect_frm_cnt = 0; - } - break; default: bpf_warn_invalid_xdp_action(xdp_act); From ee3e875f10fca68fb7478c23c75b553e56da319c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 17 Apr 2021 00:22:21 +0300 Subject: [PATCH 06/10] net: enetc: increase TX ring size Now that commit d6a2829e82cf ("net: enetc: increase RX ring default size") has increased the RX ring size, it is quite easy to congest the TX rings when the traffic is predominantly XDP_TX, as the RX ring is quite a bit larger than the TX one. Since we bit the bullet and did the expensive thing already (larger RX rings consume more memory pages), it seems quite foolish to keep the TX rings small. So make them equally sized with TX. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h index d52717bc73c79..6f818e33e03b2 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.h +++ b/drivers/net/ethernet/freescale/enetc/enetc.h @@ -79,7 +79,7 @@ struct enetc_xdp_data { }; #define ENETC_RX_RING_DEFAULT_SIZE 2048 -#define ENETC_TX_RING_DEFAULT_SIZE 256 +#define ENETC_TX_RING_DEFAULT_SIZE 2048 #define ENETC_DEFAULT_TX_WORK (ENETC_TX_RING_DEFAULT_SIZE / 2) struct enetc_bdr { From 7eab503b11ee1c4bb4a28866a6b029b0bbbeadfe Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 17 Apr 2021 00:22:22 +0300 Subject: [PATCH 07/10] net: enetc: use dedicated TX rings for XDP It is possible for one CPU to perform TX hashing (see netdev_pick_tx) between the 8 ENETC TX rings, and the TX hashing to select TX queue 1. At the same time, it is possible for the other CPU to already use TX ring 1 for XDP (either XDP_TX or XDP_REDIRECT). Since there is no mutual exclusion between XDP and the network stack, we run into an issue because the ENETC TX procedure is not reentrant. The obvious approach would be to just make XDP take the lock of the network stack's TX queue corresponding to the ring it's about to enqueue in. For XDP_REDIRECT, this is quite straightforward, a lock at the beginning and end of enetc_xdp_xmit() should do the trick. But for XDP_TX, it's a bit more complicated. For one, we do TX batching all by ourselves for frames with the XDP_TX verdict. This is something we would like to keep the way it is, for performance reasons. But batching means that the network stack's lock should be kept from the first enqueued XDP_TX frame and until we ring the doorbell. That is mostly fine, except for cases when in the same NAPI loop we have mixed XDP_TX and XDP_REDIRECT frames. So if enetc_xdp_xmit() gets called while we are holding the lock from the RX NAPI, then bam, deadlock. The naive answer could be 'just flush the XDP_TX frames first, then release the network stack's TX queue lock, then call xdp_do_flush_map()'. But even xdp_do_redirect() is capable of flushing the batched XDP_REDIRECT frames, so unless we unlock/relock the TX queue around xdp_do_redirect(), there simply isn't any clean way to protect XDP_TX from concurrent network stack .ndo_start_xmit() on another CPU. So we need to take a different approach, and that is to reserve two rings for the sole use of XDP. We leave TX rings 0..ndev->real_num_tx_queues-1 to be handled by the network stack, and we pick them from the end of the priv->tx_ring array. We make an effort to keep the mapping done by enetc_alloc_msix() which decides which CPU handles the TX completions of which TX ring in its NAPI poll. So the XDP TX ring of CPU 0 is handled by TX ring 6, and the XDP TX ring of CPU 1 is handled by TX ring 7. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 46 +++++++++++++++++--- drivers/net/ethernet/freescale/enetc/enetc.h | 1 + 2 files changed, 40 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index c7b940979314a..56190d861bb95 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -9,6 +9,26 @@ #include #include +static int enetc_num_stack_tx_queues(struct enetc_ndev_priv *priv) +{ + int num_tx_rings = priv->num_tx_rings; + int i; + + for (i = 0; i < priv->num_rx_rings; i++) + if (priv->rx_ring[i]->xdp.prog) + return num_tx_rings - num_possible_cpus(); + + return num_tx_rings; +} + +static struct enetc_bdr *enetc_rx_ring_from_xdp_tx_ring(struct enetc_ndev_priv *priv, + struct enetc_bdr *tx_ring) +{ + int index = &priv->tx_ring[tx_ring->index] - priv->xdp_tx_ring; + + return priv->rx_ring[index]; +} + static struct sk_buff *enetc_tx_swbd_get_skb(struct enetc_tx_swbd *tx_swbd) { if (tx_swbd->is_xdp_tx || tx_swbd->is_xdp_redirect) @@ -468,7 +488,6 @@ static void enetc_recycle_xdp_tx_buff(struct enetc_bdr *tx_ring, struct enetc_tx_swbd *tx_swbd) { struct enetc_ndev_priv *priv = netdev_priv(tx_ring->ndev); - struct enetc_bdr *rx_ring = priv->rx_ring[tx_ring->index]; struct enetc_rx_swbd rx_swbd = { .dma = tx_swbd->dma, .page = tx_swbd->page, @@ -476,6 +495,9 @@ static void enetc_recycle_xdp_tx_buff(struct enetc_bdr *tx_ring, .dir = tx_swbd->dir, .len = tx_swbd->len, }; + struct enetc_bdr *rx_ring; + + rx_ring = enetc_rx_ring_from_xdp_tx_ring(priv, tx_ring); if (likely(enetc_swbd_unused(rx_ring))) { enetc_reuse_page(rx_ring, &rx_swbd); @@ -1059,7 +1081,7 @@ int enetc_xdp_xmit(struct net_device *ndev, int num_frames, int xdp_tx_bd_cnt, i, k; int xdp_tx_frm_cnt = 0; - tx_ring = priv->tx_ring[smp_processor_id()]; + tx_ring = priv->xdp_tx_ring[smp_processor_id()]; prefetchw(ENETC_TXBD(*tx_ring, tx_ring->next_to_use)); @@ -1221,8 +1243,8 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, int xdp_tx_bd_cnt, xdp_tx_frm_cnt = 0, xdp_redirect_frm_cnt = 0; struct enetc_tx_swbd xdp_tx_arr[ENETC_MAX_SKB_FRAGS] = {0}; struct enetc_ndev_priv *priv = netdev_priv(rx_ring->ndev); - struct enetc_bdr *tx_ring = priv->tx_ring[rx_ring->index]; int rx_frm_cnt = 0, rx_byte_cnt = 0; + struct enetc_bdr *tx_ring; int cleaned_cnt, i; u32 xdp_act; @@ -1280,6 +1302,7 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, napi_gro_receive(napi, skb); break; case XDP_TX: + tx_ring = priv->xdp_tx_ring[rx_ring->index]; xdp_tx_bd_cnt = enetc_rx_swbd_to_xdp_tx_swbd(xdp_tx_arr, rx_ring, orig_i, i); @@ -2022,6 +2045,7 @@ void enetc_start(struct net_device *ndev) int enetc_open(struct net_device *ndev) { struct enetc_ndev_priv *priv = netdev_priv(ndev); + int num_stack_tx_queues; int err; err = enetc_setup_irqs(priv); @@ -2040,7 +2064,9 @@ int enetc_open(struct net_device *ndev) if (err) goto err_alloc_rx; - err = netif_set_real_num_tx_queues(ndev, priv->num_tx_rings); + num_stack_tx_queues = enetc_num_stack_tx_queues(priv); + + err = netif_set_real_num_tx_queues(ndev, num_stack_tx_queues); if (err) goto err_set_queues; @@ -2113,15 +2139,17 @@ static int enetc_setup_tc_mqprio(struct net_device *ndev, void *type_data) struct enetc_ndev_priv *priv = netdev_priv(ndev); struct tc_mqprio_qopt *mqprio = type_data; struct enetc_bdr *tx_ring; + int num_stack_tx_queues; u8 num_tc; int i; + num_stack_tx_queues = enetc_num_stack_tx_queues(priv); mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; num_tc = mqprio->num_tc; if (!num_tc) { netdev_reset_tc(ndev); - netif_set_real_num_tx_queues(ndev, priv->num_tx_rings); + netif_set_real_num_tx_queues(ndev, num_stack_tx_queues); /* Reset all ring priorities to 0 */ for (i = 0; i < priv->num_tx_rings; i++) { @@ -2133,7 +2161,7 @@ static int enetc_setup_tc_mqprio(struct net_device *ndev, void *type_data) } /* Check if we have enough BD rings available to accommodate all TCs */ - if (num_tc > priv->num_tx_rings) { + if (num_tc > num_stack_tx_queues) { netdev_err(ndev, "Max %d traffic classes supported\n", priv->num_tx_rings); return -EINVAL; @@ -2421,8 +2449,9 @@ int enetc_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd) int enetc_alloc_msix(struct enetc_ndev_priv *priv) { struct pci_dev *pdev = priv->si->pdev; - int v_tx_rings; + int first_xdp_tx_ring; int i, n, err, nvec; + int v_tx_rings; nvec = ENETC_BDR_INT_BASE_IDX + priv->bdr_int_num; /* allocate MSIX for both messaging and Rx/Tx interrupts */ @@ -2497,6 +2526,9 @@ int enetc_alloc_msix(struct enetc_ndev_priv *priv) } } + first_xdp_tx_ring = priv->num_tx_rings - num_possible_cpus(); + priv->xdp_tx_ring = &priv->tx_ring[first_xdp_tx_ring]; + return 0; fail: diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h index 6f818e33e03b2..3de71669e317e 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.h +++ b/drivers/net/ethernet/freescale/enetc/enetc.h @@ -317,6 +317,7 @@ struct enetc_ndev_priv { u32 speed; /* store speed for compare update pspeed */ + struct enetc_bdr **xdp_tx_ring; struct enetc_bdr *tx_ring[16]; struct enetc_bdr *rx_ring[16]; From 975acc833c9f34d784075815a8374760d1c6358b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 17 Apr 2021 00:22:23 +0300 Subject: [PATCH 08/10] net: enetc: handle the invalid XDP action the same way as XDP_DROP When the XDP program returns an invalid action, we should free the RX buffer. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 56190d861bb95..0b84d4a748898 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -1282,6 +1282,9 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, xdp_act = bpf_prog_run_xdp(prog, &xdp_buff); switch (xdp_act) { + default: + bpf_warn_invalid_xdp_action(xdp_act); + fallthrough; case XDP_ABORTED: trace_xdp_exception(rx_ring->ndev, prog, xdp_act); fallthrough; @@ -1346,10 +1349,6 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, xdp_redirect_frm_cnt++; rx_ring->stats.xdp_redirect++; } - - break; - default: - bpf_warn_invalid_xdp_action(xdp_act); } rx_frm_cnt++; From 92ff9a6e578dc32950567efaf987328c32fefdc6 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 17 Apr 2021 00:22:24 +0300 Subject: [PATCH 09/10] net: enetc: fix buffer leaks with XDP_TX enqueue rejections If the TX ring is congested, enetc_xdp_tx() returns false for the current XDP frame (represented as an array of software BDs). This array of software TX BDs is constructed in enetc_rx_swbd_to_xdp_tx_swbd from software BDs freshly cleaned from the RX ring. The issue is that we scrub the RX software BDs too soon, more precisely before we know that we can enqueue the TX BDs successfully into the TX ring. If we can't enqueue them (and enetc_xdp_tx returns false), we call enetc_xdp_drop which attempts to recycle the buffers held by the RX software BDs. But because we scrubbed those RX BDs already, two things happen: (a) we leak their memory (b) we populate the RX software BD ring with an all-zero rx_swbd structure, which makes the buffer refill path allocate more memory. enetc_refill_rx_ring -> if (unlikely(!rx_swbd->page)) -> enetc_new_page That is a recipe for fast OOM. Fixes: 7ed2bc80074e ("net: enetc: add support for XDP_TX") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 0b84d4a748898..f0ba612d5ce33 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -1175,9 +1175,7 @@ static void enetc_build_xdp_buff(struct enetc_bdr *rx_ring, u32 bd_status, } /* Convert RX buffer descriptors to TX buffer descriptors. These will be - * recycled back into the RX ring in enetc_clean_tx_ring. We need to scrub the - * RX software BDs because the ownership of the buffer no longer belongs to the - * RX ring, so enetc_refill_rx_ring may not reuse rx_swbd->page. + * recycled back into the RX ring in enetc_clean_tx_ring. */ static int enetc_rx_swbd_to_xdp_tx_swbd(struct enetc_tx_swbd *xdp_tx_arr, struct enetc_bdr *rx_ring, @@ -1199,7 +1197,6 @@ static int enetc_rx_swbd_to_xdp_tx_swbd(struct enetc_tx_swbd *xdp_tx_arr, tx_swbd->is_dma_page = true; tx_swbd->is_xdp_tx = true; tx_swbd->is_eof = false; - memset(rx_swbd, 0, sizeof(*rx_swbd)); } /* We rely on caller providing an rx_ring_last > rx_ring_first */ @@ -1317,6 +1314,17 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, tx_ring->stats.xdp_tx += xdp_tx_bd_cnt; rx_ring->xdp.xdp_tx_in_flight += xdp_tx_bd_cnt; xdp_tx_frm_cnt++; + /* The XDP_TX enqueue was successful, so we + * need to scrub the RX software BDs because + * the ownership of the buffers no longer + * belongs to the RX ring, and we must prevent + * enetc_refill_rx_ring() from reusing + * rx_swbd->page. + */ + while (orig_i != i) { + rx_ring->rx_swbd[orig_i].page = NULL; + enetc_bdr_idx_inc(rx_ring, &orig_i); + } } break; case XDP_REDIRECT: From 24e393097171719e3a64abb8f4cc7bf8fbce2ac4 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 17 Apr 2021 00:22:25 +0300 Subject: [PATCH 10/10] net: enetc: apply the MDIO workaround for XDP_REDIRECT too Described in fd5736bf9f23 ("enetc: Workaround for MDIO register access issue") is a workaround for a hardware bug that requires a register access of the MDIO controller to never happen concurrently with a register access of a port PF. To avoid that, a mutual exclusion scheme with rwlocks was implemented - the port PF accessors are the 'read' side, and the MDIO accessors are the 'write' side. When we do XDP_REDIRECT between two ENETC interfaces, all is fine because the MDIO lock is already taken from the NAPI poll loop. But when the ingress interface is not ENETC, just the egress is, the MDIO lock is not taken, so we might access the port PF registers concurrently with MDIO, which will make the link flap due to wrong values returned from the PHY. To avoid this, let's just slap an enetc_lock_mdio/enetc_unlock_mdio at the beginning and ending of enetc_xdp_xmit. The fact that the MDIO lock is designed as a rwlock is important here, because the read side is reentrant (that is one of the main reasons why we chose it). Usually, the way we benefit of its reentrancy is by running the data path concurrently on both CPUs, but in this case, we benefit from the reentrancy by taking the lock even when the lock is already taken (and that's the situation where ENETC is both the ingress and the egress interface for XDP_REDIRECT, which was fine before and still is fine now). Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index f0ba612d5ce33..4f23829e7317a 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -1081,6 +1081,8 @@ int enetc_xdp_xmit(struct net_device *ndev, int num_frames, int xdp_tx_bd_cnt, i, k; int xdp_tx_frm_cnt = 0; + enetc_lock_mdio(); + tx_ring = priv->xdp_tx_ring[smp_processor_id()]; prefetchw(ENETC_TXBD(*tx_ring, tx_ring->next_to_use)); @@ -1109,6 +1111,8 @@ int enetc_xdp_xmit(struct net_device *ndev, int num_frames, tx_ring->stats.xdp_tx += xdp_tx_frm_cnt; + enetc_unlock_mdio(); + return xdp_tx_frm_cnt; }