Merge branch 'XDP-redirect-memory-return-API'
Jesper Dangaard Brouer says:

====================
XDP redirect memory return API

Submitted against net-next, as it contains NIC driver changes.

This patchset works towards supporting different XDP RX-ring memory
allocators, as this will be needed by the AF_XDP zero-copy mode.

The patchset uses mlx5 as the sample driver, which gets XDP_REDIRECT
RX-mode implemented, but not ndo_xdp_xmit (as that API is subject to
change throughout the patchset).

A new struct xdp_frame is introduced (modeled after the cpumap
xdp_pkt); both ndo_xdp_xmit and the new xdp_return_frame end up using
it.
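
For orientation, a sketch of the new struct's shape as defined in
include/net/xdp.h by this series (illustrative; see the patches for
the authoritative layout):

    /* struct xdp_frame is stored in the top of the packet's own
     * headroom (see convert_to_xdp_frame), so carrying a frame to
     * ndo_xdp_xmit or a remote CPU needs no extra allocation.
     */
    struct xdp_frame {
            void *data;              /* start of packet data */
            u16 len;                 /* length of packet data */
            u16 headroom;            /* headroom below this struct */
            u16 metasize;            /* XDP metadata size */
            struct xdp_mem_info mem; /* identifies the RX-ring's return
                                      * allocator; stays valid on
                                      * remote CPUs */
    };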

Support for a driver-supplied allocator is implemented, and a
refurbished version of page_pool is the first return-allocator type
introduced.  This will be an integration point for AF_XDP zero-copy.
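
As a rough sketch of the registration flow a driver performs with this
API (the helper name and parameters here are hypothetical; the mlx5
hunk in en_main.c below is the authoritative in-tree version):

    #include <linux/dma-mapping.h>
    #include <net/page_pool.h>
    #include <net/xdp.h>

    /* Hypothetical helper: create a page_pool sized to the RX ring and
     * register it as the queue's memory model, so xdp_return_frame()
     * recycles pages into the pool instead of calling put_page().
     */
    static int example_rxq_setup_mem_model(struct xdp_rxq_info *rxq,
                                           struct device *dma_dev,
                                           u32 ring_size, int nid)
    {
            struct page_pool_params pp_params = { 0 }; /* flags = 0 */
            struct page_pool *pool;

            pp_params.order = 0;             /* one page per packet */
            pp_params.pool_size = ring_size; /* match RX ring depth */
            pp_params.nid = nid;             /* NUMA node of the ring */
            pp_params.dev = dma_dev;
            pp_params.dma_dir = DMA_FROM_DEVICE;

            pool = page_pool_create(&pp_params);
            if (IS_ERR(pool))
                    return PTR_ERR(pool);

            return xdp_rxq_info_reg_mem_model(rxq, MEM_TYPE_PAGE_POOL,
                                              pool);
    }

Drivers without a private allocator instead register
MEM_TYPE_PAGE_SHARED (as the ixgbe hunk below does), which keeps the
existing page_frag_free()-style return path.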

The mlx5 driver evolves into using the page_pool, and sees a
performance increase (redirecting out the ixgbe driver via
ndo_xdp_xmit) from 6 Mpps to 12 Mpps.

The patchset stops at 16 patches (one over the limit), but more API
changes are planned, specifically extending the ndo_xdp_xmit and
xdp_return_frame APIs to support bulking, as this will address some
known limits.

V2: Updated according to Tariq's feedback
V3: Updated based on feedback from Jason Wang and Alex Duyck
V4: Updated based on feedback from Tariq and Jason
V5: Fix SPDX license, add Tariq's reviews, improve patch desc for perf test
V6: Updated based on feedback from Eric Dumazet and Alex Duyck
V7: Adapt to i40e, which got XDP_REDIRECT support in the meantime
V8:
 Updated based on feedback from the kbuild test robot, and adjusted for mlx5 changes
 page_pool is only compiled into the kernel when a driver selects the feature in Kconfig
V9:
 Remove some inline statements, let compiler decide what to inline
 Fix return value in virtio_net driver
 Adjust for mlx5 changes in-between submissions
V10:
 Minor adjust for mlx5 requested by Tariq
 Resubmit against net-next
V11: avoid leaking info stored in frame data on page reuse
====================

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
David S. Miller committed Apr 17, 2018
2 parents 897ddc2 + 6dfb970 commit 684009d
Showing 22 changed files with 1,102 additions and 198 deletions.
33 changes: 23 additions & 10 deletions drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -638,7 +638,7 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
         if (tx_buffer->tx_flags & I40E_TX_FLAGS_FD_SB)
                 kfree(tx_buffer->raw_buf);
         else if (ring_is_xdp(ring))
-                page_frag_free(tx_buffer->raw_buf);
+                xdp_return_frame(tx_buffer->xdpf);
         else
                 dev_kfree_skb_any(tx_buffer->skb);
         if (dma_unmap_len(tx_buffer, len))
@@ -841,7 +841,7 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
 
                 /* free the skb/XDP data */
                 if (ring_is_xdp(tx_ring))
-                        page_frag_free(tx_buf->raw_buf);
+                        xdp_return_frame(tx_buf->xdpf);
                 else
                         napi_consume_skb(tx_buf->skb, napi_budget);
 
@@ -2203,9 +2203,20 @@ static bool i40e_is_non_eop(struct i40e_ring *rx_ring,
 #define I40E_XDP_CONSUMED 1
 #define I40E_XDP_TX 2
 
-static int i40e_xmit_xdp_ring(struct xdp_buff *xdp,
+static int i40e_xmit_xdp_ring(struct xdp_frame *xdpf,
                               struct i40e_ring *xdp_ring);
 
+static int i40e_xmit_xdp_tx_ring(struct xdp_buff *xdp,
+                                 struct i40e_ring *xdp_ring)
+{
+        struct xdp_frame *xdpf = convert_to_xdp_frame(xdp);
+
+        if (unlikely(!xdpf))
+                return I40E_XDP_CONSUMED;
+
+        return i40e_xmit_xdp_ring(xdpf, xdp_ring);
+}
+
 /**
  * i40e_run_xdp - run an XDP program
  * @rx_ring: Rx ring being processed
@@ -2225,13 +2236,15 @@ static struct sk_buff *i40e_run_xdp(struct i40e_ring *rx_ring,
         if (!xdp_prog)
                 goto xdp_out;
 
+        prefetchw(xdp->data_hard_start); /* xdp_frame write */
+
         act = bpf_prog_run_xdp(xdp_prog, xdp);
         switch (act) {
         case XDP_PASS:
                 break;
         case XDP_TX:
                 xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index];
-                result = i40e_xmit_xdp_ring(xdp, xdp_ring);
+                result = i40e_xmit_xdp_tx_ring(xdp, xdp_ring);
                 break;
         case XDP_REDIRECT:
                 err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
@@ -3478,28 +3491,28 @@ static inline int i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
  * @xdp: data to transmit
  * @xdp_ring: XDP Tx ring
  **/
-static int i40e_xmit_xdp_ring(struct xdp_buff *xdp,
+static int i40e_xmit_xdp_ring(struct xdp_frame *xdpf,
                               struct i40e_ring *xdp_ring)
 {
-        u32 size = xdp->data_end - xdp->data;
         u16 i = xdp_ring->next_to_use;
         struct i40e_tx_buffer *tx_bi;
         struct i40e_tx_desc *tx_desc;
+        u32 size = xdpf->len;
         dma_addr_t dma;
 
         if (!unlikely(I40E_DESC_UNUSED(xdp_ring))) {
                 xdp_ring->tx_stats.tx_busy++;
                 return I40E_XDP_CONSUMED;
         }
 
-        dma = dma_map_single(xdp_ring->dev, xdp->data, size, DMA_TO_DEVICE);
+        dma = dma_map_single(xdp_ring->dev, xdpf->data, size, DMA_TO_DEVICE);
         if (dma_mapping_error(xdp_ring->dev, dma))
                 return I40E_XDP_CONSUMED;
 
         tx_bi = &xdp_ring->tx_bi[i];
         tx_bi->bytecount = size;
         tx_bi->gso_segs = 1;
-        tx_bi->raw_buf = xdp->data;
+        tx_bi->xdpf = xdpf;
 
         /* record length, and DMA address */
         dma_unmap_len_set(tx_bi, len, size);
@@ -3675,7 +3688,7 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
  *
  * Returns Zero if sent, else an error code
  **/
-int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
+int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
 {
         struct i40e_netdev_priv *np = netdev_priv(dev);
         unsigned int queue_index = smp_processor_id();
@@ -3688,7 +3701,7 @@ int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
         if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
                 return -ENXIO;
 
-        err = i40e_xmit_xdp_ring(xdp, vsi->xdp_rings[queue_index]);
+        err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
         if (err != I40E_XDP_TX)
                 return -ENOSPC;
 

3 changes: 2 additions & 1 deletion drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -306,6 +306,7 @@ static inline unsigned int i40e_txd_use_count(unsigned int size)
 struct i40e_tx_buffer {
         struct i40e_tx_desc *next_to_watch;
         union {
+                struct xdp_frame *xdpf;
                 struct sk_buff *skb;
                 void *raw_buf;
         };
@@ -510,7 +511,7 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw);
 void i40e_detect_recover_hung(struct i40e_vsi *vsi);
 int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
 bool __i40e_chk_linearize(struct sk_buff *skb);
-int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp);
+int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf);
 void i40e_xdp_flush(struct net_device *dev);
 
 /**

3 changes: 1 addition & 2 deletions drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -241,8 +241,7 @@ struct ixgbe_tx_buffer {
         unsigned long time_stamp;
         union {
                 struct sk_buff *skb;
-                /* XDP uses address ptr on irq_clean */
-                void *data;
+                struct xdp_frame *xdpf;
         };
         unsigned int bytecount;
         unsigned short gso_segs;

38 changes: 27 additions & 11 deletions drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1216,7 +1216,7 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
 
                 /* free the skb */
                 if (ring_is_xdp(tx_ring))
-                        page_frag_free(tx_buffer->data);
+                        xdp_return_frame(tx_buffer->xdpf);
                 else
                         napi_consume_skb(tx_buffer->skb, napi_budget);
 
@@ -2262,14 +2262,15 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring,
 #define IXGBE_XDP_TX 2
 
 static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
-                               struct xdp_buff *xdp);
+                               struct xdp_frame *xdpf);
 
 static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter,
                                      struct ixgbe_ring *rx_ring,
                                      struct xdp_buff *xdp)
 {
         int err, result = IXGBE_XDP_PASS;
         struct bpf_prog *xdp_prog;
+        struct xdp_frame *xdpf;
         u32 act;
 
         rcu_read_lock();
@@ -2278,12 +2279,19 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter,
         if (!xdp_prog)
                 goto xdp_out;
 
+        prefetchw(xdp->data_hard_start); /* xdp_frame write */
+
         act = bpf_prog_run_xdp(xdp_prog, xdp);
         switch (act) {
         case XDP_PASS:
                 break;
         case XDP_TX:
-                result = ixgbe_xmit_xdp_ring(adapter, xdp);
+                xdpf = convert_to_xdp_frame(xdp);
+                if (unlikely(!xdpf)) {
+                        result = IXGBE_XDP_CONSUMED;
+                        break;
+                }
+                result = ixgbe_xmit_xdp_ring(adapter, xdpf);
                 break;
         case XDP_REDIRECT:
                 err = xdp_do_redirect(adapter->netdev, xdp, xdp_prog);
@@ -5797,7 +5805,7 @@ static void ixgbe_clean_tx_ring(struct ixgbe_ring *tx_ring)
 
                 /* Free all the Tx ring sk_buffs */
                 if (ring_is_xdp(tx_ring))
-                        page_frag_free(tx_buffer->data);
+                        xdp_return_frame(tx_buffer->xdpf);
                 else
                         dev_kfree_skb_any(tx_buffer->skb);
 
@@ -6370,7 +6378,7 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter,
         struct device *dev = rx_ring->dev;
         int orig_node = dev_to_node(dev);
         int ring_node = -1;
-        int size;
+        int size, err;
 
         size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count;
 
@@ -6407,6 +6415,13 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter,
                              rx_ring->queue_index) < 0)
                 goto err;
 
+        err = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq,
+                                         MEM_TYPE_PAGE_SHARED, NULL);
+        if (err) {
+                xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
+                goto err;
+        }
+
         rx_ring->xdp_prog = adapter->xdp_prog;
 
         return 0;
@@ -8336,7 +8351,7 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb,
 }
 
 static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
-                               struct xdp_buff *xdp)
+                               struct xdp_frame *xdpf)
 {
         struct ixgbe_ring *ring = adapter->xdp_ring[smp_processor_id()];
         struct ixgbe_tx_buffer *tx_buffer;
@@ -8345,12 +8360,12 @@ static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
         dma_addr_t dma;
         u16 i;
 
-        len = xdp->data_end - xdp->data;
+        len = xdpf->len;
 
         if (unlikely(!ixgbe_desc_unused(ring)))
                 return IXGBE_XDP_CONSUMED;
 
-        dma = dma_map_single(ring->dev, xdp->data, len, DMA_TO_DEVICE);
+        dma = dma_map_single(ring->dev, xdpf->data, len, DMA_TO_DEVICE);
         if (dma_mapping_error(ring->dev, dma))
                 return IXGBE_XDP_CONSUMED;
 
@@ -8365,7 +8380,8 @@ static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
 
         dma_unmap_len_set(tx_buffer, len, len);
         dma_unmap_addr_set(tx_buffer, dma, dma);
-        tx_buffer->data = xdp->data;
+        tx_buffer->xdpf = xdpf;
+
         tx_desc->read.buffer_addr = cpu_to_le64(dma);
 
         /* put descriptor type bits */
@@ -9996,7 +10012,7 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
         }
 }
 
-static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
+static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
 {
         struct ixgbe_adapter *adapter = netdev_priv(dev);
         struct ixgbe_ring *ring;
@@ -10012,7 +10028,7 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
         if (unlikely(!ring))
                 return -ENXIO;
 
-        err = ixgbe_xmit_xdp_ring(adapter, xdp);
+        err = ixgbe_xmit_xdp_ring(adapter, xdpf);
         if (err != IXGBE_XDP_TX)
                 return -ENOSPC;
 

1 change: 1 addition & 0 deletions drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -30,6 +30,7 @@ config MLX5_CORE_EN
         bool "Mellanox Technologies ConnectX-4 Ethernet support"
         depends on NETDEVICES && ETHERNET && INET && PCI && MLX5_CORE
         depends on IPV6=y || IPV6=n || MLX5_CORE=m
+        select PAGE_POOL
         default n
         ---help---
           Ethernet support in Mellanox Technologies ConnectX-4 NIC.

4 changes: 4 additions & 0 deletions drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -53,6 +53,8 @@
 #include "mlx5_core.h"
 #include "en_stats.h"
 
+struct page_pool;
+
 #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v)
 
 #define MLX5E_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN)
@@ -392,6 +394,7 @@ struct mlx5e_xdpsq {
         struct {
                 struct mlx5e_dma_info *di;
                 bool doorbell;
+                bool redirect_flush;
         } db;
 
         /* read only */
@@ -533,6 +536,7 @@ struct mlx5e_rq {
         unsigned int hw_mtu;
         struct mlx5e_xdpsq xdpsq;
         DECLARE_BITMAP(flags, 8);
+        struct page_pool *page_pool;
 
         /* control */
         struct mlx5_wq_ctrl wq_ctrl;

37 changes: 36 additions & 1 deletion drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -35,6 +35,7 @@
 #include <linux/mlx5/fs.h>
 #include <net/vxlan.h>
 #include <linux/bpf.h>
+#include <net/page_pool.h>
 #include "eswitch.h"
 #include "en.h"
 #include "en_tc.h"
@@ -389,10 +390,11 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
                           struct mlx5e_rq_param *rqp,
                           struct mlx5e_rq *rq)
 {
+        struct page_pool_params pp_params = { 0 };
         struct mlx5_core_dev *mdev = c->mdev;
         void *rqc = rqp->rqc;
         void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
-        u32 byte_count;
+        u32 byte_count, pool_size;
         int npages;
         int wq_sz;
         int err;
@@ -432,9 +434,12 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
         rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
         rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params);
+        pool_size = 1 << params->log_rq_mtu_frames;
 
         switch (rq->wq_type) {
         case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+
+                pool_size = MLX5_MPWRQ_PAGES_PER_WQE << mlx5e_mpwqe_get_log_rq_size(params);
                 rq->post_wqes = mlx5e_post_rx_mpwqes;
                 rq->dealloc_wqe = mlx5e_dealloc_rx_mpwqe;
 
@@ -512,6 +517,32 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
                 rq->mkey_be = c->mkey_be;
         }
 
+        /* Create a page_pool and register it with rxq */
+        pp_params.order = rq->buff.page_order;
+        pp_params.flags = 0; /* No-internal DMA mapping in page_pool */
+        pp_params.pool_size = pool_size;
+        pp_params.nid = cpu_to_node(c->cpu);
+        pp_params.dev = c->pdev;
+        pp_params.dma_dir = rq->buff.map_dir;
+
+        /* page_pool can be used even when there is no rq->xdp_prog,
+         * given page_pool does not handle DMA mapping there is no
+         * required state to clear. And page_pool gracefully handle
+         * elevated refcnt.
+         */
+        rq->page_pool = page_pool_create(&pp_params);
+        if (IS_ERR(rq->page_pool)) {
+                if (rq->wq_type != MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ)
+                        kfree(rq->wqe.frag_info);
+                err = PTR_ERR(rq->page_pool);
+                rq->page_pool = NULL;
+                goto err_rq_wq_destroy;
+        }
+        err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
+                                         MEM_TYPE_PAGE_POOL, rq->page_pool);
+        if (err)
+                goto err_rq_wq_destroy;
+
         for (i = 0; i < wq_sz; i++) {
                 struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
 
@@ -548,6 +579,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
         if (rq->xdp_prog)
                 bpf_prog_put(rq->xdp_prog);
         xdp_rxq_info_unreg(&rq->xdp_rxq);
+        if (rq->page_pool)
+                page_pool_destroy(rq->page_pool);
         mlx5_wq_destroy(&rq->wq_ctrl);
 
         return err;
@@ -561,6 +594,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
                 bpf_prog_put(rq->xdp_prog);
 
         xdp_rxq_info_unreg(&rq->xdp_rxq);
+        if (rq->page_pool)
+                page_pool_destroy(rq->page_pool);
 
         switch (rq->wq_type) {
         case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:

(Diff truncated: the remaining 15 changed files are not shown.)