Merge branch 'aquantia-rx-perf'
Igor Russkikh says:

====================
net: aquantia: RX performance optimization patches

Here is a set of patches targeting performance improvements
on various platforms and protocols.

Our main target was RX performance on IOMMU-enabled systems, notably
the NVIDIA Jetson TX2 and NVIDIA Xavier platforms.

We introduce a page reuse strategy to better deal with IOMMU DMA mapping costs
(a simplified sketch of the reuse decision follows this message).
With it we see 80-90% page reuse under some test configurations with UDP traffic.

This shows good improvements on other systems with IOMMU hardware, like
AMD Ryzen.

We've also improved TCP LRO configuration parameters, allowing packets to better
coalesce.

Page reuse tests were carried out using iperf3, iperf2, netperf and pktgen,
mainly on UDP traffic with various packet lengths.

Jetson TX2, UDP, Default MTU:
RX Lost Datagrams
  Before: Max: 69%  Min: 68% Avg: 68.5%
  After:  Max: 41%  Min: 38% Avg: 39.2%
Maximum throughput
  Before: 1.27 Gbits/sec
  After:  2.41 Gbits/sec

AMD Ryzen 5 2400G, UDP, Default MTU:
RX Lost Datagrams
  Before:  Max: 12%  Min: 4.5% Avg: 7.17%
  After:   Max: 6.2% Min: 2.3% Avg: 4.26%
====================
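
The page reuse described above boils down to the decision added in aq_get_rxpages() (see the aq_ring.c hunk further down): if the ring is the only user of a page, rewind the offset and reuse it; if an skb still holds a reference but another full frame fits, flip to the next frame-sized slot in the same page; otherwise release the page and allocate a fresh one. Below is a minimal userspace sketch of that decision, not the driver code itself: the frame and page sizes are hard-coded, a plain int stands in for the page refcount, and rxpage_model/rx_refill_decision() are made-up names for illustration.

#include <stdio.h>

#define FRAME_MAX  2048u  /* mirrors AQ_CFG_RX_FRAME_MAX after this series; illustrative */
#define PAGE_BYTES 4096u  /* assumes a 4 KiB page and rxpageorder 0 */

/* Toy stand-in for struct aq_rxpage: only the fields the decision needs. */
struct rxpage_model {
	unsigned int pg_off; /* offset of the frame most recently handed to hardware */
	int refcount;        /* 1 == the ring is the only user of the page */
};

enum refill_outcome { PAGE_REUSED, PAGE_FLIPPED, PAGE_REALLOCATED };

/* Decision modeled on aq_get_rxpages() in the aq_ring.c hunk below. */
static enum refill_outcome rx_refill_decision(struct rxpage_model *p)
{
	if (p->refcount > 1) {
		/* An skb still holds the page: flip to the next frame-sized
		 * slot if one still fits, otherwise give the page up.
		 */
		if (p->pg_off + 2 * FRAME_MAX <= PAGE_BYTES) {
			p->pg_off += FRAME_MAX;
			return PAGE_FLIPPED;     /* stats.rx.pg_flips */
		}
		return PAGE_REALLOCATED;         /* stats.rx.pg_losts */
	}
	/* Ring is the only user: rewind and reuse the whole page. */
	p->pg_off = 0;
	return PAGE_REUSED;                      /* stats.rx.pg_reuses */
}

int main(void)
{
	struct rxpage_model p = { .pg_off = 0, .refcount = 2 };

	printf("%d off=%u\n", rx_refill_decision(&p), p.pg_off); /* flipped, off=2048 */
	printf("%d off=%u\n", rx_refill_decision(&p), p.pg_off); /* reallocated, page exhausted */
	p.refcount = 1;
	printf("%d off=%u\n", rx_refill_decision(&p), p.pg_off); /* reused, off=0 */
	return 0;
}

The three outcomes correspond to the pg_reuses, pg_flips and pg_losts ring statistics the series introduces.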

Signed-off-by: David S. Miller <davem@davemloft.net>
David S. Miller committed Mar 24, 2019
2 parents d64fee0 + d0d443c commit 956ca8f
Showing 13 changed files with 223 additions and 72 deletions.
3 changes: 2 additions & 1 deletion drivers/net/ethernet/aquantia/Kconfig
@@ -17,7 +17,8 @@ if NET_VENDOR_AQUANTIA

config AQTION
tristate "aQuantia AQtion(tm) Support"
depends on PCI && X86_64
depends on PCI
depends on X86_64 || ARM64 || COMPILE_TEST
---help---
This enables the support for the aQuantia AQtion(tm) Ethernet card.

10 changes: 8 additions & 2 deletions drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
@@ -16,7 +16,7 @@
#define AQ_CFG_TCS_DEF 1U

#define AQ_CFG_TXDS_DEF 4096U
#define AQ_CFG_RXDS_DEF 1024U
#define AQ_CFG_RXDS_DEF 2048U

#define AQ_CFG_IS_POLLING_DEF 0U

@@ -34,10 +34,16 @@
#define AQ_CFG_TCS_MAX 8U

#define AQ_CFG_TX_FRAME_MAX (16U * 1024U)
#define AQ_CFG_RX_FRAME_MAX (4U * 1024U)
#define AQ_CFG_RX_FRAME_MAX (2U * 1024U)

#define AQ_CFG_TX_CLEAN_BUDGET 256U

#define AQ_CFG_RX_REFILL_THRES 32U

#define AQ_CFG_RX_HDR_SIZE 256U

#define AQ_CFG_RX_PAGEORDER 0U

/* LRO */
#define AQ_CFG_IS_LRO_DEF 1U

1 change: 1 addition & 0 deletions drivers/net/ethernet/aquantia/atlantic/aq_nic.c
@@ -73,6 +73,7 @@ void aq_nic_cfg_start(struct aq_nic_s *self)
cfg->tx_itr = aq_itr_tx;
cfg->rx_itr = aq_itr_rx;

cfg->rxpageorder = AQ_CFG_RX_PAGEORDER;
cfg->is_rss = AQ_CFG_IS_RSS_DEF;
cfg->num_rss_queues = AQ_CFG_NUM_RSS_QUEUES_DEF;
cfg->aq_rss.base_cpu_number = AQ_CFG_RSS_BASE_CPU_NUM_DEF;
1 change: 1 addition & 0 deletions drivers/net/ethernet/aquantia/atlantic/aq_nic.h
@@ -31,6 +31,7 @@ struct aq_nic_cfg_s {
u32 itr;
u16 rx_itr;
u16 tx_itr;
u32 rxpageorder;
u32 num_rss_queues;
u32 mtu;
u32 flow_control;
187 changes: 138 additions & 49 deletions drivers/net/ethernet/aquantia/atlantic/aq_ring.c
@@ -12,10 +12,89 @@
#include "aq_ring.h"
#include "aq_nic.h"
#include "aq_hw.h"
#include "aq_hw_utils.h"

#include <linux/netdevice.h>
#include <linux/etherdevice.h>

static inline void aq_free_rxpage(struct aq_rxpage *rxpage, struct device *dev)
{
unsigned int len = PAGE_SIZE << rxpage->order;

dma_unmap_page(dev, rxpage->daddr, len, DMA_FROM_DEVICE);

/* Drop the ref for being in the ring. */
__free_pages(rxpage->page, rxpage->order);
rxpage->page = NULL;
}

static int aq_get_rxpage(struct aq_rxpage *rxpage, unsigned int order,
struct device *dev)
{
struct page *page;
dma_addr_t daddr;
int ret = -ENOMEM;

page = dev_alloc_pages(order);
if (unlikely(!page))
goto err_exit;

daddr = dma_map_page(dev, page, 0, PAGE_SIZE << order,
DMA_FROM_DEVICE);

if (unlikely(dma_mapping_error(dev, daddr)))
goto free_page;

rxpage->page = page;
rxpage->daddr = daddr;
rxpage->order = order;
rxpage->pg_off = 0;

return 0;

free_page:
__free_pages(page, order);

err_exit:
return ret;
}

static int aq_get_rxpages(struct aq_ring_s *self, struct aq_ring_buff_s *rxbuf,
int order)
{
int ret;

if (rxbuf->rxdata.page) {
/* One means ring is the only user and can reuse */
if (page_ref_count(rxbuf->rxdata.page) > 1) {
/* Try reuse buffer */
rxbuf->rxdata.pg_off += AQ_CFG_RX_FRAME_MAX;
if (rxbuf->rxdata.pg_off + AQ_CFG_RX_FRAME_MAX <=
(PAGE_SIZE << order)) {
self->stats.rx.pg_flips++;
} else {
/* Buffer exhausted. We have other users and
* should release this page and realloc
*/
aq_free_rxpage(&rxbuf->rxdata,
aq_nic_get_dev(self->aq_nic));
self->stats.rx.pg_losts++;
}
} else {
rxbuf->rxdata.pg_off = 0;
self->stats.rx.pg_reuses++;
}
}

if (!rxbuf->rxdata.page) {
ret = aq_get_rxpage(&rxbuf->rxdata, order,
aq_nic_get_dev(self->aq_nic));
return ret;
}

return 0;
}

static struct aq_ring_s *aq_ring_alloc(struct aq_ring_s *self,
struct aq_nic_s *aq_nic)
{
@@ -81,6 +160,11 @@ struct aq_ring_s *aq_ring_rx_alloc(struct aq_ring_s *self,
self->idx = idx;
self->size = aq_nic_cfg->rxds;
self->dx_size = aq_nic_cfg->aq_hw_caps->rxd_size;
self->page_order = fls(AQ_CFG_RX_FRAME_MAX / PAGE_SIZE +
(AQ_CFG_RX_FRAME_MAX % PAGE_SIZE ? 1 : 0)) - 1;

if (aq_nic_cfg->rxpageorder > self->page_order)
self->page_order = aq_nic_cfg->rxpageorder;

self = aq_ring_alloc(self, aq_nic);
if (!self) {
@@ -201,22 +285,21 @@ int aq_ring_rx_clean(struct aq_ring_s *self,
int budget)
{
struct net_device *ndev = aq_nic_get_ndev(self->aq_nic);
int err = 0;
bool is_rsc_completed = true;
int err = 0;

for (; (self->sw_head != self->hw_head) && budget;
self->sw_head = aq_ring_next_dx(self, self->sw_head),
--budget, ++(*work_done)) {
struct aq_ring_buff_s *buff = &self->buff_ring[self->sw_head];
struct aq_ring_buff_s *buff_ = NULL;
struct sk_buff *skb = NULL;
unsigned int next_ = 0U;
unsigned int i = 0U;
struct aq_ring_buff_s *buff_ = NULL;
u16 hdr_len;

if (buff->is_error) {
__free_pages(buff->page, 0);
continue;
}
if (buff->is_error)
continue;

if (buff->is_cleaned)
continue;
@@ -246,45 +329,66 @@ int aq_ring_rx_clean(struct aq_ring_s *self,
}
}

dma_sync_single_range_for_cpu(aq_nic_get_dev(self->aq_nic),
buff->rxdata.daddr,
buff->rxdata.pg_off,
buff->len, DMA_FROM_DEVICE);

/* for single fragment packets use build_skb() */
if (buff->is_eop &&
buff->len <= AQ_CFG_RX_FRAME_MAX - AQ_SKB_ALIGN) {
skb = build_skb(page_address(buff->page),
skb = build_skb(aq_buf_vaddr(&buff->rxdata),
AQ_CFG_RX_FRAME_MAX);
if (unlikely(!skb)) {
err = -ENOMEM;
goto err_exit;
}

skb_put(skb, buff->len);
page_ref_inc(buff->rxdata.page);
} else {
skb = netdev_alloc_skb(ndev, ETH_HLEN);
skb = napi_alloc_skb(napi, AQ_CFG_RX_HDR_SIZE);
if (unlikely(!skb)) {
err = -ENOMEM;
goto err_exit;
}
skb_put(skb, ETH_HLEN);
memcpy(skb->data, page_address(buff->page), ETH_HLEN);

skb_add_rx_frag(skb, 0, buff->page, ETH_HLEN,
buff->len - ETH_HLEN,
SKB_TRUESIZE(buff->len - ETH_HLEN));
hdr_len = buff->len;
if (hdr_len > AQ_CFG_RX_HDR_SIZE)
hdr_len = eth_get_headlen(aq_buf_vaddr(&buff->rxdata),
AQ_CFG_RX_HDR_SIZE);

memcpy(__skb_put(skb, hdr_len), aq_buf_vaddr(&buff->rxdata),
ALIGN(hdr_len, sizeof(long)));

if (buff->len - hdr_len > 0) {
skb_add_rx_frag(skb, 0, buff->rxdata.page,
buff->rxdata.pg_off + hdr_len,
buff->len - hdr_len,
AQ_CFG_RX_FRAME_MAX);
page_ref_inc(buff->rxdata.page);
}

if (!buff->is_eop) {
for (i = 1U, next_ = buff->next,
buff_ = &self->buff_ring[next_];
true; next_ = buff_->next,
buff_ = &self->buff_ring[next_], ++i) {
skb_add_rx_frag(skb, i,
buff_->page, 0,
buff_->len,
SKB_TRUESIZE(buff->len -
ETH_HLEN));
buff_->is_cleaned = 1;

if (buff_->is_eop)
break;
}
buff_ = buff;
i = 1U;
do {
next_ = buff_->next,
buff_ = &self->buff_ring[next_];

dma_sync_single_range_for_cpu(
aq_nic_get_dev(self->aq_nic),
buff_->rxdata.daddr,
buff_->rxdata.pg_off,
buff_->len,
DMA_FROM_DEVICE);
skb_add_rx_frag(skb, i++,
buff_->rxdata.page,
buff_->rxdata.pg_off,
buff_->len,
AQ_CFG_RX_FRAME_MAX);
page_ref_inc(buff_->rxdata.page);
buff_->is_cleaned = 1;
} while (!buff_->is_eop);
}
}

@@ -310,43 +414,31 @@ int aq_ring_rx_clean(struct aq_ring_s *self,

int aq_ring_rx_fill(struct aq_ring_s *self)
{
unsigned int pages_order = fls(AQ_CFG_RX_FRAME_MAX / PAGE_SIZE +
(AQ_CFG_RX_FRAME_MAX % PAGE_SIZE ? 1 : 0)) - 1;
unsigned int page_order = self->page_order;
struct aq_ring_buff_s *buff = NULL;
int err = 0;
int i = 0;

if (aq_ring_avail_dx(self) < min_t(unsigned int, AQ_CFG_RX_REFILL_THRES,
self->size / 2))
return err;

for (i = aq_ring_avail_dx(self); i--;
self->sw_tail = aq_ring_next_dx(self, self->sw_tail)) {
buff = &self->buff_ring[self->sw_tail];

buff->flags = 0U;
buff->len = AQ_CFG_RX_FRAME_MAX;

buff->page = alloc_pages(GFP_ATOMIC | __GFP_COMP, pages_order);
if (!buff->page) {
err = -ENOMEM;
goto err_exit;
}

buff->pa = dma_map_page(aq_nic_get_dev(self->aq_nic),
buff->page, 0,
AQ_CFG_RX_FRAME_MAX, DMA_FROM_DEVICE);

if (dma_mapping_error(aq_nic_get_dev(self->aq_nic), buff->pa)) {
err = -ENOMEM;
goto err_exit;
}

err = aq_get_rxpages(self, buff, page_order);
if (err)
goto err_exit;

buff->pa = aq_buf_daddr(&buff->rxdata);
buff = NULL;
}

err_exit:
if (err < 0) {
if (buff && buff->page)
__free_pages(buff->page, 0);
}

return err;
}

@@ -359,10 +451,7 @@ void aq_ring_rx_deinit(struct aq_ring_s *self)
self->sw_head = aq_ring_next_dx(self, self->sw_head)) {
struct aq_ring_buff_s *buff = &self->buff_ring[self->sw_head];

dma_unmap_page(aq_nic_get_dev(self->aq_nic), buff->pa,
AQ_CFG_RX_FRAME_MAX, DMA_FROM_DEVICE);

__free_pages(buff->page, 0);
aq_free_rxpage(&buff->rxdata, aq_nic_get_dev(self->aq_nic));
}

err_exit:;
