Skip to content

Commit

Permalink
net/mlx4_en: Add accelerated RFS support
Browse files Browse the repository at this point in the history
Use RFS infrastructure and flow steering in HW to keep CPU
affinity of rx interrupts and application per TCP stream.

A flow steering filter is added to the HW whenever the RFS
ndo callback is invoked by core networking code.

Because the invocation takes place in interrupt context, the
actual setup of the HW is done using a workqueue. Whenever a new
filter is added, the driver checks for expiry of existing filters.

Since there is a window of time between the point where the core
RFS code invokes the ndo callback and the point where the HW
is configured from the workqueue context, the 2nd, 3rd, etc.
packets from that stream will cause the net core to invoke
the callback again and again.

To prevent inefficient/double configuration of the HW, the filters
are kept in a database which is indexed using a hash function to
enable fast access.

Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
Amir Vadai authored and David S. Miller committed Jul 19, 2012
1 parent d9236c3 commit 1eb8c69
Show file tree
Hide file tree
Showing 4 changed files with 342 additions and 1 deletion.
8 changes: 7 additions & 1 deletion drivers/net/ethernet/mellanox/mlx4/en_cq.c
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,12 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
struct mlx4_en_dev *mdev = priv->mdev;
int err = 0;
char name[25];
struct cpu_rmap *rmap =
#ifdef CONFIG_RFS_ACCEL
priv->dev->rx_cpu_rmap;
#else
NULL;
#endif

cq->dev = mdev->pndev[priv->port];
cq->mcq.set_ci_db = cq->wqres.db.db;
Expand All @@ -91,7 +97,7 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
sprintf(name, "%s-%d", priv->dev->name,
cq->ring);
/* Set IRQ for specific name (per ring) */
if (mlx4_assign_eq(mdev->dev, name, NULL,
if (mlx4_assign_eq(mdev->dev, name, rmap,
&cq->vector)) {
cq->vector = (cq->ring + 1 + priv->port)
% mdev->dev->caps.num_comp_vectors;
Expand Down
316 changes: 316 additions & 0 deletions drivers/net/ethernet/mellanox/mlx4/en_netdev.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
#include <linux/if_vlan.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <net/ip.h>

#include <linux/mlx4/driver.h>
#include <linux/mlx4/device.h>
Expand Down Expand Up @@ -66,6 +68,299 @@ static int mlx4_en_setup_tc(struct net_device *dev, u8 up)
return 0;
}

#ifdef CONFIG_RFS_ACCEL

/*
 * One accelerated-RFS steering filter. It describes a single IPv4/TCP stream
 * by its source/destination address and port, together with the RX queue the
 * stream should be delivered to.
 */
struct mlx4_en_filter {
/* Linkage in priv->filters, or in a temporary deletion list while freeing */
struct list_head next;
/* Deferred HW programming; handled by mlx4_en_filter_work() */
struct work_struct work;

/* Flow 4-tuple, stored exactly as read from the packet headers */
__be32 src_ip;
__be32 dst_ip;
__be16 src_port;
__be16 dst_port;

/* RX queue index; selects the QP in priv->rss_map.qps[] for the HW rule */
int rxq_index;
/* Owning port private data */
struct mlx4_en_priv *priv;
u32 flow_id; /* RFS infrastructure id */
int id; /* mlx4_en driver id */
u64 reg_id; /* Flow steering API id */
u8 activated; /* Used to prevent expiry before filter
* is attached
*/
/* Linkage in the priv->filter_hash bucket for this 4-tuple */
struct hlist_node filter_chain;
};

static void mlx4_en_filter_rfs_expire(struct mlx4_en_priv *priv);

static void mlx4_en_filter_work(struct work_struct *work)
{
struct mlx4_en_filter *filter = container_of(work,
struct mlx4_en_filter,
work);
struct mlx4_en_priv *priv = filter->priv;
struct mlx4_spec_list spec_tcp = {
.id = MLX4_NET_TRANS_RULE_ID_TCP,
{
.tcp_udp = {
.dst_port = filter->dst_port,
.dst_port_msk = (__force __be16)-1,
.src_port = filter->src_port,
.src_port_msk = (__force __be16)-1,
},
},
};
struct mlx4_spec_list spec_ip = {
.id = MLX4_NET_TRANS_RULE_ID_IPV4,
{
.ipv4 = {
.dst_ip = filter->dst_ip,
.dst_ip_msk = (__force __be32)-1,
.src_ip = filter->src_ip,
.src_ip_msk = (__force __be32)-1,
},
},
};
struct mlx4_spec_list spec_eth = {
.id = MLX4_NET_TRANS_RULE_ID_ETH,
};
struct mlx4_net_trans_rule rule = {
.list = LIST_HEAD_INIT(rule.list),
.queue_mode = MLX4_NET_TRANS_Q_LIFO,
.exclusive = 1,
.allow_loopback = 1,
.promisc_mode = MLX4_FS_PROMISC_NONE,
.port = priv->port,
.priority = MLX4_DOMAIN_RFS,
};
int rc;
__be64 mac;
__be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16);

list_add_tail(&spec_eth.list, &rule.list);
list_add_tail(&spec_ip.list, &rule.list);
list_add_tail(&spec_tcp.list, &rule.list);

mac = cpu_to_be64((priv->mac & MLX4_MAC_MASK) << 16);

rule.qpn = priv->rss_map.qps[filter->rxq_index].qpn;
memcpy(spec_eth.eth.dst_mac, &mac, ETH_ALEN);
memcpy(spec_eth.eth.dst_mac_msk, &mac_mask, ETH_ALEN);

filter->activated = 0;

if (filter->reg_id) {
rc = mlx4_flow_detach(priv->mdev->dev, filter->reg_id);
if (rc && rc != -ENOENT)
en_err(priv, "Error detaching flow. rc = %d\n", rc);
}

rc = mlx4_flow_attach(priv->mdev->dev, &rule, &filter->reg_id);
if (rc)
en_err(priv, "Error attaching flow. err = %d\n", rc);

mlx4_en_filter_rfs_expire(priv);

filter->activated = 1;
}

/*
 * Select the bucket of priv->filter_hash that a flow 4-tuple belongs to.
 *
 * The two 16-bit ports are packed into separate halves of a 32-bit value,
 * XORed with both addresses, and reduced to MLX4_EN_FILTER_HASH_SHIFT bits
 * with hash_long(). The arguments are only used as bit patterns, so they are
 * hashed as-is without any byte swapping.
 *
 * Returns a pointer to the head of the selected hash chain.
 */
static inline struct hlist_head *
filter_hash_bucket(struct mlx4_en_priv *priv, __be32 src_ip, __be32 dst_ip,
		   __be16 src_port, __be16 dst_port)
{
	unsigned long l;
	int bucket_idx;

	/*
	 * Shift dst_port by a full 16 bits so that it does not overlap the
	 * bits of src_port; a smaller shift would OR most of the two ports
	 * together and throw away hash input.
	 */
	l = (__force unsigned long)src_port |
	    ((__force unsigned long)dst_port << 16);
	l ^= (__force unsigned long)(src_ip ^ dst_ip);

	bucket_idx = hash_long(l, MLX4_EN_FILTER_HASH_SHIFT);

	return &priv->filter_hash[bucket_idx];
}

/*
 * Allocate a filter for the given 4-tuple and register it in the driver's
 * bookkeeping: the priv->filters list and the matching hash bucket.
 *
 * The allocation uses GFP_ATOMIC. The filter receives the next driver id from
 * priv->last_filter_id. Its work item is initialised but not queued here.
 * The visible caller invokes this with priv->filters_lock held.
 *
 * Returns the new filter, or NULL if the allocation failed.
 */
static struct mlx4_en_filter *
mlx4_en_filter_alloc(struct mlx4_en_priv *priv, int rxq_index, __be32 src_ip,
		     __be32 dst_ip, __be16 src_port, __be16 dst_port,
		     u32 flow_id)
{
	struct mlx4_en_filter *flt;

	flt = kzalloc(sizeof(*flt), GFP_ATOMIC);
	if (!flt)
		return NULL;

	/* Ownership and steering target */
	flt->priv = priv;
	flt->rxq_index = rxq_index;
	INIT_WORK(&flt->work, mlx4_en_filter_work);

	/* The flow being steered */
	flt->src_ip = src_ip;
	flt->dst_ip = dst_ip;
	flt->src_port = src_port;
	flt->dst_port = dst_port;

	/* Identifiers: one from the RFS core, one handed out by the driver */
	flt->flow_id = flow_id;
	flt->id = priv->last_filter_id++;

	/* Make the filter reachable both by list walk and by hash lookup */
	list_add_tail(&flt->next, &priv->filters);
	hlist_add_head(&flt->filter_chain,
		       filter_hash_bucket(priv, src_ip, dst_ip, src_port,
					  dst_port));

	return flt;
}

static void mlx4_en_filter_free(struct mlx4_en_filter *filter)
{
struct mlx4_en_priv *priv = filter->priv;
int rc;

list_del(&filter->next);

rc = mlx4_flow_detach(priv->mdev->dev, filter->reg_id);
if (rc && rc != -ENOENT)
en_err(priv, "Error detaching flow. rc = %d\n", rc);

kfree(filter);
}

/*
 * Look up the filter that matches a flow 4-tuple exactly.
 *
 * Only the hash bucket selected by filter_hash_bucket() is searched.
 * Returns the matching filter, or NULL if there is none.
 */
static inline struct mlx4_en_filter *
mlx4_en_filter_find(struct mlx4_en_priv *priv, __be32 src_ip, __be32 dst_ip,
		    __be16 src_port, __be16 dst_port)
{
	struct hlist_head *bucket = filter_hash_bucket(priv, src_ip, dst_ip,
						       src_port, dst_port);
	struct hlist_node *pos;
	struct mlx4_en_filter *flt;

	hlist_for_each_entry(flt, pos, bucket, filter_chain) {
		/* Different flows may share a bucket; compare all fields */
		if (flt->src_ip != src_ip || flt->dst_ip != dst_ip)
			continue;
		if (flt->src_port != src_port || flt->dst_port != dst_port)
			continue;

		return flt;
	}

	return NULL;
}

/*
 * ndo_rx_flow_steer callback: ask for a flow to be steered to rxq_index.
 *
 * Only non-fragmented IPv4/TCP packets are accepted. If a filter for the flow
 * already exists and targets the same queue nothing is scheduled; if it
 * targets another queue it is retargeted and its work is queued; otherwise a
 * new filter is allocated and its work is queued. HW programming itself is
 * done later by mlx4_en_filter_work().
 *
 * Returns the driver id of the filter on success, -EPROTONOSUPPORT for
 * unsupported packets, or -ENOMEM if a new filter could not be allocated.
 */
static int
mlx4_en_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
		   u16 rxq_index, u32 flow_id)
{
	struct mlx4_en_priv *priv = netdev_priv(net_dev);
	const int nhoff = skb_network_offset(skb);
	struct mlx4_en_filter *flt;
	const struct iphdr *iph;
	const __be16 *l4_ports;
	__be32 src_ip, dst_ip;
	__be16 src_port, dst_port;
	int ret;

	if (skb->protocol != htons(ETH_P_IP))
		return -EPROTONOSUPPORT;

	iph = (const struct iphdr *)(skb->data + nhoff);
	if (ip_is_fragment(iph))
		return -EPROTONOSUPPORT;

	/* The L4 header starts right after the (variable length) IP header */
	l4_ports = (const __be16 *)(skb->data + nhoff + 4 * iph->ihl);

	/* Note: the ports are read before the protocol is known to be TCP */
	src_ip = iph->saddr;
	dst_ip = iph->daddr;
	src_port = l4_ports[0];
	dst_port = l4_ports[1];

	if (iph->protocol != IPPROTO_TCP)
		return -EPROTONOSUPPORT;

	spin_lock_bh(&priv->filters_lock);

	flt = mlx4_en_filter_find(priv, src_ip, dst_ip, src_port, dst_port);
	if (!flt) {
		/* First packet seen for this flow: create and schedule it */
		flt = mlx4_en_filter_alloc(priv, rxq_index,
					   src_ip, dst_ip,
					   src_port, dst_port, flow_id);
		if (!flt) {
			ret = -ENOMEM;
			goto unlock;
		}
		queue_work(priv->mdev->workqueue, &flt->work);
	} else if (flt->rxq_index != rxq_index) {
		/* Known flow moved to another queue: reprogram the rule */
		flt->rxq_index = rxq_index;
		queue_work(priv->mdev->workqueue, &flt->work);
	}
	/* else: known flow, same queue; nothing to schedule */

	ret = flt->id;

unlock:
	spin_unlock_bh(&priv->filters_lock);

	return ret;
}

void mlx4_en_cleanup_filters(struct mlx4_en_priv *priv,
struct mlx4_en_rx_ring *rx_ring)
{
struct mlx4_en_filter *filter, *tmp;
LIST_HEAD(del_list);

spin_lock_bh(&priv->filters_lock);
list_for_each_entry_safe(filter, tmp, &priv->filters, next) {
list_move(&filter->next, &del_list);
hlist_del(&filter->filter_chain);
}
spin_unlock_bh(&priv->filters_lock);

list_for_each_entry_safe(filter, tmp, &del_list, next) {
cancel_work_sync(&filter->work);
mlx4_en_filter_free(filter);
}
}

static void mlx4_en_filter_rfs_expire(struct mlx4_en_priv *priv)
{
struct mlx4_en_filter *filter = NULL, *tmp, *last_filter = NULL;
LIST_HEAD(del_list);
int i = 0;

spin_lock_bh(&priv->filters_lock);
list_for_each_entry_safe(filter, tmp, &priv->filters, next) {
if (i > MLX4_EN_FILTER_EXPIRY_QUOTA)
break;

if (filter->activated &&
!work_pending(&filter->work) &&
rps_may_expire_flow(priv->dev,
filter->rxq_index, filter->flow_id,
filter->id)) {
list_move(&filter->next, &del_list);
hlist_del(&filter->filter_chain);
} else
last_filter = filter;

i++;
}

if (last_filter && (&last_filter->next != priv->filters.next))
list_move(&priv->filters, &last_filter->next);

spin_unlock_bh(&priv->filters_lock);

list_for_each_entry_safe(filter, tmp, &del_list, next)
mlx4_en_filter_free(filter);
}
#endif

static int mlx4_en_vlan_rx_add_vid(struct net_device *dev, unsigned short vid)
{
struct mlx4_en_priv *priv = netdev_priv(dev);
Expand Down Expand Up @@ -1079,6 +1374,11 @@ void mlx4_en_free_resources(struct mlx4_en_priv *priv)
{
int i;

#ifdef CONFIG_RFS_ACCEL
free_irq_cpu_rmap(priv->dev->rx_cpu_rmap);
priv->dev->rx_cpu_rmap = NULL;
#endif

for (i = 0; i < priv->tx_ring_num; i++) {
if (priv->tx_ring[i].tx_info)
mlx4_en_destroy_tx_ring(priv, &priv->tx_ring[i]);
Expand Down Expand Up @@ -1134,6 +1434,15 @@ int mlx4_en_alloc_resources(struct mlx4_en_priv *priv)
goto err;
}

#ifdef CONFIG_RFS_ACCEL
priv->dev->rx_cpu_rmap = alloc_irq_cpu_rmap(priv->rx_ring_num);
if (!priv->dev->rx_cpu_rmap)
goto err;

INIT_LIST_HEAD(&priv->filters);
spin_lock_init(&priv->filters_lock);
#endif

return 0;

err:
Expand Down Expand Up @@ -1241,6 +1550,9 @@ static const struct net_device_ops mlx4_netdev_ops = {
#endif
.ndo_set_features = mlx4_en_set_features,
.ndo_setup_tc = mlx4_en_setup_tc,
#ifdef CONFIG_RFS_ACCEL
.ndo_rx_flow_steer = mlx4_en_filter_rfs,
#endif
};

int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
Expand Down Expand Up @@ -1358,6 +1670,10 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
NETIF_F_HW_VLAN_FILTER;
dev->hw_features |= NETIF_F_LOOPBACK;

if (mdev->dev->caps.steering_mode ==
MLX4_STEERING_MODE_DEVICE_MANAGED)
dev->hw_features |= NETIF_F_NTUPLE;

mdev->pndev[port] = dev;

netif_carrier_off(dev);
Expand Down
3 changes: 3 additions & 0 deletions drivers/net/ethernet/mellanox/mlx4/en_rx.c
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,9 @@ void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
mlx4_free_hwq_res(mdev->dev, &ring->wqres, size * stride + TXBB_SIZE);
vfree(ring->rx_info);
ring->rx_info = NULL;
#ifdef CONFIG_RFS_ACCEL
mlx4_en_cleanup_filters(priv, ring);
#endif
}

void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
Expand Down
Loading

0 comments on commit 1eb8c69

Please sign in to comment.