xps: Transmit Packet Steering
This patch implements transmit packet steering (XPS) for multiqueue
devices.  XPS selects a transmit queue during packet transmission based
on configuration.  This is done by mapping the CPU transmitting the
packet to a queue.  This is the transmit-side analogue to RPS: where
RPS selects a CPU based on the receive queue, XPS selects a queue
based on the transmitting CPU.  (There was previously an XPS patch
from Eric Dumazet, but that might more appropriately be called
transmit completion steering.)

Each transmit queue can be associated with a number of CPUs which will
use the queue to send packets.  This is configured as a CPU mask on a
per-queue basis in:

/sys/class/net/eth<n>/queues/tx-<n>/xps_cpus
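
For illustration, here is a minimal user-space sketch (not part of this
patch) that pins transmit queue i to CPU i by writing a one-bit hex
mask to each xps_cpus file; the interface name eth0 and the 16-queue
count are assumptions chosen to match the benchmark below:

#include <stdio.h>

int main(void)
{
	char path[128];
	int i;

	for (i = 0; i < 16; i++) {
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/class/net/eth0/queues/tx-%d/xps_cpus", i);
		f = fopen(path, "w");
		if (!f) {
			perror(path);
			return 1;
		}
		fprintf(f, "%x\n", 1u << i);	/* allow CPU i only */
		fclose(f);
	}
	return 0;
}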

The mappings are stored per device in an inverted data structure that
maps CPUs to queues.  In the netdevice structure this is an array with
one entry per possible CPU, where each entry holds an array of queue
indexes for the queues that CPU can use.
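
As an illustrative model only (simplified user-space C, not the kernel
structures added below), the lookup amounts to indexing the per-device
table by the sending CPU and then choosing among that CPU's queues:

#include <stdio.h>

/* Simplified stand-ins for the kernel's xps_map / xps_dev_maps. */
struct xps_map {
	unsigned int len;		/* number of queues this CPU may use */
	unsigned short queues[4];	/* queue indexes */
};

struct xps_dev_maps {
	struct xps_map *cpu_map[8];	/* indexed by CPU id */
};

/* Pick a queue for 'cpu': a single entry is used directly, otherwise a
 * 32-bit flow hash spreads traffic across that CPU's queues. */
static int pick_queue(struct xps_dev_maps *maps, int cpu, unsigned int hash)
{
	struct xps_map *map = maps->cpu_map[cpu];

	if (!map || map->len == 0)
		return -1;	/* no map: fall back to default hashing */
	if (map->len == 1)
		return map->queues[0];
	return map->queues[((unsigned long long)hash * map->len) >> 32];
}

int main(void)
{
	struct xps_map m = { .len = 2, .queues = { 4, 5 } };
	struct xps_dev_maps maps = { .cpu_map = { [3] = &m } };

	printf("CPU 3 -> queue %d\n", pick_queue(&maps, 3, 0x9e3779b9u));
	return 0;
}

The kernel's get_xps_queue() in the patch below performs the same
selection under rcu_read_lock().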

The benefits of XPS are improved locality in the per-queue data
structures.  Also, transmit completions are more likely to be handled
near the sending thread, so this should promote locality back to the
socket on free (e.g. UDP).  The benefits of XPS depend on the cache
hierarchy, application load, and other factors.  XPS would nominally be
configured so that a queue is shared only by CPUs which share a cache;
the degenerate configuration would be that each CPU has its own queue.

Below are some benchmark results which show the potential benefit of
this patch.  The netperf test runs 500 instances of the netperf TCP_RR
test with 1-byte requests and responses.

bnx2x on 16 core AMD
   XPS (16 queues, 1 TX queue per CPU)  1234K at 100% CPU
   No XPS (16 queues)                   996K at 100% CPU

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Tom Herbert authored and David S. Miller committed Nov 24, 2010
1 parent 3853b58 commit 1d24eb4
Showing 4 changed files with 447 additions and 8 deletions.
include/linux/netdevice.h: 30 additions & 0 deletions
@@ -503,6 +503,10 @@ struct netdev_queue {
	struct Qdisc *qdisc;
	unsigned long state;
	struct Qdisc *qdisc_sleeping;
#ifdef CONFIG_RPS
	struct kobject kobj;
#endif

/*
 * write mostly part
 */
@@ -529,6 +533,30 @@ struct rps_map {
};
#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16)))

/*
 * This structure holds an XPS map which can be of variable length. The
 * map is an array of queues.
 */
struct xps_map {
	unsigned int len;
	unsigned int alloc_len;
	struct rcu_head rcu;
	u16 queues[0];
};
#define XPS_MAP_SIZE(_num) (sizeof(struct xps_map) + (_num * sizeof(u16)))
#define XPS_MIN_MAP_ALLOC ((L1_CACHE_BYTES - sizeof(struct xps_map))	\
    / sizeof(u16))

/*
 * This structure holds all XPS maps for device. Maps are indexed by CPU.
 */
struct xps_dev_maps {
	struct rcu_head rcu;
	struct xps_map *cpu_map[0];
};
#define XPS_DEV_MAPS_SIZE (sizeof(struct xps_dev_maps) +	\
    (nr_cpu_ids * sizeof(struct xps_map *)))

/*
 * The rps_dev_flow structure contains the mapping of a flow to a CPU and the
 * tail pointer for that CPU's input queue at the time of last enqueue.
@@ -1016,6 +1044,8 @@ struct net_device {
	unsigned long tx_queue_len; /* Max frames per queue allowed */
	spinlock_t tx_global_lock;

	struct xps_dev_maps *xps_maps;

	/* These may be needed for future network-power-down code. */

	/*
net/core/dev.c: 50 additions & 3 deletions
@@ -1557,12 +1557,16 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 */
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
{
	int rc;

	if (txq < 1 || txq > dev->num_tx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		ASSERT_RTNL();

		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
						  txq);
		if (txq < dev->real_num_tx_queues)
			qdisc_reset_all_tx_gt(dev, txq);
	}
@@ -2142,6 +2146,44 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
	return queue_index;
}

static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_RPS
	struct xps_dev_maps *dev_maps;
	struct xps_map *map;
	int queue_index = -1;

	rcu_read_lock();
	dev_maps = rcu_dereference(dev->xps_maps);
	if (dev_maps) {
		map = rcu_dereference(
		    dev_maps->cpu_map[raw_smp_processor_id()]);
		if (map) {
			if (map->len == 1)
				queue_index = map->queues[0];
			else {
				u32 hash;
				if (skb->sk && skb->sk->sk_hash)
					hash = skb->sk->sk_hash;
				else
					hash = (__force u16) skb->protocol ^
					    skb->rxhash;
				hash = jhash_1word(hash, hashrnd);
				queue_index = map->queues[
				    ((u64)hash * map->len) >> 32];
			}
			if (unlikely(queue_index >= dev->real_num_tx_queues))
				queue_index = -1;
		}
	}
	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}

static struct netdev_queue *dev_pick_tx(struct net_device *dev,
					struct sk_buff *skb)
{
@@ -2161,7 +2203,9 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
		    queue_index >= dev->real_num_tx_queues) {
			int old_index = queue_index;

			queue_index = skb_tx_hash(dev, skb);
			queue_index = get_xps_queue(dev, skb);
			if (queue_index < 0)
				queue_index = skb_tx_hash(dev, skb);

			if (queue_index != old_index && sk) {
				struct dst_entry *dst =
@@ -5066,6 +5110,7 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
{
	unsigned int count = dev->num_tx_queues;
	struct netdev_queue *tx;
	int i;

	BUG_ON(count < 1);

@@ -5076,15 +5121,17 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
		return -ENOMEM;
	}
	dev->_tx = tx;

	for (i = 0; i < count; i++)
		tx[i].dev = dev;

	return 0;
}

static void netdev_init_one_queue(struct net_device *dev,
				  struct netdev_queue *queue,
				  void *_unused)
{
	queue->dev = dev;

	/* Initialize queue lock */
	spin_lock_init(&queue->_xmit_lock);
	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
(diff for the remaining changed files not shown)
