Skip to content

Commit

Permalink
xdp: Add batching support to redirect map
Browse files Browse the repository at this point in the history
For performance reasons we want to avoid updating the tail pointer in
the driver tx ring as much as possible. To accomplish this we add
batching support to the redirect path in XDP.

This adds another ndo op "xdp_flush" that is used to inform the driver
that it should bump the tail pointer on the TX ring.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
John Fastabend authored and David S. Miller committed Jul 17, 2017
1 parent 97f91a7 commit 11393cc
Show file tree
Hide file tree
Showing 6 changed files with 166 additions and 15 deletions.
28 changes: 25 additions & 3 deletions drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -2415,6 +2415,8 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
*/
wmb();
writel(ring->next_to_use, ring->tail);

xdp_do_flush_map();
}

u64_stats_update_begin(&rx_ring->syncp);
Expand Down Expand Up @@ -5817,6 +5819,9 @@ void ixgbe_down(struct ixgbe_adapter *adapter)

usleep_range(10000, 20000);

/* synchronize_sched() needed for pending XDP buffers to drain */
if (adapter->xdp_ring[0])
synchronize_sched();
netif_tx_stop_all_queues(netdev);

/* call carrier off first to avoid false dev_watchdog timeouts */
Expand Down Expand Up @@ -9850,15 +9855,31 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
if (err != IXGBE_XDP_TX)
return -ENOMEM;

return 0;
}

static void ixgbe_xdp_flush(struct net_device *dev)
{
struct ixgbe_adapter *adapter = netdev_priv(dev);
struct ixgbe_ring *ring;

/* Its possible the device went down between xdp xmit and flush so
* we need to ensure device is still up.
*/
if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
return;

ring = adapter->xdp_prog ? adapter->xdp_ring[smp_processor_id()] : NULL;
if (unlikely(!ring))
return;

/* Force memory writes to complete before letting h/w know there
* are new descriptors to fetch.
*/
wmb();

ring = adapter->xdp_ring[smp_processor_id()];
writel(ring->next_to_use, ring->tail);

return 0;
return;
}

static const struct net_device_ops ixgbe_netdev_ops = {
Expand Down Expand Up @@ -9908,6 +9929,7 @@ static const struct net_device_ops ixgbe_netdev_ops = {
.ndo_features_check = ixgbe_features_check,
.ndo_xdp = ixgbe_xdp,
.ndo_xdp_xmit = ixgbe_xdp_xmit,
.ndo_xdp_flush = ixgbe_xdp_flush,
};

/**
Expand Down
2 changes: 2 additions & 0 deletions include/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -381,5 +381,7 @@ u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);

/* Map specifics */
struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
void __dev_map_flush(struct bpf_map *map);

#endif /* _LINUX_BPF_H */
7 changes: 7 additions & 0 deletions include/linux/filter.h
Original file line number Diff line number Diff line change
Expand Up @@ -712,10 +712,17 @@ bool bpf_helper_changes_pkt_data(void *func);
struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
const struct bpf_insn *patch, u32 len);

/* The pair of xdp_do_redirect and xdp_do_flush_map MUST be called in the
* same cpu context. Further for best results no more than a single map
* for the do_redirect/do_flush pair should be used. This limitation is
* because we only track one map and force a flush when the map changes.
* This does not appear to be a real limiation for existing software.
*/
int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb);
int xdp_do_redirect(struct net_device *dev,
struct xdp_buff *xdp,
struct bpf_prog *prog);
void xdp_do_flush_map(void);

void bpf_warn_invalid_xdp_action(u32 act);
void bpf_warn_invalid_xdp_redirect(u32 ifindex);
Expand Down
5 changes: 4 additions & 1 deletion include/linux/netdevice.h
Original file line number Diff line number Diff line change
Expand Up @@ -1142,7 +1142,9 @@ struct xfrmdev_ops {
* int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp);
* This function is used to submit a XDP packet for transmit on a
* netdevice.
*
* void (*ndo_xdp_flush)(struct net_device *dev);
* This function is used to inform the driver to flush a paticular
* xpd tx queue. Must be called on same CPU as xdp_xmit.
*/
struct net_device_ops {
int (*ndo_init)(struct net_device *dev);
Expand Down Expand Up @@ -1329,6 +1331,7 @@ struct net_device_ops {
struct netdev_xdp *xdp);
int (*ndo_xdp_xmit)(struct net_device *dev,
struct xdp_buff *xdp);
void (*ndo_xdp_flush)(struct net_device *dev);
};

/**
Expand Down
84 changes: 83 additions & 1 deletion kernel/bpf/devmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ struct bpf_dtab_netdev {
struct bpf_dtab {
struct bpf_map map;
struct bpf_dtab_netdev **netdev_map;
unsigned long int __percpu *flush_needed;
};

static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
Expand Down Expand Up @@ -87,6 +88,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)

/* make sure page count doesn't overflow */
cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
cost += BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
if (cost >= U32_MAX - PAGE_SIZE)
goto free_dtab;

Expand All @@ -97,6 +99,14 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
if (err)
goto free_dtab;

/* A per cpu bitfield with a bit per possible net device */
dtab->flush_needed = __alloc_percpu(
BITS_TO_LONGS(attr->max_entries) *
sizeof(unsigned long),
__alignof__(unsigned long));
if (!dtab->flush_needed)
goto free_dtab;

dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
sizeof(struct bpf_dtab_netdev *));
if (!dtab->netdev_map)
Expand All @@ -105,14 +115,15 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
return &dtab->map;

free_dtab:
free_percpu(dtab->flush_needed);
kfree(dtab);
return ERR_PTR(err);
}

static void dev_map_free(struct bpf_map *map)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
int i;
int i, cpu;

/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
* so the programs (can be more than one that used this map) were
Expand All @@ -123,6 +134,18 @@ static void dev_map_free(struct bpf_map *map)
*/
synchronize_rcu();

/* To ensure all pending flush operations have completed wait for flush
* bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
* Because the above synchronize_rcu() ensures the map is disconnected
* from the program we can assume no new bits will be set.
*/
for_each_online_cpu(cpu) {
unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);

while (!bitmap_empty(bitmap, dtab->map.max_entries))
cpu_relax();
}

for (i = 0; i < dtab->map.max_entries; i++) {
struct bpf_dtab_netdev *dev;

Expand All @@ -137,6 +160,7 @@ static void dev_map_free(struct bpf_map *map)
/* At this point bpf program is detached and all pending operations
* _must_ be complete
*/
free_percpu(dtab->flush_needed);
bpf_map_area_free(dtab->netdev_map);
kfree(dtab);
}
Expand All @@ -159,6 +183,14 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}

void __dev_map_insert_ctx(struct bpf_map *map, u32 key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);

__set_bit(key, bitmap);
}

struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
Expand All @@ -171,6 +203,39 @@ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
return dev ? dev->dev : NULL;
}

/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
* from the driver before returning from its napi->poll() routine. The poll()
* routine is called either from busy_poll context or net_rx_action signaled
* from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
* net device can be torn down. On devmap tear down we ensure the ctx bitmap
* is zeroed before completing to ensure all flush operations have completed.
*/
void __dev_map_flush(struct bpf_map *map)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
u32 bit;

for_each_set_bit(bit, bitmap, map->max_entries) {
struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
struct net_device *netdev;

/* This is possible if the dev entry is removed by user space
* between xdp redirect and flush op.
*/
if (unlikely(!dev))
continue;

netdev = dev->dev;

__clear_bit(bit, bitmap);
if (unlikely(!netdev || !netdev->netdev_ops->ndo_xdp_flush))
continue;

netdev->netdev_ops->ndo_xdp_flush(netdev);
}
}

/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
* update happens in parallel here a dev_put wont happen until after reading the
* ifindex.
Expand All @@ -188,11 +253,28 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
return dev ? &dev->dev->ifindex : NULL;
}

static void dev_map_flush_old(struct bpf_dtab_netdev *old_dev)
{
if (old_dev->dev->netdev_ops->ndo_xdp_flush) {
struct net_device *fl = old_dev->dev;
unsigned long *bitmap;
int cpu;

for_each_online_cpu(cpu) {
bitmap = per_cpu_ptr(old_dev->dtab->flush_needed, cpu);
__clear_bit(old_dev->key, bitmap);

fl->netdev_ops->ndo_xdp_flush(old_dev->dev);
}
}
}

static void __dev_map_entry_free(struct rcu_head *rcu)
{
struct bpf_dtab_netdev *old_dev;

old_dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
dev_map_flush_old(old_dev);
dev_put(old_dev->dev);
kfree(old_dev);
}
Expand Down
55 changes: 45 additions & 10 deletions net/core/filter.c
Original file line number Diff line number Diff line change
Expand Up @@ -1780,6 +1780,7 @@ struct redirect_info {
u32 ifindex;
u32 flags;
struct bpf_map *map;
struct bpf_map *map_to_flush;
};

static DEFINE_PER_CPU(struct redirect_info, redirect_info);
Expand Down Expand Up @@ -2438,34 +2439,68 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
.arg2_type = ARG_ANYTHING,
};

static int __bpf_tx_xdp(struct net_device *dev, struct xdp_buff *xdp)
static int __bpf_tx_xdp(struct net_device *dev,
struct bpf_map *map,
struct xdp_buff *xdp,
u32 index)
{
if (dev->netdev_ops->ndo_xdp_xmit) {
dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
return 0;
int err;

if (!dev->netdev_ops->ndo_xdp_xmit) {
bpf_warn_invalid_xdp_redirect(dev->ifindex);
return -EOPNOTSUPP;
}
bpf_warn_invalid_xdp_redirect(dev->ifindex);
return -EOPNOTSUPP;

err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
if (err)
return err;

if (map)
__dev_map_insert_ctx(map, index);
else
dev->netdev_ops->ndo_xdp_flush(dev);

return err;
}

void xdp_do_flush_map(void)
{
struct redirect_info *ri = this_cpu_ptr(&redirect_info);
struct bpf_map *map = ri->map_to_flush;

ri->map = NULL;
ri->map_to_flush = NULL;

if (map)
__dev_map_flush(map);
}
EXPORT_SYMBOL_GPL(xdp_do_flush_map);

int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
struct redirect_info *ri = this_cpu_ptr(&redirect_info);
struct bpf_map *map = ri->map;
u32 index = ri->ifindex;
struct net_device *fwd;
int err = -EINVAL;

ri->ifindex = 0;
ri->map = NULL;

fwd = __dev_map_lookup_elem(map, ri->ifindex);
fwd = __dev_map_lookup_elem(map, index);
if (!fwd)
goto out;

trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT);
err = __bpf_tx_xdp(fwd, xdp);
if (ri->map_to_flush && (ri->map_to_flush != map))
xdp_do_flush_map();

err = __bpf_tx_xdp(fwd, map, xdp, index);
if (likely(!err))
ri->map_to_flush = map;

out:
trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT);
return err;
}

Expand All @@ -2488,7 +2523,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,

trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT);

return __bpf_tx_xdp(fwd, xdp);
return __bpf_tx_xdp(fwd, NULL, xdp, 0);
}
EXPORT_SYMBOL_GPL(xdp_do_redirect);

Expand Down

0 comments on commit 11393cc

Please sign in to comment.