Skip to content

Commit

Permalink
Merge branch 'tap-XDP-support'
Browse files Browse the repository at this point in the history
Jason Wang says:

====================
XDP support for tap

This series tries to implement XDP support for tap. Two paths were
implemented:

- fast path: small & non-gso packet. For performance reasons we do it
  at page level and use build_skb() to create the skb if necessary.
- slow path: big or gso packet, we don't want to lose the capability
  compared to generic XDP, so we export some generic xdp helpers and
  do it after skb was created.

xdp1 shows about 41% improvement, xdp_redirect shows about 60%
improvement.

Changes from V1:
- fix the race between xdp set and free
- don't hold extra refcount
- add XDP_REDIRECT support

Please review.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Aug 14, 2017
2 parents d022578 + 761876c commit cc8f1a3
Show file tree
Hide file tree
Showing 3 changed files with 236 additions and 27 deletions.
247 changes: 226 additions & 21 deletions drivers/net/tun.c
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@
#include <linux/seq_file.h>
#include <linux/uio.h>
#include <linux/skb_array.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>

#include <linux/uaccess.h>

Expand Down Expand Up @@ -105,6 +107,9 @@ do { \
} while (0)
#endif

/* Fixed headroom component of TUN_RX_PAD, in bytes. */
#define TUN_HEADROOM 256
/*
 * Offset from the start of a receive buffer to the packet data:
 * tun_build_skb() copies the payload at this offset and reserves the
 * same amount as skb headroom.
 */
#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD + TUN_HEADROOM)

/* TUN device flags */

/* IFF_ATTACH_QUEUE is never stored in device flags,
Expand Down Expand Up @@ -170,6 +175,7 @@ struct tun_file {
struct list_head next;
struct tun_struct *detached;
struct skb_array tx_array;
struct page_frag alloc_frag;
};

struct tun_flow_entry {
Expand Down Expand Up @@ -221,6 +227,7 @@ struct tun_struct {
u32 flow_count;
u32 rx_batched;
struct tun_pcpu_stats __percpu *pcpu_stats;
struct bpf_prog __rcu *xdp_prog;
};

#ifdef CONFIG_TUN_VNET_CROSS_LE
Expand Down Expand Up @@ -571,6 +578,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
}
if (tun)
skb_array_cleanup(&tfile->tx_array);
if (tfile->alloc_frag.page)
put_page(tfile->alloc_frag.page);
sock_put(&tfile->sk);
}
}
Expand All @@ -585,6 +594,7 @@ static void tun_detach(struct tun_file *tfile, bool clean)
static void tun_detach_all(struct net_device *dev)
{
struct tun_struct *tun = netdev_priv(dev);
struct bpf_prog *xdp_prog = rtnl_dereference(tun->xdp_prog);
struct tun_file *tfile, *tmp;
int i, n = tun->numqueues;

Expand Down Expand Up @@ -617,6 +627,9 @@ static void tun_detach_all(struct net_device *dev)
}
BUG_ON(tun->numdisabled != 0);

if (xdp_prog)
bpf_prog_put(xdp_prog);

if (tun->flags & IFF_PERSIST)
module_put(THIS_MODULE);
}
Expand Down Expand Up @@ -1003,6 +1016,46 @@ tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
stats->tx_dropped = tx_dropped;
}

/*
 * Install @prog as the device's XDP program (NULL detaches) and drop the
 * reference held on whatever program was attached before.
 *
 * The previous pointer is read with rtnl_dereference(), so the caller is
 * expected to hold RTNL. @extack is accepted for interface compatibility
 * and is not used. Always returns 0.
 */
static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
		       struct netlink_ext_ack *extack)
{
	struct tun_struct *tun = netdev_priv(dev);
	struct bpf_prog *prev = rtnl_dereference(tun->xdp_prog);

	/* Publish the new program before releasing the old one. */
	rcu_assign_pointer(tun->xdp_prog, prog);

	if (!prev)
		return 0;

	bpf_prog_put(prev);
	return 0;
}

/*
 * Report the id of the XDP program currently attached to @dev, or 0 when
 * no program is attached. Reads the pointer with rtnl_dereference().
 */
static u32 tun_xdp_query(struct net_device *dev)
{
	struct tun_struct *tun = netdev_priv(dev);
	const struct bpf_prog *attached = rtnl_dereference(tun->xdp_prog);

	return attached ? attached->aux->id : 0;
}

/*
 * ndo_xdp entry point: dispatch on xdp->command.
 *
 * XDP_SETUP_PROG attaches/detaches a program via tun_xdp_set();
 * XDP_QUERY_PROG fills in prog_id and prog_attached and returns 0;
 * any other command yields -EINVAL.
 */
static int tun_xdp(struct net_device *dev, struct netdev_xdp *xdp)
{
	if (xdp->command == XDP_SETUP_PROG)
		return tun_xdp_set(dev, xdp->prog, xdp->extack);

	if (xdp->command != XDP_QUERY_PROG)
		return -EINVAL;

	xdp->prog_id = tun_xdp_query(dev);
	xdp->prog_attached = !!xdp->prog_id;
	return 0;
}

static const struct net_device_ops tun_netdev_ops = {
.ndo_uninit = tun_net_uninit,
.ndo_open = tun_net_open,
Expand Down Expand Up @@ -1033,6 +1086,7 @@ static const struct net_device_ops tap_netdev_ops = {
.ndo_features_check = passthru_features_check,
.ndo_set_rx_headroom = tun_set_headroom,
.ndo_get_stats64 = tun_net_get_stats64,
.ndo_xdp = tun_xdp,
};

static void tun_flow_init(struct tun_struct *tun)
Expand Down Expand Up @@ -1190,6 +1244,128 @@ static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile,
}
}

/*
 * Decide whether a write may take the build_skb() path.
 *
 * All of the following must hold: the device is a tap device, the socket
 * send buffer is still INT_MAX, the write is non-blocking, zerocopy is not
 * in use, and the padded packet plus struct skb_shared_info fits in a
 * single page.
 */
static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
			      int len, int noblock, bool zerocopy)
{
	if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP ||
	    tfile->socket.sk->sk_sndbuf != INT_MAX ||
	    !noblock ||
	    zerocopy)
		return false;

	/* Data area and shared info must both fit in one page. */
	return SKB_DATA_ALIGN(len + TUN_RX_PAD) +
	       SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) <= PAGE_SIZE;
}

static struct sk_buff *tun_build_skb(struct tun_struct *tun,
struct tun_file *tfile,
struct iov_iter *from,
struct virtio_net_hdr *hdr,
int len, int *generic_xdp)
{
struct page_frag *alloc_frag = &tfile->alloc_frag;
struct sk_buff *skb;
struct bpf_prog *xdp_prog;
int buflen = SKB_DATA_ALIGN(len + TUN_RX_PAD) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
unsigned int delta = 0;
char *buf;
size_t copied;
bool xdp_xmit = false;
int err;

if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
return ERR_PTR(-ENOMEM);

buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
copied = copy_page_from_iter(alloc_frag->page,
alloc_frag->offset + TUN_RX_PAD,
len, from);
if (copied != len)
return ERR_PTR(-EFAULT);

if (hdr->gso_type)
*generic_xdp = 1;
else
*generic_xdp = 0;

rcu_read_lock();
xdp_prog = rcu_dereference(tun->xdp_prog);
if (xdp_prog && !*generic_xdp) {
struct xdp_buff xdp;
void *orig_data;
u32 act;

xdp.data_hard_start = buf;
xdp.data = buf + TUN_RX_PAD;
xdp.data_end = xdp.data + len;
orig_data = xdp.data;
act = bpf_prog_run_xdp(xdp_prog, &xdp);

switch (act) {
case XDP_REDIRECT:
get_page(alloc_frag->page);
alloc_frag->offset += buflen;
err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
if (err)
goto err_redirect;
return NULL;
case XDP_TX:
xdp_xmit = true;
/* fall through */
case XDP_PASS:
delta = orig_data - xdp.data;
break;
default:
bpf_warn_invalid_xdp_action(act);
/* fall through */
case XDP_ABORTED:
trace_xdp_exception(tun->dev, xdp_prog, act);
/* fall through */
case XDP_DROP:
goto err_xdp;
}
}

skb = build_skb(buf, buflen);
if (!skb) {
rcu_read_unlock();
return ERR_PTR(-ENOMEM);
}

skb_reserve(skb, TUN_RX_PAD - delta);
skb_put(skb, len + delta);
get_page(alloc_frag->page);
alloc_frag->offset += buflen;

if (xdp_xmit) {
skb->dev = tun->dev;
generic_xdp_tx(skb, xdp_prog);
rcu_read_lock();
return NULL;
}

rcu_read_unlock();

return skb;

err_redirect:
put_page(alloc_frag->page);
err_xdp:
rcu_read_unlock();
this_cpu_inc(tun->pcpu_stats->rx_dropped);
return NULL;
}

/* Get packet from user space buffer */
static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
void *msg_control, struct iov_iter *from,
Expand All @@ -1206,6 +1382,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
bool zerocopy = false;
int err;
u32 rxhash;
int generic_xdp = 1;

if (!(tun->dev->flags & IFF_UP))
return -EIO;
Expand Down Expand Up @@ -1263,30 +1440,40 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
zerocopy = true;
}

if (!zerocopy) {
copylen = len;
if (tun16_to_cpu(tun, gso.hdr_len) > good_linear)
linear = good_linear;
else
linear = tun16_to_cpu(tun, gso.hdr_len);
}

skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
if (IS_ERR(skb)) {
if (PTR_ERR(skb) != -EAGAIN)
if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
skb = tun_build_skb(tun, tfile, from, &gso, len, &generic_xdp);
if (IS_ERR(skb)) {
this_cpu_inc(tun->pcpu_stats->rx_dropped);
return PTR_ERR(skb);
}
return PTR_ERR(skb);
}
if (!skb)
return total_len;
} else {
if (!zerocopy) {
copylen = len;
if (tun16_to_cpu(tun, gso.hdr_len) > good_linear)
linear = good_linear;
else
linear = tun16_to_cpu(tun, gso.hdr_len);
}

if (zerocopy)
err = zerocopy_sg_from_iter(skb, from);
else
err = skb_copy_datagram_from_iter(skb, 0, from, len);
skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
if (IS_ERR(skb)) {
if (PTR_ERR(skb) != -EAGAIN)
this_cpu_inc(tun->pcpu_stats->rx_dropped);
return PTR_ERR(skb);
}

if (err) {
this_cpu_inc(tun->pcpu_stats->rx_dropped);
kfree_skb(skb);
return -EFAULT;
if (zerocopy)
err = zerocopy_sg_from_iter(skb, from);
else
err = skb_copy_datagram_from_iter(skb, 0, from, len);

if (err) {
this_cpu_inc(tun->pcpu_stats->rx_dropped);
kfree_skb(skb);
return -EFAULT;
}
}

if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
Expand Down Expand Up @@ -1334,6 +1521,22 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
skb_reset_network_header(skb);
skb_probe_transport_header(skb, 0);

if (generic_xdp) {
struct bpf_prog *xdp_prog;
int ret;

rcu_read_lock();
xdp_prog = rcu_dereference(tun->xdp_prog);
if (xdp_prog) {
ret = do_xdp_generic(xdp_prog, skb);
if (ret != XDP_PASS) {
rcu_read_unlock();
return total_len;
}
}
rcu_read_unlock();
}

rxhash = __skb_get_hash_symmetric(skb);
#ifndef CONFIG_4KSTACKS
tun_rx_batched(tun, tfile, skb, more);
Expand Down Expand Up @@ -2377,6 +2580,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
tfile->sk.sk_write_space = tun_sock_write_space;
tfile->sk.sk_sndbuf = INT_MAX;

tfile->alloc_frag.page = NULL;

file->private_data = tfile;
INIT_LIST_HEAD(&tfile->next);

Expand Down
2 changes: 2 additions & 0 deletions include/linux/netdevice.h
Original file line number Diff line number Diff line change
Expand Up @@ -3243,6 +3243,8 @@ static inline void dev_consume_skb_any(struct sk_buff *skb)
__dev_kfree_skb_any(skb, SKB_REASON_CONSUMED);
}

void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog);
int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
int netif_rx(struct sk_buff *skb);
int netif_rx_ni(struct sk_buff *skb);
int netif_receive_skb(struct sk_buff *skb);
Expand Down
Loading

0 comments on commit cc8f1a3

Please sign in to comment.