From 66ccbc9c87c2ae96585f9aa5f733609e6f4acf25 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 11 Aug 2017 19:41:16 +0800 Subject: [PATCH 1/3] tap: use build_skb() for small packet We use tun_alloc_skb() which calls sock_alloc_send_pskb() to allocate skb in the past. This socket based method is not suitable for high speed userspace like virtualization which usually: - ignore sk_sndbuf (INT_MAX) and expect to receive the packet as fast as possible - don't want to be block at sendmsg() To eliminate the above overheads, this patch tries to use build_skb() for small packet. We will do this only when the following conditions are all met: - TAP instead of TUN - sk_sndbuf is INT_MAX - caller don't want to be blocked - zerocopy is not used - packet size is smaller enough to use build_skb() Pktgen from guest to host shows ~11% improvement for rx pps of tap: Before: ~1.70Mpps After : ~1.88Mpps What's more important, this makes it possible to implement XDP for tap before creating skbs. Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 112 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 91 insertions(+), 21 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index d21510d47aa22..9736df40d2bfd 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -105,6 +105,8 @@ do { \ } while (0) #endif +#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) + /* TUN device flags */ /* IFF_ATTACH_QUEUE is never stored in device flags, @@ -170,6 +172,7 @@ struct tun_file { struct list_head next; struct tun_struct *detached; struct skb_array tx_array; + struct page_frag alloc_frag; }; struct tun_flow_entry { @@ -571,6 +574,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean) } if (tun) skb_array_cleanup(&tfile->tx_array); + if (tfile->alloc_frag.page) + put_page(tfile->alloc_frag.page); sock_put(&tfile->sk); } } @@ -1190,6 +1195,61 @@ static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile, } } +static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile, + int len, int noblock, bool zerocopy) +{ + if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) + return false; + + if (tfile->socket.sk->sk_sndbuf != INT_MAX) + return false; + + if (!noblock) + return false; + + if (zerocopy) + return false; + + if (SKB_DATA_ALIGN(len + TUN_RX_PAD) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE) + return false; + + return true; +} + +static struct sk_buff *tun_build_skb(struct tun_file *tfile, + struct iov_iter *from, + int len) +{ + struct page_frag *alloc_frag = &tfile->alloc_frag; + struct sk_buff *skb; + int buflen = SKB_DATA_ALIGN(len + TUN_RX_PAD) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + char *buf; + size_t copied; + + if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL))) + return ERR_PTR(-ENOMEM); + + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; + copied = copy_page_from_iter(alloc_frag->page, + alloc_frag->offset + TUN_RX_PAD, + len, from); + if (copied != len) + return ERR_PTR(-EFAULT); + + skb = build_skb(buf, buflen); + if (!skb) + return ERR_PTR(-ENOMEM); + + skb_reserve(skb, TUN_RX_PAD); + skb_put(skb, len); + get_page(alloc_frag->page); + alloc_frag->offset += buflen; + + return skb; +} + /* Get packet from user space buffer */ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, void *msg_control, struct iov_iter *from, @@ -1263,30 +1323,38 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, zerocopy = true; } - if (!zerocopy) { - copylen = len; - if (tun16_to_cpu(tun, gso.hdr_len) > good_linear) - linear = good_linear; - else - linear = tun16_to_cpu(tun, gso.hdr_len); - } - - skb = tun_alloc_skb(tfile, align, copylen, linear, noblock); - if (IS_ERR(skb)) { - if (PTR_ERR(skb) != -EAGAIN) + if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) { + skb = tun_build_skb(tfile, from, len); + if (IS_ERR(skb)) { this_cpu_inc(tun->pcpu_stats->rx_dropped); - return PTR_ERR(skb); - } + return PTR_ERR(skb); + } + } else { + if (!zerocopy) { + copylen = len; + if (tun16_to_cpu(tun, gso.hdr_len) > good_linear) + linear = good_linear; + else + linear = tun16_to_cpu(tun, gso.hdr_len); + } - if (zerocopy) - err = zerocopy_sg_from_iter(skb, from); - else - err = skb_copy_datagram_from_iter(skb, 0, from, len); + skb = tun_alloc_skb(tfile, align, copylen, linear, noblock); + if (IS_ERR(skb)) { + if (PTR_ERR(skb) != -EAGAIN) + this_cpu_inc(tun->pcpu_stats->rx_dropped); + return PTR_ERR(skb); + } - if (err) { - this_cpu_inc(tun->pcpu_stats->rx_dropped); - kfree_skb(skb); - return -EFAULT; + if (zerocopy) + err = zerocopy_sg_from_iter(skb, from); + else + err = skb_copy_datagram_from_iter(skb, 0, from, len); + + if (err) { + this_cpu_inc(tun->pcpu_stats->rx_dropped); + kfree_skb(skb); + return -EFAULT; + } } if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) { @@ -2377,6 +2445,8 @@ static int tun_chr_open(struct inode *inode, struct file * file) tfile->sk.sk_write_space = tun_sock_write_space; tfile->sk.sk_sndbuf = INT_MAX; + tfile->alloc_frag.page = NULL; + file->private_data = tfile; INIT_LIST_HEAD(&tfile->next); From 7c4974786f4794178f04e96318fc3b2f2850cbc6 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 11 Aug 2017 19:41:17 +0800 Subject: [PATCH 2/3] net: export some generic xdp helpers This patch tries to export some generic xdp helpers to drivers. This can let driver to do XDP for a specific skb. This is useful for the case when the packet is hard to be processed at page level directly (e.g jumbo/GSO frame). With this patch, there's no need for driver to forbid the XDP set when configuration is not suitable. Instead, it can defer the XDP for packets that is hard to be processed directly after skb is created. Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 14 ++++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1d238d54c4840..0f1c4cb2441ee 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3243,6 +3243,8 @@ static inline void dev_consume_skb_any(struct sk_buff *skb) __dev_kfree_skb_any(skb, SKB_REASON_CONSUMED); } +void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog); +int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb); int netif_rx(struct sk_buff *skb); int netif_rx_ni(struct sk_buff *skb); int netif_receive_skb(struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 1024d3741d128..40b28e417072e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3919,7 +3919,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* When doing generic XDP we have to bypass the qdisc layer and the * network taps in order to match in-driver-XDP behavior. */ -static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) +void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) { struct net_device *dev = skb->dev; struct netdev_queue *txq; @@ -3940,13 +3940,12 @@ static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) kfree_skb(skb); } } +EXPORT_SYMBOL_GPL(generic_xdp_tx); static struct static_key generic_xdp_needed __read_mostly; -static int do_xdp_generic(struct sk_buff *skb) +int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) { - struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog); - if (xdp_prog) { u32 act = netif_receive_generic_xdp(skb, xdp_prog); int err; @@ -3971,6 +3970,7 @@ static int do_xdp_generic(struct sk_buff *skb) kfree_skb(skb); return XDP_DROP; } +EXPORT_SYMBOL_GPL(do_xdp_generic); static int netif_rx_internal(struct sk_buff *skb) { @@ -3981,7 +3981,8 @@ static int netif_rx_internal(struct sk_buff *skb) trace_netif_rx(skb); if (static_key_false(&generic_xdp_needed)) { - int ret = do_xdp_generic(skb); + int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), + skb); /* Consider XDP consuming the packet a success from * the netdev point of view we do not want to count @@ -4502,7 +4503,8 @@ static int netif_receive_skb_internal(struct sk_buff *skb) rcu_read_lock(); if (static_key_false(&generic_xdp_needed)) { - int ret = do_xdp_generic(skb); + int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), + skb); if (ret != XDP_PASS) { rcu_read_unlock(); From 761876c857cb2ef8489fbee01907151da902af91 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 11 Aug 2017 19:41:18 +0800 Subject: [PATCH 3/3] tap: XDP support This patch tries to implement XDP for tun. The implementation was split into two parts: - fast path: small and no gso packet. We try to do XDP at page level before build_skb(). For XDP_TX, since creating/destroying queues were completely under control of userspace, it was implemented through generic XDP helper after skb has been built. This could be optimized in the future. - slow path: big or gso packet. We try to do it after skb was created through generic XDP helpers. Test were done through pktgen with small packets. xdp1 test shows ~41.1% improvement: Before: ~1.7Mpps After: ~2.3Mpps xdp_redirect to ixgbe shows ~60% improvement: Before: ~0.8Mpps After: ~1.38Mpps Suggested-by: Michael S. Tsirkin Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 149 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 142 insertions(+), 7 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 9736df40d2bfd..5892284eb8d05 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -73,6 +73,8 @@ #include #include #include +#include +#include #include @@ -105,7 +107,8 @@ do { \ } while (0) #endif -#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) +#define TUN_HEADROOM 256 +#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD + TUN_HEADROOM) /* TUN device flags */ @@ -224,6 +227,7 @@ struct tun_struct { u32 flow_count; u32 rx_batched; struct tun_pcpu_stats __percpu *pcpu_stats; + struct bpf_prog __rcu *xdp_prog; }; #ifdef CONFIG_TUN_VNET_CROSS_LE @@ -590,6 +594,7 @@ static void tun_detach(struct tun_file *tfile, bool clean) static void tun_detach_all(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); + struct bpf_prog *xdp_prog = rtnl_dereference(tun->xdp_prog); struct tun_file *tfile, *tmp; int i, n = tun->numqueues; @@ -622,6 +627,9 @@ static void tun_detach_all(struct net_device *dev) } BUG_ON(tun->numdisabled != 0); + if (xdp_prog) + bpf_prog_put(xdp_prog); + if (tun->flags & IFF_PERSIST) module_put(THIS_MODULE); } @@ -1008,6 +1016,46 @@ tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) stats->tx_dropped = tx_dropped; } +static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog, + struct netlink_ext_ack *extack) +{ + struct tun_struct *tun = netdev_priv(dev); + struct bpf_prog *old_prog; + + old_prog = rtnl_dereference(tun->xdp_prog); + rcu_assign_pointer(tun->xdp_prog, prog); + if (old_prog) + bpf_prog_put(old_prog); + + return 0; +} + +static u32 tun_xdp_query(struct net_device *dev) +{ + struct tun_struct *tun = netdev_priv(dev); + const struct bpf_prog *xdp_prog; + + xdp_prog = rtnl_dereference(tun->xdp_prog); + if (xdp_prog) + return xdp_prog->aux->id; + + return 0; +} + +static int tun_xdp(struct net_device *dev, struct netdev_xdp *xdp) +{ + switch (xdp->command) { + case XDP_SETUP_PROG: + return tun_xdp_set(dev, xdp->prog, xdp->extack); + case XDP_QUERY_PROG: + xdp->prog_id = tun_xdp_query(dev); + xdp->prog_attached = !!xdp->prog_id; + return 0; + default: + return -EINVAL; + } +} + static const struct net_device_ops tun_netdev_ops = { .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, @@ -1038,6 +1086,7 @@ static const struct net_device_ops tap_netdev_ops = { .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom = tun_set_headroom, .ndo_get_stats64 = tun_net_get_stats64, + .ndo_xdp = tun_xdp, }; static void tun_flow_init(struct tun_struct *tun) @@ -1217,16 +1266,22 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile, return true; } -static struct sk_buff *tun_build_skb(struct tun_file *tfile, +static struct sk_buff *tun_build_skb(struct tun_struct *tun, + struct tun_file *tfile, struct iov_iter *from, - int len) + struct virtio_net_hdr *hdr, + int len, int *generic_xdp) { struct page_frag *alloc_frag = &tfile->alloc_frag; struct sk_buff *skb; + struct bpf_prog *xdp_prog; int buflen = SKB_DATA_ALIGN(len + TUN_RX_PAD) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + unsigned int delta = 0; char *buf; size_t copied; + bool xdp_xmit = false; + int err; if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL))) return ERR_PTR(-ENOMEM); @@ -1238,16 +1293,77 @@ static struct sk_buff *tun_build_skb(struct tun_file *tfile, if (copied != len) return ERR_PTR(-EFAULT); + if (hdr->gso_type) + *generic_xdp = 1; + else + *generic_xdp = 0; + + rcu_read_lock(); + xdp_prog = rcu_dereference(tun->xdp_prog); + if (xdp_prog && !*generic_xdp) { + struct xdp_buff xdp; + void *orig_data; + u32 act; + + xdp.data_hard_start = buf; + xdp.data = buf + TUN_RX_PAD; + xdp.data_end = xdp.data + len; + orig_data = xdp.data; + act = bpf_prog_run_xdp(xdp_prog, &xdp); + + switch (act) { + case XDP_REDIRECT: + get_page(alloc_frag->page); + alloc_frag->offset += buflen; + err = xdp_do_redirect(tun->dev, &xdp, xdp_prog); + if (err) + goto err_redirect; + return NULL; + case XDP_TX: + xdp_xmit = true; + /* fall through */ + case XDP_PASS: + delta = orig_data - xdp.data; + break; + default: + bpf_warn_invalid_xdp_action(act); + /* fall through */ + case XDP_ABORTED: + trace_xdp_exception(tun->dev, xdp_prog, act); + /* fall through */ + case XDP_DROP: + goto err_xdp; + } + } + skb = build_skb(buf, buflen); - if (!skb) + if (!skb) { + rcu_read_unlock(); return ERR_PTR(-ENOMEM); + } - skb_reserve(skb, TUN_RX_PAD); - skb_put(skb, len); + skb_reserve(skb, TUN_RX_PAD - delta); + skb_put(skb, len + delta); get_page(alloc_frag->page); alloc_frag->offset += buflen; + if (xdp_xmit) { + skb->dev = tun->dev; + generic_xdp_tx(skb, xdp_prog); + rcu_read_lock(); + return NULL; + } + + rcu_read_unlock(); + return skb; + +err_redirect: + put_page(alloc_frag->page); +err_xdp: + rcu_read_unlock(); + this_cpu_inc(tun->pcpu_stats->rx_dropped); + return NULL; } /* Get packet from user space buffer */ @@ -1266,6 +1382,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, bool zerocopy = false; int err; u32 rxhash; + int generic_xdp = 1; if (!(tun->dev->flags & IFF_UP)) return -EIO; @@ -1324,11 +1441,13 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, } if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) { - skb = tun_build_skb(tfile, from, len); + skb = tun_build_skb(tun, tfile, from, &gso, len, &generic_xdp); if (IS_ERR(skb)) { this_cpu_inc(tun->pcpu_stats->rx_dropped); return PTR_ERR(skb); } + if (!skb) + return total_len; } else { if (!zerocopy) { copylen = len; @@ -1402,6 +1521,22 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, skb_reset_network_header(skb); skb_probe_transport_header(skb, 0); + if (generic_xdp) { + struct bpf_prog *xdp_prog; + int ret; + + rcu_read_lock(); + xdp_prog = rcu_dereference(tun->xdp_prog); + if (xdp_prog) { + ret = do_xdp_generic(xdp_prog, skb); + if (ret != XDP_PASS) { + rcu_read_unlock(); + return total_len; + } + } + rcu_read_unlock(); + } + rxhash = __skb_get_hash_symmetric(skb); #ifndef CONFIG_4KSTACKS tun_rx_batched(tun, tfile, skb, more);