From cfc7381b3002756b1dcada32979e942aa3126e31 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Thu, 15 Sep 2016 13:00:29 -0700
Subject: [PATCH 1/4] ip_tunnel: add collect_md mode to IPIP tunnel

Similar to gre, vxlan, geneve tunnels allow IPIP tunnels to
operate in 'collect metadata' mode.
bpf_skb_[gs]et_tunnel_key() helpers can make use of it right away.
ovs can use it as well in the future (once appropriate ovs-vport
abstractions and user apis are added).
Note that just like in other tunnels we cannot cache the dst,
since tunnel_info metadata can be different for every packet.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_tunnels.h       |  2 +
 include/uapi/linux/if_tunnel.h |  1 +
 net/ipv4/ip_tunnel.c           | 76 ++++++++++++++++++++++++++++++++++
 net/ipv4/ipip.c                | 35 +++++++++++++---
 4 files changed, 108 insertions(+), 6 deletions(-)

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index e598c639aa6f0..59557c07904b4 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -255,6 +255,8 @@ void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops);
 
 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 		    const struct iphdr *tnl_params, const u8 protocol);
+void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
+		       const u8 proto);
 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd);
 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict);
 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu);
diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h
index 9865c8caeddef..18d5dc13985dd 100644
--- a/include/uapi/linux/if_tunnel.h
+++ b/include/uapi/linux/if_tunnel.h
@@ -73,6 +73,7 @@ enum {
 	IFLA_IPTUN_ENCAP_FLAGS,
 	IFLA_IPTUN_ENCAP_SPORT,
 	IFLA_IPTUN_ENCAP_DPORT,
+	IFLA_IPTUN_COLLECT_METADATA,
 	__IFLA_IPTUN_MAX,
 };
 #define IFLA_IPTUN_MAX	(__IFLA_IPTUN_MAX - 1)
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 95649ebd28743..5719d6ba0824b 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -55,6 +55,7 @@
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
 #include <net/udp.h>
+#include <net/dst_metadata.h>
 
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
@@ -546,6 +547,81 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
 	return 0;
 }
 
+void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	u32 headroom = sizeof(struct iphdr);
+	struct ip_tunnel_info *tun_info;
+	const struct ip_tunnel_key *key;
+	const struct iphdr *inner_iph;
+	struct rtable *rt;
+	struct flowi4 fl4;
+	__be16 df = 0;
+	u8 tos, ttl;
+
+	tun_info = skb_tunnel_info(skb);
+	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+		     ip_tunnel_info_af(tun_info) != AF_INET))
+		goto tx_error;
+	key = &tun_info->key;
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
+	tos = key->tos;
+	if (tos == 1) {
+		if (skb->protocol == htons(ETH_P_IP))
+			tos = inner_iph->tos;
+		else if (skb->protocol == htons(ETH_P_IPV6))
+			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
+	}
+	init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
+			 RT_TOS(tos), tunnel->parms.link);
+	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
+		goto tx_error;
+	rt = ip_route_output_key(tunnel->net, &fl4);
+	if (IS_ERR(rt)) {
+		dev->stats.tx_carrier_errors++;
+		goto tx_error;
+	}
+	if (rt->dst.dev == dev) {
+		ip_rt_put(rt);
+		dev->stats.collisions++;
+		goto tx_error;
+	}
+	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
+	ttl = key->ttl;
+	if (ttl == 0) {
+		if (skb->protocol == htons(ETH_P_IP))
+			ttl = inner_iph->ttl;
+		else if (skb->protocol == htons(ETH_P_IPV6))
+			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
+		else
+			ttl = ip4_dst_hoplimit(&rt->dst);
+	}
+	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
+		df = htons(IP_DF);
+	else if (skb->protocol == htons(ETH_P_IP))
+		df = inner_iph->frag_off & htons(IP_DF);
+	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
+	if (headroom > dev->needed_headroom)
+		dev->needed_headroom = headroom;
+
+	if (skb_cow_head(skb, dev->needed_headroom)) {
+		ip_rt_put(rt);
+		goto tx_dropped;
+	}
+	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos,
+		      key->ttl, df, !net_eq(tunnel->net, dev_net(dev)));
+	return;
+tx_error:
+	dev->stats.tx_errors++;
+	goto kfree;
+tx_dropped:
+	dev->stats.tx_dropped++;
+kfree:
+	kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
+
 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 		    const struct iphdr *tnl_params, u8 protocol)
 {
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 4ae3f8e6c6cc1..c9392589c4157 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -115,6 +115,7 @@
 #include <net/xfrm.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <net/dst_metadata.h>
 
 static bool log_ecn_error = true;
 module_param(log_ecn_error, bool, 0644);
@@ -193,6 +194,7 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
 {
 	struct net *net = dev_net(skb->dev);
 	struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+	struct metadata_dst *tun_dst = NULL;
 	struct ip_tunnel *tunnel;
 	const struct iphdr *iph;
 
@@ -216,7 +218,12 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
 			tpi = &ipip_tpi;
 		if (iptunnel_pull_header(skb, 0, tpi->proto, false))
 			goto drop;
-		return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
+		if (tunnel->collect_md) {
+			tun_dst = ip_tun_rx_dst(skb, 0, 0, 0);
+			if (!tun_dst)
+				return 0;
+		}
+		return ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
 	}
 
 	return -1;
@@ -270,7 +277,10 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb,
 
 	skb_set_inner_ipproto(skb, ipproto);
 
-	ip_tunnel_xmit(skb, dev, tiph, ipproto);
+	if (tunnel->collect_md)
+		ip_md_tunnel_xmit(skb, dev, ipproto);
+	else
+		ip_tunnel_xmit(skb, dev, tiph, ipproto);
 	return NETDEV_TX_OK;
 
 tx_error:
@@ -380,13 +390,14 @@ static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
 }
 
 static void ipip_netlink_parms(struct nlattr *data[],
-			       struct ip_tunnel_parm *parms)
+			       struct ip_tunnel_parm *parms, bool *collect_md)
 {
 	memset(parms, 0, sizeof(*parms));
 
 	parms->iph.version = 4;
 	parms->iph.protocol = IPPROTO_IPIP;
 	parms->iph.ihl = 5;
+	*collect_md = false;
 
 	if (!data)
 		return;
@@ -414,6 +425,9 @@ static void ipip_netlink_parms(struct nlattr *data[],
 
 	if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
 		parms->iph.frag_off = htons(IP_DF);
+
+	if (data[IFLA_IPTUN_COLLECT_METADATA])
+		*collect_md = true;
 }
 
 /* This function returns true when ENCAP attributes are present in the nl msg */
@@ -453,18 +467,18 @@ static bool ipip_netlink_encap_parms(struct nlattr *data[],
 static int ipip_newlink(struct net *src_net, struct net_device *dev,
 			struct nlattr *tb[], struct nlattr *data[])
 {
+	struct ip_tunnel *t = netdev_priv(dev);
 	struct ip_tunnel_parm p;
 	struct ip_tunnel_encap ipencap;
 
 	if (ipip_netlink_encap_parms(data, &ipencap)) {
-		struct ip_tunnel *t = netdev_priv(dev);
 		int err = ip_tunnel_encap_setup(t, &ipencap);
 
 		if (err < 0)
 			return err;
 	}
 
-	ipip_netlink_parms(data, &p);
+	ipip_netlink_parms(data, &p, &t->collect_md);
 	return ip_tunnel_newlink(dev, tb, &p);
 }
 
@@ -473,6 +487,7 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
 {
 	struct ip_tunnel_parm p;
 	struct ip_tunnel_encap ipencap;
+	bool collect_md;
 
 	if (ipip_netlink_encap_parms(data, &ipencap)) {
 		struct ip_tunnel *t = netdev_priv(dev);
@@ -482,7 +497,9 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
 			return err;
 	}
 
-	ipip_netlink_parms(data, &p);
+	ipip_netlink_parms(data, &p, &collect_md);
+	if (collect_md)
+		return -EINVAL;
 
 	if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) ||
 	    (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
@@ -516,6 +533,8 @@ static size_t ipip_get_size(const struct net_device *dev)
 		nla_total_size(2) +
 		/* IFLA_IPTUN_ENCAP_DPORT */
 		nla_total_size(2) +
+		/* IFLA_IPTUN_COLLECT_METADATA */
+		nla_total_size(0) +
 		0;
 }
 
@@ -544,6 +563,9 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
 			tunnel->encap.flags))
 		goto nla_put_failure;
 
+	if (tunnel->collect_md)
+		if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA))
+			goto nla_put_failure;
 	return 0;
 
 nla_put_failure:
@@ -562,6 +584,7 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
 	[IFLA_IPTUN_ENCAP_FLAGS]	= { .type = NLA_U16 },
 	[IFLA_IPTUN_ENCAP_SPORT]	= { .type = NLA_U16 },
 	[IFLA_IPTUN_ENCAP_DPORT]	= { .type = NLA_U16 },
+	[IFLA_IPTUN_COLLECT_METADATA]	= { .type = NLA_FLAG },
 };
 
 static struct rtnl_link_ops ipip_link_ops __read_mostly = {

From 8d79266bc48c6ab6477d04e159cabf1e7809cb72 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Thu, 15 Sep 2016 13:00:30 -0700
Subject: [PATCH 2/4] ip6_tunnel: add collect_md mode to IPv6 tunnels

Similar to gre, vxlan, geneve tunnels allow IPIP6 and IP6IP6 tunnels
to operate in 'collect metadata' mode.
Unlike ipv4 code here it's possible to reuse ip6_tnl_xmit() function
for both collect_md and traditional tunnels.
bpf_skb_[gs]et_tunnel_key() helpers and ovs (in the future) are the users.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_tunnel.h |   1 +
 net/ipv6/ip6_tunnel.c    | 178 +++++++++++++++++++++++++++++----------
 2 files changed, 134 insertions(+), 45 deletions(-)

diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index 43a5a0e4524cb..20ed9699fcd40 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -23,6 +23,7 @@ struct __ip6_tnl_parm {
 	__u8 proto;		/* tunnel protocol */
 	__u8 encap_limit;	/* encapsulation limit for tunnel */
 	__u8 hop_limit;		/* hop limit for tunnel */
+	bool collect_md;
 	__be32 flowinfo;	/* traffic class and flowlabel for tunnel */
 	__u32 flags;		/* tunnel flags */
 	struct in6_addr laddr;	/* local tunnel end-point address */
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 5c5779720ef19..6a66adba0c229 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -57,6 +57,7 @@
 #include <net/inet_ecn.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <net/dst_metadata.h>
 
 MODULE_AUTHOR("Ville Nuorvala");
 MODULE_DESCRIPTION("IPv6 tunneling device");
@@ -90,6 +91,7 @@ struct ip6_tnl_net {
 	struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE];
 	struct ip6_tnl __rcu *tnls_wc[1];
 	struct ip6_tnl __rcu **tnls[2];
+	struct ip6_tnl __rcu *collect_md_tun;
 };
 
 static struct net_device_stats *ip6_get_stats(struct net_device *dev)
@@ -166,6 +168,10 @@ ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_
 			return t;
 	}
 
+	t = rcu_dereference(ip6n->collect_md_tun);
+	if (t)
+		return t;
+
 	t = rcu_dereference(ip6n->tnls_wc[0]);
 	if (t && (t->dev->flags & IFF_UP))
 		return t;
@@ -209,6 +215,8 @@ ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
 {
 	struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms);
 
+	if (t->parms.collect_md)
+		rcu_assign_pointer(ip6n->collect_md_tun, t);
 	rcu_assign_pointer(t->next , rtnl_dereference(*tp));
 	rcu_assign_pointer(*tp, t);
 }
@@ -224,6 +232,9 @@ ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
 	struct ip6_tnl __rcu **tp;
 	struct ip6_tnl *iter;
 
+	if (t->parms.collect_md)
+		rcu_assign_pointer(ip6n->collect_md_tun, NULL);
+
 	for (tp = ip6_tnl_bucket(ip6n, &t->parms);
 	     (iter = rtnl_dereference(*tp)) != NULL;
 	     tp = &iter->next) {
@@ -829,6 +840,9 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb,
 
 	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
 
+	if (tun_dst)
+		skb_dst_set(skb, (struct dst_entry *)tun_dst);
+
 	gro_cells_receive(&tunnel->gro_cells, skb);
 	return 0;
 
@@ -865,6 +879,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
 {
 	struct ip6_tnl *t;
 	const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+	struct metadata_dst *tun_dst = NULL;
 	int ret = -1;
 
 	rcu_read_lock();
@@ -881,7 +896,12 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
 			goto drop;
 		if (iptunnel_pull_header(skb, 0, tpi->proto, false))
 			goto drop;
-		ret = __ip6_tnl_rcv(t, skb, tpi, NULL, dscp_ecn_decapsulate,
+		if (t->parms.collect_md) {
+			tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0);
+			if (!tun_dst)
+				return 0;
+		}
+		ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate,
 				    log_ecn_error);
 	}
 
@@ -1012,8 +1032,16 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
 	int mtu;
 	unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen;
 	unsigned int max_headroom = psh_hlen;
+	u8 hop_limit;
 	int err = -1;
 
+	if (t->parms.collect_md) {
+		hop_limit = skb_tunnel_info(skb)->key.ttl;
+		goto route_lookup;
+	} else {
+		hop_limit = t->parms.hop_limit;
+	}
+
 	/* NBMA tunnel */
 	if (ipv6_addr_any(&t->parms.raddr)) {
 		struct in6_addr *addr6;
@@ -1043,6 +1071,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
 		goto tx_err_link_failure;
 
 	if (!dst) {
+route_lookup:
 		dst = ip6_route_output(net, NULL, fl6);
 
 		if (dst->error)
@@ -1053,6 +1082,10 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
 			dst = NULL;
 			goto tx_err_link_failure;
 		}
+		if (t->parms.collect_md &&
+		    ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
+				       &fl6->daddr, 0, &fl6->saddr))
+			goto tx_err_link_failure;
 		ndst = dst;
 	}
 
@@ -1071,7 +1104,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
 	}
 	if (mtu < IPV6_MIN_MTU)
 		mtu = IPV6_MIN_MTU;
-	if (skb_dst(skb))
+	if (skb_dst(skb) && !t->parms.collect_md)
 		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
 	if (skb->len > mtu && !skb_is_gso(skb)) {
 		*pmtu = mtu;
@@ -1111,8 +1144,13 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
 		skb = new_skb;
 	}
 
-	if (!fl6->flowi6_mark && ndst)
-		dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr);
+	if (t->parms.collect_md) {
+		if (t->encap.type != TUNNEL_ENCAP_NONE)
+			goto tx_err_dst_release;
+	} else {
+		if (!fl6->flowi6_mark && ndst)
+			dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr);
+	}
 	skb_dst_set(skb, dst);
 
 	if (encap_limit >= 0) {
@@ -1137,7 +1175,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
 	ipv6h = ipv6_hdr(skb);
 	ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield),
 		     ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6));
-	ipv6h->hop_limit = t->parms.hop_limit;
+	ipv6h->hop_limit = hop_limit;
 	ipv6h->nexthdr = proto;
 	ipv6h->saddr = fl6->saddr;
 	ipv6h->daddr = fl6->daddr;
@@ -1170,19 +1208,34 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (tproto != IPPROTO_IPIP && tproto != 0)
 		return -1;
 
-	if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
-		encap_limit = t->parms.encap_limit;
+	dsfield = ipv4_get_dsfield(iph);
 
-	memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
-	fl6.flowi6_proto = IPPROTO_IPIP;
+	if (t->parms.collect_md) {
+		struct ip_tunnel_info *tun_info;
+		const struct ip_tunnel_key *key;
 
-	dsfield = ipv4_get_dsfield(iph);
+		tun_info = skb_tunnel_info(skb);
+		if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+			     ip_tunnel_info_af(tun_info) != AF_INET6))
+			return -1;
+		key = &tun_info->key;
+		memset(&fl6, 0, sizeof(fl6));
+		fl6.flowi6_proto = IPPROTO_IPIP;
+		fl6.daddr = key->u.ipv6.dst;
+		fl6.flowlabel = key->label;
+	} else {
+		if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+			encap_limit = t->parms.encap_limit;
 
-	if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
-		fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT)
-					  & IPV6_TCLASS_MASK;
-	if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
-		fl6.flowi6_mark = skb->mark;
+		memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+		fl6.flowi6_proto = IPPROTO_IPIP;
+
+		if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+			fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT)
+					 & IPV6_TCLASS_MASK;
+		if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
+			fl6.flowi6_mark = skb->mark;
+	}
 
 	if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
 		return -1;
@@ -1220,29 +1273,47 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
 	    ip6_tnl_addr_conflict(t, ipv6h))
 		return -1;
 
-	offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
-	if (offset > 0) {
-		struct ipv6_tlv_tnl_enc_lim *tel;
-		tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset];
-		if (tel->encap_limit == 0) {
-			icmpv6_send(skb, ICMPV6_PARAMPROB,
-				    ICMPV6_HDR_FIELD, offset + 2);
+	dsfield = ipv6_get_dsfield(ipv6h);
+
+	if (t->parms.collect_md) {
+		struct ip_tunnel_info *tun_info;
+		const struct ip_tunnel_key *key;
+
+		tun_info = skb_tunnel_info(skb);
+		if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+			     ip_tunnel_info_af(tun_info) != AF_INET6))
 			return -1;
+		key = &tun_info->key;
+		memset(&fl6, 0, sizeof(fl6));
+		fl6.flowi6_proto = IPPROTO_IPV6;
+		fl6.daddr = key->u.ipv6.dst;
+		fl6.flowlabel = key->label;
+	} else {
+		offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
+		if (offset > 0) {
+			struct ipv6_tlv_tnl_enc_lim *tel;
+
+			tel = (void *)&skb_network_header(skb)[offset];
+			if (tel->encap_limit == 0) {
+				icmpv6_send(skb, ICMPV6_PARAMPROB,
+					    ICMPV6_HDR_FIELD, offset + 2);
+				return -1;
+			}
+			encap_limit = tel->encap_limit - 1;
+		} else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) {
+			encap_limit = t->parms.encap_limit;
 		}
-		encap_limit = tel->encap_limit - 1;
-	} else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
-		encap_limit = t->parms.encap_limit;
 
-	memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
-	fl6.flowi6_proto = IPPROTO_IPV6;
+		memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+		fl6.flowi6_proto = IPPROTO_IPV6;
 
-	dsfield = ipv6_get_dsfield(ipv6h);
-	if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
-		fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK);
-	if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
-		fl6.flowlabel |= ip6_flowlabel(ipv6h);
-	if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
-		fl6.flowi6_mark = skb->mark;
+		if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+			fl6.flowlabel |= (*(__be32 *)ipv6h & IPV6_TCLASS_MASK);
+		if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
+			fl6.flowlabel |= ip6_flowlabel(ipv6h);
+		if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
+			fl6.flowi6_mark = skb->mark;
+	}
 
 	if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
 		return -1;
@@ -1741,6 +1812,10 @@ static int ip6_tnl_dev_init(struct net_device *dev)
 	if (err)
 		return err;
 	ip6_tnl_link_config(t);
+	if (t->parms.collect_md) {
+		dev->features |= NETIF_F_NETNS_LOCAL;
+		netif_keep_dst(dev);
+	}
 	return 0;
 }
 
@@ -1811,6 +1886,9 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[],
 
 	if (data[IFLA_IPTUN_PROTO])
 		parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]);
+
+	if (data[IFLA_IPTUN_COLLECT_METADATA])
+		parms->collect_md = true;
 }
 
 static bool ip6_tnl_netlink_encap_parms(struct nlattr *data[],
@@ -1850,6 +1928,7 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
 			   struct nlattr *tb[], struct nlattr *data[])
 {
 	struct net *net = dev_net(dev);
+	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
 	struct ip6_tnl *nt, *t;
 	struct ip_tunnel_encap ipencap;
 
@@ -1864,9 +1943,14 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
 
 	ip6_tnl_netlink_parms(data, &nt->parms);
 
-	t = ip6_tnl_locate(net, &nt->parms, 0);
-	if (!IS_ERR(t))
-		return -EEXIST;
+	if (nt->parms.collect_md) {
+		if (rtnl_dereference(ip6n->collect_md_tun))
+			return -EEXIST;
+	} else {
+		t = ip6_tnl_locate(net, &nt->parms, 0);
+		if (!IS_ERR(t))
+			return -EEXIST;
+	}
 
 	return ip6_tnl_create2(dev);
 }
@@ -1890,6 +1974,8 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[],
 			return err;
 	}
 	ip6_tnl_netlink_parms(data, &p);
+	if (p.collect_md)
+		return -EINVAL;
 
 	t = ip6_tnl_locate(net, &p, 0);
 	if (!IS_ERR(t)) {
@@ -1937,6 +2023,8 @@ static size_t ip6_tnl_get_size(const struct net_device *dev)
 		nla_total_size(2) +
 		/* IFLA_IPTUN_ENCAP_DPORT */
 		nla_total_size(2) +
+		/* IFLA_IPTUN_COLLECT_METADATA */
+		nla_total_size(0) +
 		0;
 }
 
@@ -1955,16 +2043,15 @@ static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	    nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto))
 		goto nla_put_failure;
 
-	if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE,
-			tunnel->encap.type) ||
-	nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT,
-		     tunnel->encap.sport) ||
-	nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT,
-		     tunnel->encap.dport) ||
-	nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS,
-		    tunnel->encap.flags))
+	if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) ||
+	    nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) ||
+	    nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) ||
+	    nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags))
 		goto nla_put_failure;
 
+	if (parm->collect_md)
+		if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA))
+			goto nla_put_failure;
 	return 0;
 
 nla_put_failure:
@@ -1992,6 +2079,7 @@ static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = {
 	[IFLA_IPTUN_ENCAP_FLAGS]	= { .type = NLA_U16 },
 	[IFLA_IPTUN_ENCAP_SPORT]	= { .type = NLA_U16 },
 	[IFLA_IPTUN_ENCAP_DPORT]	= { .type = NLA_U16 },
+	[IFLA_IPTUN_COLLECT_METADATA]	= { .type = NLA_FLAG },
 };
 
 static struct rtnl_link_ops ip6_link_ops __read_mostly = {

From a1c82704d13fd0d0ab0eb10d33a9bb7af83c90e3 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Thu, 15 Sep 2016 13:00:31 -0700
Subject: [PATCH 3/4] samples/bpf: extend test_tunnel_bpf.sh with IPIP test

extend existing tests for vxlan, geneve, gre to include IPIP tunnel.
It tests both traditional tunnel configuration and
dynamic via bpf helpers.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 samples/bpf/tcbpf2_kern.c      | 58 ++++++++++++++++++++++++++++++++++
 samples/bpf/test_tunnel_bpf.sh | 56 +++++++++++++++++++++++++++-----
 2 files changed, 106 insertions(+), 8 deletions(-)

diff --git a/samples/bpf/tcbpf2_kern.c b/samples/bpf/tcbpf2_kern.c
index 7a15289da6cc4..c1917d968fb4c 100644
--- a/samples/bpf/tcbpf2_kern.c
+++ b/samples/bpf/tcbpf2_kern.c
@@ -1,4 +1,5 @@
 /* Copyright (c) 2016 VMware
+ * Copyright (c) 2016 Facebook
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -188,4 +189,61 @@ int _geneve_get_tunnel(struct __sk_buff *skb)
 	return TC_ACT_OK;
 }
 
+SEC("ipip_set_tunnel")
+int _ipip_set_tunnel(struct __sk_buff *skb)
+{
+	struct bpf_tunnel_key key = {};
+	void *data = (void *)(long)skb->data;
+	struct iphdr *iph = data;
+	struct tcphdr *tcp = data + sizeof(*iph);
+	void *data_end = (void *)(long)skb->data_end;
+	int ret;
+
+	/* single length check */
+	if (data + sizeof(*iph) + sizeof(*tcp) > data_end) {
+		ERROR(1);
+		return TC_ACT_SHOT;
+	}
+
+	key.tunnel_ttl = 64;
+	if (iph->protocol == IPPROTO_ICMP) {
+		key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
+	} else {
+		if (iph->protocol != IPPROTO_TCP || iph->ihl != 5)
+			return TC_ACT_SHOT;
+
+		if (tcp->dest == htons(5200))
+			key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
+		else if (tcp->dest == htons(5201))
+			key.remote_ipv4 = 0xac100165; /* 172.16.1.101 */
+		else
+			return TC_ACT_SHOT;
+	}
+
+	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	return TC_ACT_OK;
+}
+
+SEC("ipip_get_tunnel")
+int _ipip_get_tunnel(struct __sk_buff *skb)
+{
+	int ret;
+	struct bpf_tunnel_key key;
+	char fmt[] = "remote ip 0x%x\n";
+
+	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	bpf_trace_printk(fmt, sizeof(fmt), key.remote_ipv4);
+	return TC_ACT_OK;
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/test_tunnel_bpf.sh b/samples/bpf/test_tunnel_bpf.sh
index 4956589a83aee..1ff634f187b7f 100755
--- a/samples/bpf/test_tunnel_bpf.sh
+++ b/samples/bpf/test_tunnel_bpf.sh
@@ -9,15 +9,13 @@
 # local 172.16.1.200 remote 172.16.1.100
 # veth1 IP: 172.16.1.200, tunnel dev <type>11
 
-set -e
-
 function config_device {
 	ip netns add at_ns0
 	ip link add veth0 type veth peer name veth1
 	ip link set veth0 netns at_ns0
 	ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0
 	ip netns exec at_ns0 ip link set dev veth0 up
-	ip link set dev veth1 up
+	ip link set dev veth1 up mtu 1500
 	ip addr add dev veth1 172.16.1.200/24
 }
 
@@ -67,6 +65,19 @@ function add_geneve_tunnel {
 	ip addr add dev $DEV 10.1.1.200/24
 }
 
+function add_ipip_tunnel {
+	# in namespace
+	ip netns exec at_ns0 \
+		ip link add dev $DEV_NS type $TYPE local 172.16.1.100 remote 172.16.1.200
+	ip netns exec at_ns0 ip link set dev $DEV_NS up
+	ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+
+	# out of namespace
+	ip link add dev $DEV type $TYPE external
+	ip link set dev $DEV up
+	ip addr add dev $DEV 10.1.1.200/24
+}
+
 function attach_bpf {
 	DEV=$1
 	SET_TUNNEL=$2
@@ -85,6 +96,7 @@ function test_gre {
 	attach_bpf $DEV gre_set_tunnel gre_get_tunnel
 	ping -c 1 10.1.1.100
 	ip netns exec at_ns0 ping -c 1 10.1.1.200
+	cleanup
 }
 
 function test_vxlan {
@@ -96,6 +108,7 @@ function test_vxlan {
 	attach_bpf $DEV vxlan_set_tunnel vxlan_get_tunnel
 	ping -c 1 10.1.1.100
 	ip netns exec at_ns0 ping -c 1 10.1.1.200
+	cleanup
 }
 
 function test_geneve {
@@ -107,21 +120,48 @@ function test_geneve {
 	attach_bpf $DEV geneve_set_tunnel geneve_get_tunnel
 	ping -c 1 10.1.1.100
 	ip netns exec at_ns0 ping -c 1 10.1.1.200
+	cleanup
+}
+
+function test_ipip {
+	TYPE=ipip
+	DEV_NS=ipip00
+	DEV=ipip11
+	config_device
+	tcpdump -nei veth1 &
+	cat /sys/kernel/debug/tracing/trace_pipe &
+	add_ipip_tunnel
+	ethtool -K veth1 gso off gro off rx off tx off
+	ip link set dev veth1 mtu 1500
+	attach_bpf $DEV ipip_set_tunnel ipip_get_tunnel
+	ping -c 1 10.1.1.100
+	ip netns exec at_ns0 ping -c 1 10.1.1.200
+	ip netns exec at_ns0 iperf -sD -p 5200 > /dev/null
+	sleep 0.2
+	iperf -c 10.1.1.100 -n 5k -p 5200
+	cleanup
 }
 
 function cleanup {
+	set +ex
+	pkill iperf
 	ip netns delete at_ns0
 	ip link del veth1
-	ip link del $DEV
+	ip link del ipip11
+	ip link del gretap11
+	ip link del geneve11
+	pkill tcpdump
+	pkill cat
+	set -ex
 }
 
+cleanup
 echo "Testing GRE tunnel..."
 test_gre
-cleanup
 echo "Testing VXLAN tunnel..."
 test_vxlan
-cleanup
 echo "Testing GENEVE tunnel..."
 test_geneve
-cleanup
-echo "Success"
+echo "Testing IPIP tunnel..."
+test_ipip
+echo "*** PASS ***"

From 173ca26e9b5136faa82dee37c77cbfb36974d079 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Thu, 15 Sep 2016 13:00:32 -0700
Subject: [PATCH 4/4] samples/bpf: add comprehensive ipip, ipip6, ip6ip6 test

the test creates 3 namespaces with veth connected via bridge.
First two namespaces simulate two different hosts with the same
IPv4 and IPv6 addresses configured on the tunnel interface and they
communicate with outside world via standard tunnels.
Third namespace creates collect_md tunnel that is driven by BPF
program which selects different remote host (either first or
second namespace) based on tcp dest port number while tcp dst
ip is the same.
This scenario is rough approximation of load balancer use case.
The tests check both traditional tunnel configuration and collect_md mode.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 samples/bpf/tcbpf2_kern.c | 132 ++++++++++++++++++++++++++++
 samples/bpf/test_ipip.sh  | 178 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 310 insertions(+)
 create mode 100755 samples/bpf/test_ipip.sh

diff --git a/samples/bpf/tcbpf2_kern.c b/samples/bpf/tcbpf2_kern.c
index c1917d968fb4c..3303bb85593bc 100644
--- a/samples/bpf/tcbpf2_kern.c
+++ b/samples/bpf/tcbpf2_kern.c
@@ -9,12 +9,15 @@
 #include <uapi/linux/if_ether.h>
 #include <uapi/linux/if_packet.h>
 #include <uapi/linux/ip.h>
+#include <uapi/linux/ipv6.h>
 #include <uapi/linux/in.h>
 #include <uapi/linux/tcp.h>
 #include <uapi/linux/filter.h>
 #include <uapi/linux/pkt_cls.h>
+#include <net/ipv6.h>
 #include "bpf_helpers.h"
 
+#define _htonl __builtin_bswap32
 #define ERROR(ret) do {\
 		char fmt[] = "ERROR line:%d ret:%d\n";\
 		bpf_trace_printk(fmt, sizeof(fmt), __LINE__, ret); \
@@ -246,4 +249,133 @@ int _ipip_get_tunnel(struct __sk_buff *skb)
 	return TC_ACT_OK;
 }
 
+SEC("ipip6_set_tunnel")
+int _ipip6_set_tunnel(struct __sk_buff *skb)
+{
+	struct bpf_tunnel_key key = {};
+	void *data = (void *)(long)skb->data;
+	struct iphdr *iph = data;
+	struct tcphdr *tcp = data + sizeof(*iph);
+	void *data_end = (void *)(long)skb->data_end;
+	int ret;
+
+	/* single length check */
+	if (data + sizeof(*iph) + sizeof(*tcp) > data_end) {
+		ERROR(1);
+		return TC_ACT_SHOT;
+	}
+
+	key.remote_ipv6[0] = _htonl(0x2401db00);
+	key.tunnel_ttl = 64;
+
+	if (iph->protocol == IPPROTO_ICMP) {
+		key.remote_ipv6[3] = _htonl(1);
+	} else {
+		if (iph->protocol != IPPROTO_TCP || iph->ihl != 5) {
+			ERROR(iph->protocol);
+			return TC_ACT_SHOT;
+		}
+
+		if (tcp->dest == htons(5200)) {
+			key.remote_ipv6[3] = _htonl(1);
+		} else if (tcp->dest == htons(5201)) {
+			key.remote_ipv6[3] = _htonl(2);
+		} else {
+			ERROR(tcp->dest);
+			return TC_ACT_SHOT;
+		}
+	}
+
+	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	return TC_ACT_OK;
+}
+
+SEC("ipip6_get_tunnel")
+int _ipip6_get_tunnel(struct __sk_buff *skb)
+{
+	int ret;
+	struct bpf_tunnel_key key;
+	char fmt[] = "remote ip6 %x::%x\n";
+
+	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	bpf_trace_printk(fmt, sizeof(fmt), _htonl(key.remote_ipv6[0]),
+			 _htonl(key.remote_ipv6[3]));
+	return TC_ACT_OK;
+}
+
+SEC("ip6ip6_set_tunnel")
+int _ip6ip6_set_tunnel(struct __sk_buff *skb)
+{
+	struct bpf_tunnel_key key = {};
+	void *data = (void *)(long)skb->data;
+	struct ipv6hdr *iph = data;
+	struct tcphdr *tcp = data + sizeof(*iph);
+	void *data_end = (void *)(long)skb->data_end;
+	int ret;
+
+	/* single length check */
+	if (data + sizeof(*iph) + sizeof(*tcp) > data_end) {
+		ERROR(1);
+		return TC_ACT_SHOT;
+	}
+
+	key.remote_ipv6[0] = _htonl(0x2401db00);
+	key.tunnel_ttl = 64;
+
+	if (iph->nexthdr == NEXTHDR_ICMP) {
+		key.remote_ipv6[3] = _htonl(1);
+	} else {
+		if (iph->nexthdr != NEXTHDR_TCP) {
+			ERROR(iph->nexthdr);
+			return TC_ACT_SHOT;
+		}
+
+		if (tcp->dest == htons(5200)) {
+			key.remote_ipv6[3] = _htonl(1);
+		} else if (tcp->dest == htons(5201)) {
+			key.remote_ipv6[3] = _htonl(2);
+		} else {
+			ERROR(tcp->dest);
+			return TC_ACT_SHOT;
+		}
+	}
+
+	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	return TC_ACT_OK;
+}
+
+SEC("ip6ip6_get_tunnel")
+int _ip6ip6_get_tunnel(struct __sk_buff *skb)
+{
+	int ret;
+	struct bpf_tunnel_key key;
+	char fmt[] = "remote ip6 %x::%x\n";
+
+	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	bpf_trace_printk(fmt, sizeof(fmt), _htonl(key.remote_ipv6[0]),
+			 _htonl(key.remote_ipv6[3]));
+	return TC_ACT_OK;
+}
+
+
 char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/test_ipip.sh b/samples/bpf/test_ipip.sh
new file mode 100755
index 0000000000000..196925403ab4f
--- /dev/null
+++ b/samples/bpf/test_ipip.sh
@@ -0,0 +1,178 @@
+#!/bin/bash
+
+function config_device {
+	ip netns add at_ns0
+	ip netns add at_ns1
+	ip netns add at_ns2
+	ip link add veth0 type veth peer name veth0b
+	ip link add veth1 type veth peer name veth1b
+	ip link add veth2 type veth peer name veth2b
+	ip link set veth0b up
+	ip link set veth1b up
+	ip link set veth2b up
+	ip link set dev veth0b mtu 1500
+	ip link set dev veth1b mtu 1500
+	ip link set dev veth2b mtu 1500
+	ip link set veth0 netns at_ns0
+	ip link set veth1 netns at_ns1
+	ip link set veth2 netns at_ns2
+	ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0
+	ip netns exec at_ns0 ip addr add 2401:db00::1/64 dev veth0 nodad
+	ip netns exec at_ns0 ip link set dev veth0 up
+	ip netns exec at_ns1 ip addr add 172.16.1.101/24 dev veth1
+	ip netns exec at_ns1 ip addr add 2401:db00::2/64 dev veth1 nodad
+	ip netns exec at_ns1 ip link set dev veth1 up
+	ip netns exec at_ns2 ip addr add 172.16.1.200/24 dev veth2
+	ip netns exec at_ns2 ip addr add 2401:db00::3/64 dev veth2 nodad
+	ip netns exec at_ns2 ip link set dev veth2 up
+	ip link add br0 type bridge
+	ip link set br0 up
+	ip link set dev br0 mtu 1500
+	ip link set veth0b master br0
+	ip link set veth1b master br0
+	ip link set veth2b master br0
+}
+
+function add_ipip_tunnel {
+	ip netns exec at_ns0 \
+		ip link add dev $DEV_NS type ipip local 172.16.1.100 remote 172.16.1.200
+	ip netns exec at_ns0 ip link set dev $DEV_NS up
+	ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+	ip netns exec at_ns1 \
+		ip link add dev $DEV_NS type ipip local 172.16.1.101 remote 172.16.1.200
+	ip netns exec at_ns1 ip link set dev $DEV_NS up
+	# same inner IP address in at_ns0 and at_ns1
+	ip netns exec at_ns1 ip addr add dev $DEV_NS 10.1.1.100/24
+
+	ip netns exec at_ns2 ip link add dev $DEV type ipip external
+	ip netns exec at_ns2 ip link set dev $DEV up
+	ip netns exec at_ns2 ip addr add dev $DEV 10.1.1.200/24
+}
+
+function add_ipip6_tunnel {
+	ip netns exec at_ns0 \
+		ip link add dev $DEV_NS type ip6tnl mode ipip6 local 2401:db00::1/64 remote 2401:db00::3/64
+	ip netns exec at_ns0 ip link set dev $DEV_NS up
+	ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+	ip netns exec at_ns1 \
+		ip link add dev $DEV_NS type ip6tnl mode ipip6 local 2401:db00::2/64 remote 2401:db00::3/64
+	ip netns exec at_ns1 ip link set dev $DEV_NS up
+	# same inner IP address in at_ns0 and at_ns1
+	ip netns exec at_ns1 ip addr add dev $DEV_NS 10.1.1.100/24
+
+	ip netns exec at_ns2 ip link add dev $DEV type ip6tnl mode ipip6 external
+	ip netns exec at_ns2 ip link set dev $DEV up
+	ip netns exec at_ns2 ip addr add dev $DEV 10.1.1.200/24
+}
+
+function add_ip6ip6_tunnel {
+	ip netns exec at_ns0 \
+		ip link add dev $DEV_NS type ip6tnl mode ip6ip6 local 2401:db00::1/64 remote 2401:db00::3/64
+	ip netns exec at_ns0 ip link set dev $DEV_NS up
+	ip netns exec at_ns0 ip addr add dev $DEV_NS 2601:646::1/64
+	ip netns exec at_ns1 \
+		ip link add dev $DEV_NS type ip6tnl mode ip6ip6 local 2401:db00::2/64 remote 2401:db00::3/64
+	ip netns exec at_ns1 ip link set dev $DEV_NS up
+	# same inner IP address in at_ns0 and at_ns1
+	ip netns exec at_ns1 ip addr add dev $DEV_NS 2601:646::1/64
+
+	ip netns exec at_ns2 ip link add dev $DEV type ip6tnl mode ip6ip6 external
+	ip netns exec at_ns2 ip link set dev $DEV up
+	ip netns exec at_ns2 ip addr add dev $DEV 2601:646::2/64
+}
+
+function attach_bpf {
+	DEV=$1
+	SET_TUNNEL=$2
+	GET_TUNNEL=$3
+	ip netns exec at_ns2 tc qdisc add dev $DEV clsact
+	ip netns exec at_ns2 tc filter add dev $DEV egress bpf da obj tcbpf2_kern.o sec $SET_TUNNEL
+	ip netns exec at_ns2 tc filter add dev $DEV ingress bpf da obj tcbpf2_kern.o sec $GET_TUNNEL
+}
+
+function test_ipip {
+	DEV_NS=ipip_std
+	DEV=ipip_bpf
+	config_device
+#	tcpdump -nei br0 &
+	cat /sys/kernel/debug/tracing/trace_pipe &
+
+	add_ipip_tunnel
+	attach_bpf $DEV ipip_set_tunnel ipip_get_tunnel
+
+	ip netns exec at_ns0 ping -c 1 10.1.1.200
+	ip netns exec at_ns2 ping -c 1 10.1.1.100
+	ip netns exec at_ns0 iperf -sD -p 5200 > /dev/null
+	ip netns exec at_ns1 iperf -sD -p 5201 > /dev/null
+	sleep 0.2
+	# tcp check _same_ IP over different tunnels
+	ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5200
+	ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5201
+	cleanup
+}
+
+# IPv4 over IPv6 tunnel
+function test_ipip6 {
+	DEV_NS=ipip_std
+	DEV=ipip_bpf
+	config_device
+#	tcpdump -nei br0 &
+	cat /sys/kernel/debug/tracing/trace_pipe &
+
+	add_ipip6_tunnel
+	attach_bpf $DEV ipip6_set_tunnel ipip6_get_tunnel
+
+	ip netns exec at_ns0 ping -c 1 10.1.1.200
+	ip netns exec at_ns2 ping -c 1 10.1.1.100
+	ip netns exec at_ns0 iperf -sD -p 5200 > /dev/null
+	ip netns exec at_ns1 iperf -sD -p 5201 > /dev/null
+	sleep 0.2
+	# tcp check _same_ IP over different tunnels
+	ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5200
+	ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5201
+	cleanup
+}
+
+# IPv6 over IPv6 tunnel
+function test_ip6ip6 {
+	DEV_NS=ipip_std
+	DEV=ipip_bpf
+	config_device
+#	tcpdump -nei br0 &
+	cat /sys/kernel/debug/tracing/trace_pipe &
+
+	add_ip6ip6_tunnel
+	attach_bpf $DEV ip6ip6_set_tunnel ip6ip6_get_tunnel
+
+	ip netns exec at_ns0 ping -6 -c 1 2601:646::2
+	ip netns exec at_ns2 ping -6 -c 1 2601:646::1
+	ip netns exec at_ns0 iperf -6sD -p 5200 > /dev/null
+	ip netns exec at_ns1 iperf -6sD -p 5201 > /dev/null
+	sleep 0.2
+	# tcp check _same_ IP over different tunnels
+	ip netns exec at_ns2 iperf -6c 2601:646::1 -n 5k -p 5200
+	ip netns exec at_ns2 iperf -6c 2601:646::1 -n 5k -p 5201
+	cleanup
+}
+
+function cleanup {
+	set +ex
+	pkill iperf
+	ip netns delete at_ns0
+	ip netns delete at_ns1
+	ip netns delete at_ns2
+	ip link del veth0
+	ip link del veth1
+	ip link del veth2
+	ip link del br0
+	pkill tcpdump
+	pkill cat
+	set -ex
+}
+
+cleanup
+echo "Testing IP tunnels..."
+test_ipip
+test_ipip6
+test_ip6ip6
+echo "*** PASS ***"