From efdfa2f7848f64517008136fb41f53c4a1faf93a Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 11 Nov 2015 23:25:40 +0100
Subject: [PATCH 1/5] packet: do skb_probe_transport_header when we actually
 have data

In tpacket_fill_skb() commit c1aad275b029 ("packet: set transport
header before doing xmit") and later on 40893fd0fd4e ("net: switch
to use skb_probe_transport_header()") was probing for a transport
header on the skb from a ring buffer slot, but at a time, where
the skb has _not even_ been filled with data yet. So that call into
the flow dissector is pretty useless. Lets do it after we've set
up the skb frags.

Fixes: c1aad275b029 ("packet: set transport header before doing xmit")
Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index af399cac52054..80c36c0867d3a 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2368,8 +2368,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 	skb_reserve(skb, hlen);
 	skb_reset_network_header(skb);
 
-	if (!packet_use_direct_xmit(po))
-		skb_probe_transport_header(skb, 0);
 	if (unlikely(po->tp_tx_has_off)) {
 		int off_min, off_max, off;
 		off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
@@ -2449,6 +2447,9 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 		len = ((to_write > len_max) ? len_max : to_write);
 	}
 
+	if (!packet_use_direct_xmit(po))
+		skb_probe_transport_header(skb, 0);
+
 	return tp_len;
 }
 

From 8fd6c80d9dd938ca338c70698533a7e304752846 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 11 Nov 2015 23:25:41 +0100
Subject: [PATCH 2/5] packet: always probe for transport header

We concluded that the skb_probe_transport_header() should better be
called unconditionally. Avoiding the call into the flow dissector has
also not really much to do with the direct xmit mode.

While it seems that only virtio_net code makes use of GSO from non
RX/TX ring packet socket paths, we should probe for a transport header
nevertheless before they hit devices.

Reference: http://thread.gmane.org/gmane.linux.network/386173/
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 80c36c0867d3a..bdecf17a15bb4 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2447,8 +2447,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 		len = ((to_write > len_max) ? len_max : to_write);
 	}
 
-	if (!packet_use_direct_xmit(po))
-		skb_probe_transport_header(skb, 0);
+	skb_probe_transport_header(skb, 0);
 
 	return tp_len;
 }
@@ -2808,8 +2807,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 		len += vnet_hdr_len;
 	}
 
-	if (!packet_use_direct_xmit(po))
-		skb_probe_transport_header(skb, reserve);
+	skb_probe_transport_header(skb, reserve);
+
 	if (unlikely(extra_len == 4))
 		skb->no_fcs = 1;
 

From 3c70c132488794e2489ab045559b0ce0afcf17de Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 11 Nov 2015 23:25:42 +0100
Subject: [PATCH 3/5] packet: only allow extra vlan len on ethernet devices

Packet sockets can be used by various net devices and are not
really restricted to ARPHRD_ETHER device types. However, when
currently checking for the extra 4 bytes that can be transmitted
in VLAN case, our assumption is that we generally probe on
ARPHRD_ETHER devices. Therefore, before looking into Ethernet
header, check the device type first.

This also fixes the issue where non-ARPHRD_ETHER devices could
have no dev->hard_header_len in TX_RING SOCK_RAW case, and thus
the check would test unfilled linear part of the skb (instead
of non-linear).

Fixes: 57f89bfa2140 ("network: Allow af_packet to transmit +4 bytes for VLAN packets.")
Fixes: 52f1454f629f ("packet: allow to transmit +4 byte in TX_RING slot for VLAN case")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 60 ++++++++++++++++++------------------------
 1 file changed, 25 insertions(+), 35 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index bdecf17a15bb4..8795b0fb1ed1c 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1741,6 +1741,20 @@ static void fanout_release(struct sock *sk)
 		kfree_rcu(po->rollover, rcu);
 }
 
+static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
+					  struct sk_buff *skb)
+{
+	/* Earlier code assumed this would be a VLAN pkt, double-check
+	 * this now that we have the actual packet in hand. We can only
+	 * do this check on Ethernet devices.
+	 */
+	if (unlikely(dev->type != ARPHRD_ETHER))
+		return false;
+
+	skb_reset_mac_header(skb);
+	return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
+}
+
 static const struct proto_ops packet_ops;
 
 static const struct proto_ops packet_ops_spkt;
@@ -1902,18 +1916,10 @@ static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
 		goto retry;
 	}
 
-	if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
-		/* Earlier code assumed this would be a VLAN pkt,
-		 * double-check this now that we have the actual
-		 * packet in hand.
-		 */
-		struct ethhdr *ehdr;
-		skb_reset_mac_header(skb);
-		ehdr = eth_hdr(skb);
-		if (ehdr->h_proto != htons(ETH_P_8021Q)) {
-			err = -EMSGSIZE;
-			goto out_unlock;
-		}
+	if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
+	    !packet_extra_vlan_len_allowed(dev, skb)) {
+		err = -EMSGSIZE;
+		goto out_unlock;
 	}
 
 	skb->protocol = proto;
@@ -2525,18 +2531,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
 					  addr, hlen);
 		if (likely(tp_len >= 0) &&
-		    tp_len > dev->mtu + dev->hard_header_len) {
-			struct ethhdr *ehdr;
-			/* Earlier code assumed this would be a VLAN pkt,
-			 * double-check this now that we have the actual
-			 * packet in hand.
-			 */
+		    tp_len > dev->mtu + dev->hard_header_len &&
+		    !packet_extra_vlan_len_allowed(dev, skb))
+			tp_len = -EMSGSIZE;
 
-			skb_reset_mac_header(skb);
-			ehdr = eth_hdr(skb);
-			if (ehdr->h_proto != htons(ETH_P_8021Q))
-				tp_len = -EMSGSIZE;
-		}
 		if (unlikely(tp_len < 0)) {
 			if (po->tp_loss) {
 				__packet_set_status(po, ph,
@@ -2765,18 +2763,10 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 
 	sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
 
-	if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
-		/* Earlier code assumed this would be a VLAN pkt,
-		 * double-check this now that we have the actual
-		 * packet in hand.
-		 */
-		struct ethhdr *ehdr;
-		skb_reset_mac_header(skb);
-		ehdr = eth_hdr(skb);
-		if (ehdr->h_proto != htons(ETH_P_8021Q)) {
-			err = -EMSGSIZE;
-			goto out_free;
-		}
+	if (!gso_type && (len > dev->mtu + reserve + extra_len) &&
+	    !packet_extra_vlan_len_allowed(dev, skb)) {
+		err = -EMSGSIZE;
+		goto out_free;
 	}
 
 	skb->protocol = proto;

From c72219b75fde768efccf7666342282fab7f9e4e7 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 11 Nov 2015 23:25:43 +0100
Subject: [PATCH 4/5] packet: infer protocol from ethernet header if unset

In case no struct sockaddr_ll has been passed to packet
socket's sendmsg() when doing a TX_RING flush run, then
skb->protocol is set to po->num instead, which is the protocol
passed via socket(2)/bind(2).

Applications only xmitting can go the path of allocating the
socket as socket(PF_PACKET, <mode>, 0) and do a bind(2) on the
TX_RING with sll_protocol of 0. That way, register_prot_hook()
is neither called on creation nor on bind time, which saves
cycles when there's no interest in capturing anyway.

That leaves us however with po->num 0 instead and therefore
the TX_RING flush run sets skb->protocol to 0 as well. Eric
reported that this leads to problems when using tools like
trafgen over bonding device. I.e. the bonding's hash function
could invoke the kernel's flow dissector, which depends on
skb->protocol being properly set. In the current situation, all
the traffic is then directed to a single slave.

Fix it up by inferring skb->protocol from the Ethernet header
when not set and we have ARPHRD_ETHER device type. This is only
done in case of SOCK_RAW and where we have a dev->hard_header_len
length. In case of ARPHRD_ETHER devices, this is guaranteed to
cover ETH_HLEN, and therefore being accessed on the skb after
the skb_store_bits().

Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 8795b0fb1ed1c..0066da2b8e444 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2338,6 +2338,15 @@ static bool ll_header_truncated(const struct net_device *dev, int len)
 	return false;
 }
 
+static void tpacket_set_protocol(const struct net_device *dev,
+				 struct sk_buff *skb)
+{
+	if (dev->type == ARPHRD_ETHER) {
+		skb_reset_mac_header(skb);
+		skb->protocol = eth_hdr(skb)->h_proto;
+	}
+}
+
 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 		void *frame, struct net_device *dev, int size_max,
 		__be16 proto, unsigned char *addr, int hlen)
@@ -2419,6 +2428,8 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 				dev->hard_header_len);
 		if (unlikely(err))
 			return err;
+		if (!skb->protocol)
+			tpacket_set_protocol(dev, skb);
 
 		data += dev->hard_header_len;
 		to_write -= dev->hard_header_len;

From 5cfb4c8d05b4409c4044cb9c05b19705c1d9818b Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 11 Nov 2015 23:25:44 +0100
Subject: [PATCH 5/5] packet: fix tpacket_snd max frame len

Since it's introduction in commit 69e3c75f4d54 ("net: TX_RING and
packet mmap"), TX_RING could be used from SOCK_DGRAM and SOCK_RAW
side. When used with SOCK_DGRAM only, the size_max > dev->mtu +
reserve check should have reserve as 0, but currently, this is
unconditionally set (in it's original form as dev->hard_header_len).

I think this is not correct since tpacket_fill_skb() would then
take dev->mtu and dev->hard_header_len into account for SOCK_DGRAM,
the extra VLAN_HLEN could be possible in both cases. Presumably, the
reserve code was copied from packet_snd(), but later on missed the
check. Make it similar as we have it in packet_snd().

Fixes: 69e3c75f4d54 ("net: TX_RING and packet mmap")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 0066da2b8e444..242bce1cf0f3a 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2510,12 +2510,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	if (unlikely(!(dev->flags & IFF_UP)))
 		goto out_put;
 
-	reserve = dev->hard_header_len + VLAN_HLEN;
+	if (po->sk.sk_socket->type == SOCK_RAW)
+		reserve = dev->hard_header_len;
 	size_max = po->tx_ring.frame_size
 		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
 
-	if (size_max > dev->mtu + reserve)
-		size_max = dev->mtu + reserve;
+	if (size_max > dev->mtu + reserve + VLAN_HLEN)
+		size_max = dev->mtu + reserve + VLAN_HLEN;
 
 	do {
 		ph = packet_current_frame(po, &po->tx_ring,
@@ -2542,7 +2543,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
 					  addr, hlen);
 		if (likely(tp_len >= 0) &&
-		    tp_len > dev->mtu + dev->hard_header_len &&
+		    tp_len > dev->mtu + reserve &&
 		    !packet_extra_vlan_len_allowed(dev, skb))
 			tp_len = -EMSGSIZE;