net: add support for ipv4 big tcp
Similar to Eric's IPv6 BIG TCP, this patch enables IPv4 BIG TCP.

First, allow sk->sk_gso_max_size to be set to a value greater than
GSO_LEGACY_MAX_SIZE by no longer trimming gso_max_size for IPv4 TCP
sockets (sk_trim_gso_size() is refactored into sk_dst_gso_max_size()
in the diff below).

Then on the TX path, set the IP header tot_len to 0 when skb->len >
IP_MAX_MTU in __ip_local_out() to allow sending BIG TCP packets; this
implies that skb->len is the length of the IPv4 packet. On the RX path,
use skb->len as the length of the IPv4 packet when the IP header
tot_len is 0 and skb->len > IP_MAX_MTU in ip_rcv_core(). As the helpers
iph_set_totlen() and skb_ip_totlen() are used in __ip_local_out() and
ip_rcv_core(), only these APIs need to be updated.
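
For context, iph_totlen(), skb_ip_totlen() and iph_set_totlen() land in
include/linux/ip.h earlier in this series, so they do not appear in this
diff. The sketch below reconstructs their behavior (it may differ from
the merged helpers in detail): a tot_len of 0 is only interpreted as
"use skb->len" for GSO TCP skbs.

static inline u32 iph_totlen(const struct sk_buff *skb, const struct iphdr *iph)
{
	u32 len = ntohs(iph->tot_len);

	/* Trust a zero tot_len as a BIG TCP marker only on GSO TCP skbs;
	 * for everything else, return the on-wire field unchanged.
	 */
	return (len || !skb_is_gso(skb) || !skb_is_gso_tcp(skb)) ?
	       len : skb->len - skb_network_offset(skb);
}

static inline unsigned int skb_ip_totlen(const struct sk_buff *skb)
{
	return iph_totlen(skb, ip_hdr(skb));
}

/* TX and GRO complete: a length above IP_MAX_MTU (0xFFFF) cannot be
 * represented in the 16-bit tot_len field, so store 0 instead.
 */
static inline void iph_set_totlen(struct iphdr *iph, unsigned int len)
{
	iph->tot_len = len <= IP_MAX_MTU ? htons(len) : 0;
}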

Also, in GRO receive, add the check for ETH_P_IP/IPPROTO_TCP and allow
the merged packet size to be >= GRO_LEGACY_MAX_SIZE in
skb_gro_receive(). In GRO complete, set the IP header tot_len to 0 in
iph_set_totlen() when the merged packet size is greater than
IP_MAX_MTU, so that the packet can be processed on the RX path.

Note that the skb_is_gso_tcp() check in iph_totlen() makes it safe for
this implementation to use iph->tot_len == 0 to indicate IPv4 BIG TCP
packets.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Xin Long authored and Jakub Kicinski committed Feb 2, 2023
1 parent 9eefedd commit b1a78b9
Showing 5 changed files with 27 additions and 22 deletions.
12 changes: 7 additions & 5 deletions net/core/gro.c
@@ -162,16 +162,18 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
 	struct sk_buff *lp;
 	int segs;
 
-	/* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
-	gro_max_size = READ_ONCE(p->dev->gro_max_size);
+	/* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */
+	gro_max_size = p->protocol == htons(ETH_P_IPV6) ?
+			READ_ONCE(p->dev->gro_max_size) :
+			READ_ONCE(p->dev->gro_ipv4_max_size);
 
 	if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush))
 		return -E2BIG;
 
 	if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) {
-		if (p->protocol != htons(ETH_P_IPV6) ||
-		    skb_headroom(p) < sizeof(struct hop_jumbo_hdr) ||
-		    ipv6_hdr(p)->nexthdr != IPPROTO_TCP ||
+		if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP ||
+		    (p->protocol == htons(ETH_P_IPV6) &&
+		     skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) ||
 		    p->encapsulation)
 			return -E2BIG;
 	}
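
The gro_ipv4_max_size field read above (and the gso_ipv4_max_size field
used in net/core/sock.c below) come from the parent commit (9eefedd). A
rough sketch of the setters that the "pairs with WRITE_ONCE()" comments
refer to, reconstructed here rather than quoted from that commit:

static inline void netif_set_gro_ipv4_max_size(struct net_device *dev,
					       unsigned int size)
{
	/* This pairs with the READ_ONCE() in skb_gro_receive() */
	WRITE_ONCE(dev->gro_ipv4_max_size, size);
}

static inline void netif_set_gso_ipv4_max_size(struct net_device *dev,
					       unsigned int size)
{
	/* This pairs with the READ_ONCE() on the TX sizing path */
	WRITE_ONCE(dev->gso_ipv4_max_size, size);
}

The WRITE_ONCE()/READ_ONCE() pairing matters because these limits can
be changed over rtnetlink while the GRO and transmit paths are running;
the annotations keep the lockless reads well-defined.
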
26 changes: 14 additions & 12 deletions net/core/sock.c
@@ -2373,17 +2373,22 @@ void sk_free_unlock_clone(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
 
-static void sk_trim_gso_size(struct sock *sk)
+static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
 {
-	if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE)
-		return;
+	bool is_ipv6 = false;
+	u32 max_size;
+
 #if IS_ENABLED(CONFIG_IPV6)
-	if (sk->sk_family == AF_INET6 &&
-	    sk_is_tcp(sk) &&
-	    !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
-		return;
+	is_ipv6 = (sk->sk_family == AF_INET6 &&
+		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
 #endif
-	sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE;
+	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
+	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
+			READ_ONCE(dst->dev->gso_ipv4_max_size);
+	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
+		max_size = GSO_LEGACY_MAX_SIZE;
+
+	return max_size - (MAX_TCP_HEADER + 1);
 }
 
 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
@@ -2403,10 +2408,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
 	} else {
 		sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
-		/* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
-		sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
-		sk_trim_gso_size(sk);
-		sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
+		sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
 		/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
 		max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
 	}
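
As a standalone illustration of the sizing rule in sk_dst_gso_max_size()
above (not kernel code; the 320-byte MAX_TCP_HEADER is a stand-in for
the kernel's config-dependent constant): only TCP sockets may keep a
device limit above the legacy 64KiB, and the result always reserves
room for the largest possible headers.

#include <stdbool.h>
#include <stdint.h>

#define GSO_LEGACY_MAX_SIZE	65536u
#define MAX_TCP_HEADER		320u	/* illustrative stand-in */

/* Mirrors the logic of sk_dst_gso_max_size(): clamp non-TCP sockets
 * back to the legacy limit, then reserve MAX_TCP_HEADER + 1 bytes.
 */
static uint32_t dst_gso_max_size(uint32_t dev_max_size, bool is_tcp)
{
	if (dev_max_size > GSO_LEGACY_MAX_SIZE && !is_tcp)
		dev_max_size = GSO_LEGACY_MAX_SIZE;
	return dev_max_size - (MAX_TCP_HEADER + 1);
}

With these stand-in numbers, a device limit raised to 185000 gives an
IPv4 TCP socket a 184679-byte GSO budget, while a UDP socket stays at
65215.
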
7 changes: 4 additions & 3 deletions net/ipv4/af_inet.c
@@ -1485,6 +1485,7 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
 	if (unlikely(ip_fast_csum((u8 *)iph, 5)))
 		goto out;
 
+	NAPI_GRO_CB(skb)->proto = proto;
 	id = ntohl(*(__be32 *)&iph->id);
 	flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
 	id >>= 16;
@@ -1618,9 +1619,9 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 
 int inet_gro_complete(struct sk_buff *skb, int nhoff)
 {
-	__be16 newlen = htons(skb->len - nhoff);
 	struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
 	const struct net_offload *ops;
+	__be16 totlen = iph->tot_len;
 	int proto = iph->protocol;
 	int err = -ENOSYS;
 
@@ -1629,8 +1630,8 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff)
 		skb_set_inner_network_header(skb, nhoff);
 	}
 
-	csum_replace2(&iph->check, iph->tot_len, newlen);
-	iph->tot_len = newlen;
+	iph_set_totlen(iph, skb->len - nhoff);
+	csum_replace2(&iph->check, totlen, iph->tot_len);
 
 	ops = rcu_dereference(inet_offloads[proto]);
 	if (WARN_ON(!ops || !ops->callbacks.gro_complete))
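
The csum_replace2() call above patches the IPv4 header checksum
incrementally instead of recomputing it over the whole header. Below is
a minimal standalone demonstration of the underlying RFC 1624
arithmetic (host-order values and a hypothetical starting checksum;
this is not the kernel's implementation):

#include <stdint.h>
#include <stdio.h>

/* Incremental checksum update per RFC 1624: HC' = ~(~HC + ~m + m').
 * When one 16-bit header field changes from 'old' to 'new', the
 * checksum can be patched without summing the whole header again.
 */
static uint16_t csum_update16(uint16_t check, uint16_t old, uint16_t new)
{
	uint32_t sum = (uint16_t)~check + (uint16_t)~old + new;

	sum = (sum & 0xffff) + (sum >> 16);	/* fold carries back in */
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	uint16_t check = 0xb1e6;	/* hypothetical original iph->check */

	/* e.g. GRO complete rewriting tot_len from 0x0054 to 0 (BIG TCP) */
	check = csum_update16(check, 0x0054, 0x0000);
	printf("patched check: 0x%04x\n", check);
	return 0;
}
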
2 changes: 1 addition & 1 deletion net/ipv4/ip_input.c
@@ -511,7 +511,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
 	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
 		goto csum_error;
 
-	len = ntohs(iph->tot_len);
+	len = iph_totlen(skb, iph);
 	if (skb->len < len) {
 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
 		__IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
2 changes: 1 addition & 1 deletion net/ipv4/ip_output.c
@@ -100,7 +100,7 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	struct iphdr *iph = ip_hdr(skb);
 
-	iph->tot_len = htons(skb->len);
+	iph_set_totlen(iph, skb->len);
 	ip_send_check(iph);
 
 	/* if egress device is enslaved to an L3 master device pass the
