tcp: implement coalescing on backlog queue
In case GRO is not as efficient as it should be, or is disabled,
we might have a user thread trapped in __release_sock() while
the softirq handler floods packets up to the point where we have to drop them.

This patch balances the work done by the user thread and softirq,
to give __release_sock() more chances to complete its work
before new packets are added to the backlog.

This also helps if we receive many ACK packets, since GRO
does not aggregate them.

This patch brings a ~60% throughput increase on a receiver
without GRO, but the truly spectacular gain is the 1000x
release_sock() latency reduction I have measured.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Eric Dumazet authored and David S. Miller committed Nov 30, 2018
1 parent 85bdf7d commit 4f693b5
Showing 3 changed files with 88 additions and 6 deletions.
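
As a rough, user-space illustration of the idea described in the commit message above (merge consecutive in-sequence segments that pile up on the socket backlog, so that __release_sock() has fewer, larger units to process), here is a toy model. It is not kernel code: the segment struct, the queue and the merge rule are simplified stand-ins for sk_buff, sk->sk_backlog and skb_try_coalesce().

/* Toy model of backlog coalescing: consecutive in-sequence segments are
 * merged into the tail entry instead of growing the queue. Not kernel code.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

struct segment {
	uint32_t seq;		/* first sequence number covered */
	uint32_t end_seq;	/* one past the last sequence number */
	uint32_t segs;		/* how many wire segments this entry represents */
	struct segment *next;
};

struct backlog {
	struct segment *head, *tail;
	unsigned int entries;	/* units the socket owner would have to process */
};

static void backlog_add(struct backlog *b, uint32_t seq, uint32_t end_seq, int coalesce)
{
	struct segment *tail = b->tail;

	/* Merge into the tail if the new segment starts exactly where it ends. */
	if (coalesce && tail && tail->end_seq == seq) {
		tail->end_seq = end_seq;
		tail->segs++;
		return;
	}

	struct segment *s = malloc(sizeof(*s));
	if (!s) {
		perror("malloc");
		exit(1);
	}
	s->seq = seq;
	s->end_seq = end_seq;
	s->segs = 1;
	s->next = NULL;
	if (tail)
		tail->next = s;
	else
		b->head = s;
	b->tail = s;
	b->entries++;
}

int main(void)
{
	struct backlog plain = { 0 }, merged = { 0 };
	uint32_t seq = 1000;
	int i;

	/* 1000 back-to-back 1448-byte data segments arriving while the socket is owned. */
	for (i = 0; i < 1000; i++) {
		backlog_add(&plain, seq, seq + 1448, 0);
		backlog_add(&merged, seq, seq + 1448, 1);
		seq += 1448;
	}

	printf("without coalescing: %u backlog entries\n", plain.entries);
	printf("with coalescing:    %u entries, tail covers %u segments\n",
	       merged.entries, merged.tail ? merged.tail->segs : 0);
	return 0;
}

Run against this burst, the plain queue hands the user thread 1000 entries while the coalescing queue hands it a single entry covering all 1000 segments, which is the kind of reduction the commit message is after.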
1 change: 1 addition & 0 deletions include/uapi/linux/snmp.h
@@ -243,6 +243,7 @@ enum
	LINUX_MIB_TCPREQQFULLDROP, /* TCPReqQFullDrop */
	LINUX_MIB_TCPRETRANSFAIL, /* TCPRetransFail */
	LINUX_MIB_TCPRCVCOALESCE, /* TCPRcvCoalesce */
	LINUX_MIB_TCPBACKLOGCOALESCE, /* TCPBacklogCoalesce */
	LINUX_MIB_TCPOFOQUEUE, /* TCPOFOQueue */
	LINUX_MIB_TCPOFODROP, /* TCPOFODrop */
	LINUX_MIB_TCPOFOMERGE, /* TCPOFOMerge */
1 change: 1 addition & 0 deletions net/ipv4/proc.c
@@ -219,6 +219,7 @@ static const struct snmp_mib snmp4_net_list[] = {
	SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
	SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
	SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
	SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE),
	SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
	SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
	SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
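
The two hunks above only declare and export the new counter; it shows up as TCPBacklogCoalesce on the TcpExt line of /proc/net/netstat and can be read with nstat or straight from that file. As an illustration (not part of the patch), the small user-space program below looks the counter up by name, assuming the usual netstat layout of a header line of names followed by a line of values for each group.

/* Minimal reader for one TcpExt counter from /proc/net/netstat.
 * The file contains pairs of lines per group: a header line of counter
 * names followed by a line of values in the same order. Illustration
 * only; not part of the kernel patch above.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *want = "TCPBacklogCoalesce";
	char names[8192], values[8192];
	FILE *f = fopen("/proc/net/netstat", "r");

	if (!f) {
		perror("/proc/net/netstat");
		return 1;
	}

	while (fgets(names, sizeof(names), f) &&
	       fgets(values, sizeof(values), f)) {
		if (strncmp(names, "TcpExt:", 7))
			continue;

		/* Walk the name and value lines in lock step. */
		char *np, *vp;
		char *n = strtok_r(names, " \n", &np);
		char *v = strtok_r(values, " \n", &vp);
		while (n && v) {
			if (!strcmp(n, want)) {
				printf("%s = %s\n", want, v);
				fclose(f);
				return 0;
			}
			n = strtok_r(NULL, " \n", &np);
			v = strtok_r(NULL, " \n", &vp);
		}
	}

	fclose(f);
	fprintf(stderr, "%s not found (kernel without this patch?)\n", want);
	return 1;
}

The counter increments once per successful merge, from the __NET_INC_STATS() call in the tcp_ipv4.c hunk below.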
92 changes: 86 additions & 6 deletions net/ipv4/tcp_ipv4.c
@@ -1619,12 +1619,14 @@ int tcp_v4_early_demux(struct sk_buff *skb)
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64*1024;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	int delta;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
@@ -1636,6 +1638,84 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)

	skb_dst_drop(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;
	shinfo = skb_shinfo(skb);

	if (!shinfo->gso_size)
		shinfo->gso_size = skb->len - hdrlen;

	if (!shinfo->gso_segs)
		shinfo->gso_segs = 1;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
	    tail->decrypted != skb->decrypted ||
#endif
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
		goto no_coalesce;

	__skb_pull(skb, hdrlen);
	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		thtail->window = th->window;

		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;

		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
						 skb_shinfo(tail)->gso_size);

		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	__skb_push(skb, hdrlen);

no_coalesce:
	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64*1024;

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
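
To restate the coalescing test in the hunk above outside of kernel context: the new segment may be merged into the backlog tail only if it starts exactly where the tail ends, carries the same DS field, neither segment has URG set, the ECE/CWR bits agree, and the TCP header length and option bytes are identical (plus, with CONFIG_TLS_DEVICE, the same decryption state). The sketch below is a user-space restatement with made-up structures, not the kernel code; it only mirrors those conditions.

/* User-space restatement of the backlog coalescing eligibility test.
 * The struct stands in for the bits of sk_buff/TCP_SKB_CB() and the
 * TCP header that the kernel check looks at; names are illustrative.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define FLAG_URG 0x20
#define FLAG_ECE 0x40
#define FLAG_CWR 0x80

struct seg {
	uint32_t seq;		/* first byte of this segment */
	uint32_t end_seq;	/* one past the last byte */
	uint8_t  dsfield;	/* IP DS/TOS byte */
	uint8_t  flags;		/* TCP flag bits */
	uint8_t  doff;		/* TCP header length in 32-bit words */
	uint8_t  opts[40];	/* raw option bytes after the base header */
};

/* Mirror of the "goto no_coalesce" conditions, inverted into "can merge". */
static bool can_coalesce(const struct seg *tail, const struct seg *skb)
{
	size_t optlen = (size_t)skb->doff * 4 - 20;	/* bytes beyond the 20-byte base header */

	if (tail->end_seq != skb->seq)			/* must be contiguous */
		return false;
	if (tail->dsfield != skb->dsfield)		/* same TOS/ECN codepoint byte */
		return false;
	if ((tail->flags | skb->flags) & FLAG_URG)	/* never merge urgent data */
		return false;
	if ((tail->flags ^ skb->flags) & (FLAG_ECE | FLAG_CWR))	/* ECN signals must agree */
		return false;
	if (tail->doff != skb->doff)			/* same header length ... */
		return false;
	return memcmp(tail->opts, skb->opts, optlen) == 0;	/* ... and identical options */
}

int main(void)
{
	struct seg tail = { .seq = 1000, .end_seq = 2448, .doff = 8 };
	struct seg next = { .seq = 2448, .end_seq = 3896, .doff = 8 };

	printf("contiguous, same options: %s\n",
	       can_coalesce(&tail, &next) ? "coalesce" : "keep separate");

	next.flags = FLAG_CWR;	/* ECN state now differs from the tail */
	printf("CWR set on one side:      %s\n",
	       can_coalesce(&tail, &next) ? "coalesce" : "keep separate");
	return 0;
}

When the test passes, skb_try_coalesce() does the actual merge and the updates in the hunk keep end_seq, ack_seq, flags, timestamps and the gso accounting consistent, with gso_segs capped at 0xFFFF because it is a 16-bit field.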
