From 1f85e6267caca44b30c54711652b0726fadbb131 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 12 Dec 2019 12:55:29 -0800 Subject: [PATCH 1/3] tcp: do not send empty skb from tcp_write_xmit() Backport of commit fdfc5c8594c2 ("tcp: remove empty skb from write queue in error cases") in linux-4.14 stable triggered various bugs. One of them has been fixed in commit ba2ddb43f270 ("tcp: Don't dequeue SYN/FIN-segments from write-queue"), but we still have crashes in some occasions. Root-cause is that when tcp_sendmsg() has allocated a fresh skb and could not append a fragment before being blocked in sk_stream_wait_memory(), tcp_write_xmit() might be called and decide to send this fresh and empty skb. Sending an empty packet is not only silly, it might have caused many issues we had in the past with tp->packets_out being out of sync. Fixes: c65f7f00c587 ("[TCP]: Simplify SKB data portion allocation with NETIF_F_SG.") Signed-off-by: Eric Dumazet Cc: Christoph Paasch Acked-by: Neal Cardwell Cc: Jason Baron Acked-by: Soheil Hassas Yeganeh Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_output.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b184f03d74371..57f434a8e41ff 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2438,6 +2438,14 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (tcp_small_queue_check(sk, skb, 0)) break; + /* Argh, we hit an empty skb(), presumably a thread + * is sleeping in sendmsg()/sk_stream_wait_memory(). + * We do not want to send a pure-ack packet and have + * a strange looking rtx queue with empty packet(s). + */ + if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) + break; + if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; From ee2aabd3fc2eef4c1a0ebdadccc76fbff74b94fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 12 Dec 2019 12:55:30 -0800 Subject: [PATCH 2/3] tcp: refine tcp_write_queue_empty() implementation Due to how tcp_sendmsg() is implemented, we can have an empty skb at the tail of the write queue. Most [1] tcp_write_queue_empty() callers want to know if there is anything to send (payload and/or FIN) Instead of checking if the sk_write_queue is empty, we need to test if tp->write_seq == tp->snd_nxt [1] tcp_send_fin() was the only caller that expected to see if an skb was in the write queue, I have changed the code to reuse the tcp_write_queue_tail() result. Signed-off-by: Eric Dumazet Cc: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 11 ++++++++++- net/ipv4/tcp_output.c | 5 +++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 86b9a87666488..e460ea7f767ba 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1766,9 +1766,18 @@ static inline bool tcp_skb_is_last(const struct sock *sk, return skb_queue_is_last(&sk->sk_write_queue, skb); } +/** + * tcp_write_queue_empty - test if any payload (or FIN) is available in write queue + * @sk: socket + * + * Since the write queue can have a temporary empty skb in it, + * we must not use "return skb_queue_empty(&sk->sk_write_queue)" + */ static inline bool tcp_write_queue_empty(const struct sock *sk) { - return skb_queue_empty(&sk->sk_write_queue); + const struct tcp_sock *tp = tcp_sk(sk); + + return tp->write_seq == tp->snd_nxt; } static inline bool tcp_rtx_queue_empty(const struct sock *sk) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 57f434a8e41ff..36902d08473ec 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3129,7 +3129,7 @@ void sk_forced_mem_schedule(struct sock *sk, int size) */ void tcp_send_fin(struct sock *sk) { - struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk); + struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk); struct tcp_sock *tp = tcp_sk(sk); /* Optimization, tack on the FIN if we have one skb in write queue and @@ -3137,6 +3137,7 @@ void tcp_send_fin(struct sock *sk) * Note: in the latter case, FIN packet will be sent after a timeout, * as TCP stack thinks it has already been transmitted. */ + tskb = tail; if (!tskb && tcp_under_memory_pressure(sk)) tskb = skb_rb_last(&sk->tcp_rtx_queue); @@ -3144,7 +3145,7 @@ void tcp_send_fin(struct sock *sk) TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(tskb)->end_seq++; tp->write_seq++; - if (tcp_write_queue_empty(sk)) { + if (!tail) { /* This means tskb was already sent. * Pretend we included the FIN on previous transmit. * We need to set tp->snd_nxt to the value it would have From 216808c6ba6d00169fd2aa928ec3c0e63bef254f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 12 Dec 2019 12:55:31 -0800 Subject: [PATCH 3/3] tcp: refine rule to allow EPOLLOUT generation under mem pressure At the time commit ce5ec440994b ("tcp: ensure epoll edge trigger wakeup when write queue is empty") was added to the kernel, we still had a single write queue, combining rtx and write queues. Once we moved the rtx queue into a separate rb-tree, testing if sk_write_queue is empty has been suboptimal. Indeed, if we have packets in the rtx queue, we probably want to delay the EPOLLOUT generation at the time incoming packets will free them, making room, but more importantly avoiding flooding application with EPOLLOUT events. Solution is to use tcp_rtx_and_write_queues_empty() helper. Fixes: 75c119afe14f ("tcp: implement rb-tree based retransmit queue") Signed-off-by: Eric Dumazet Cc: Jason Baron Cc: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8a39ee7948919..716938313a325 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1087,8 +1087,7 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, goto out; out_err: /* make sure we wake any epoll edge trigger waiter */ - if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && - err == -EAGAIN)) { + if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) { sk->sk_write_space(sk); tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } @@ -1419,8 +1418,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) sock_zerocopy_put_abort(uarg, true); err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ - if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && - err == -EAGAIN)) { + if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) { sk->sk_write_space(sk); tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); }