From e9c6e79760265f019cde39d3f2c443dfbc1395b0 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 17 Aug 2022 12:54:42 -0700 Subject: [PATCH 1/4] tcp: fix sock skb accounting in tcp_read_skb() Before commit 965b57b469a5 ("net: Introduce a new proto_ops ->read_skb()"), skb was not dequeued from receive queue hence when we close TCP socket skb can be just flushed synchronously. After this commit, we have to uncharge skb immediately after being dequeued, otherwise it is still charged in the original sock. And we still need to retain skb->sk, as eBPF programs may extract sock information from skb->sk. Therefore, we have to call skb_set_owner_sk_safe() here. Fixes: 965b57b469a5 ("net: Introduce a new proto_ops ->read_skb()") Reported-and-tested-by: syzbot+a0e6f8738b58f7654417@syzkaller.appspotmail.com Tested-by: Stanislav Fomichev Cc: Eric Dumazet Cc: John Fastabend Cc: Jakub Sitnicki Signed-off-by: Cong Wang Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 970e9a2cca4ae..05da5cac080b5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1760,6 +1760,7 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) int used; __skb_unlink(skb, &sk->sk_receive_queue); + WARN_ON(!skb_set_owner_sk_safe(skb, sk)); used = recv_actor(sk, skb); if (used <= 0) { if (!copied) From c457985aaa92e1fda2ce837cabf90bf687b92dcb Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 17 Aug 2022 12:54:43 -0700 Subject: [PATCH 2/4] tcp: fix tcp_cleanup_rbuf() for tcp_read_skb() tcp_cleanup_rbuf() retrieves the skb from sk_receive_queue, it assumes the skb is not yet dequeued. This is no longer true for tcp_read_skb() case where we dequeue the skb first. Fix this by introducing a helper __tcp_cleanup_rbuf() which does not require any skb and calling it in tcp_read_skb(). Fixes: 04919bed948d ("tcp: Introduce tcp_read_skb()") Cc: Eric Dumazet Cc: John Fastabend Cc: Jakub Sitnicki Signed-off-by: Cong Wang Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 05da5cac080b5..181a0d3501233 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1567,17 +1567,11 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) * calculation of whether or not we must ACK for the sake of * a window update. */ -void tcp_cleanup_rbuf(struct sock *sk, int copied) +static void __tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); bool time_to_ack = false; - struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); - - WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), - "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", - tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); - if (inet_csk_ack_scheduled(sk)) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1623,6 +1617,17 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) tcp_send_ack(sk); } +void tcp_cleanup_rbuf(struct sock *sk, int copied) +{ + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + struct tcp_sock *tp = tcp_sk(sk); + + WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), + "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", + tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); + __tcp_cleanup_rbuf(sk, copied); +} + static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); @@ -1771,20 +1776,19 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) copied += used; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { - consume_skb(skb); ++seq; break; } - consume_skb(skb); break; } + consume_skb(skb); WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. */ if (copied > 0) - tcp_cleanup_rbuf(sk, copied); + __tcp_cleanup_rbuf(sk, copied); return copied; } From a8688821f3854f37fe0198b8945f9cfc051ab2cf Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 17 Aug 2022 12:54:44 -0700 Subject: [PATCH 3/4] tcp: refactor tcp_read_skb() a bit As tcp_read_skb() only reads one skb at a time, the while loop is unnecessary, we can turn it into an if. This also simplifies the code logic. Cc: Eric Dumazet Cc: John Fastabend Cc: Jakub Sitnicki Signed-off-by: Cong Wang Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 181a0d3501233..56a554b49caa6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1761,25 +1761,17 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) if (sk->sk_state == TCP_LISTEN) return -ENOTCONN; - while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { - int used; - - __skb_unlink(skb, &sk->sk_receive_queue); - WARN_ON(!skb_set_owner_sk_safe(skb, sk)); - used = recv_actor(sk, skb); - if (used <= 0) { - if (!copied) - copied = used; - break; - } - seq += used; - copied += used; + skb = tcp_recv_skb(sk, seq, &offset); + if (!skb) + return 0; - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { + __skb_unlink(skb, &sk->sk_receive_queue); + WARN_ON(!skb_set_owner_sk_safe(skb, sk)); + copied = recv_actor(sk, skb); + if (copied > 0) { + seq += copied; + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ++seq; - break; - } - break; } consume_skb(skb); WRITE_ONCE(tp->copied_seq, seq); From 2e23acd99efacfd2a63cb9725afbc65e4e964fb7 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 17 Aug 2022 12:54:45 -0700 Subject: [PATCH 4/4] tcp: handle pure FIN case correctly When skb->len==0, the recv_actor() returns 0 too, but we also use 0 for error conditions. This patch amends this by propagating the errors to tcp_read_skb() so that we can distinguish skb->len==0 case from error cases. Fixes: 04919bed948d ("tcp: Introduce tcp_read_skb()") Reported-by: Eric Dumazet Cc: John Fastabend Cc: Jakub Sitnicki Signed-off-by: Cong Wang Signed-off-by: Jakub Kicinski --- net/core/skmsg.c | 5 +++-- net/ipv4/tcp.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index f47338d89d5d5..59e75ffcc1f40 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1194,8 +1194,9 @@ static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb) ret = bpf_prog_run_pin_on_cpu(prog, skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } - if (sk_psock_verdict_apply(psock, skb, ret) < 0) - len = 0; + ret = sk_psock_verdict_apply(psock, skb, ret); + if (ret < 0) + len = ret; out: rcu_read_unlock(); return len; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 56a554b49caa6..bbe2187536620 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1768,7 +1768,7 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) __skb_unlink(skb, &sk->sk_receive_queue); WARN_ON(!skb_set_owner_sk_safe(skb, sk)); copied = recv_actor(sk, skb); - if (copied > 0) { + if (copied >= 0) { seq += copied; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ++seq;