From d44fd4a767b3755899f8ad1df3e8eca3961ba708 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 4 Aug 2023 14:46:11 +0000
Subject: [PATCH 1/6] tcp: set TCP_SYNCNT locklessly

icsk->icsk_syn_retries can safely be set without locking the socket.

We have to add READ_ONCE() annotations in tcp_fastopen_synack_timer()
and tcp_write_timeout().

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp.c       | 15 ++++++---------
 net/ipv4/tcp_timer.c |  9 ++++++---
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index aca5620cf3ba2..bcbb33a8c152a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3291,9 +3291,7 @@ int tcp_sock_set_syncnt(struct sock *sk, int val)
 	if (val < 1 || val > MAX_TCP_SYNCNT)
 		return -EINVAL;
 
-	lock_sock(sk);
 	WRITE_ONCE(inet_csk(sk)->icsk_syn_retries, val);
-	release_sock(sk);
 	return 0;
 }
 EXPORT_SYMBOL(tcp_sock_set_syncnt);
@@ -3462,6 +3460,12 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
 	if (copy_from_sockptr(&val, optval, sizeof(val)))
 		return -EFAULT;
 
+	/* Handle options that can be set without locking the socket. */
+	switch (optname) {
+	case TCP_SYNCNT:
+		return tcp_sock_set_syncnt(sk, val);
+	}
+
 	sockopt_lock_sock(sk);
 
 	switch (optname) {
@@ -3569,13 +3573,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
 		else
 			WRITE_ONCE(tp->keepalive_probes, val);
 		break;
-	case TCP_SYNCNT:
-		if (val < 1 || val > MAX_TCP_SYNCNT)
-			err = -EINVAL;
-		else
-			WRITE_ONCE(icsk->icsk_syn_retries, val);
-		break;
-
 	case TCP_SAVE_SYN:
 		/* 0: disable, 1: enable, 2: start from ether_header */
 		if (val < 0 || val > 2)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 470f581eedd43..66040ab457d46 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -239,7 +239,8 @@ static int tcp_write_timeout(struct sock *sk)
 	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 		if (icsk->icsk_retransmits)
 			__dst_negative_advice(sk);
-		retry_until = icsk->icsk_syn_retries ? :
+		/* Paired with WRITE_ONCE() in tcp_sock_set_syncnt() */
+		retry_until = READ_ONCE(icsk->icsk_syn_retries) ? :
 			READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);
 
 		max_retransmits = retry_until;
@@ -421,8 +422,10 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
 
 	req->rsk_ops->syn_ack_timeout(req);
 
-	/* add one more retry for fastopen */
-	max_retries = icsk->icsk_syn_retries ? :
+	/* Add one more retry for fastopen.
+	 * Paired with WRITE_ONCE() in tcp_sock_set_syncnt()
+	 */
+	max_retries = READ_ONCE(icsk->icsk_syn_retries) ? :
 		READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_synack_retries) + 1;
 
 	if (req->num_timeout >= max_retries) {
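
For reference, TCP_SYNCNT caps the number of SYN retransmissions for one
socket before connect() fails. A minimal userspace sketch of the option
this patch converts (not part of the series; the retry count, port and
TEST-NET-1 address are arbitrary placeholders):

#include <stdio.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int syncnt = 3;	/* give up after 3 SYN retransmits; arbitrary value */

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	/* After this patch the kernel honors TCP_SYNCNT without taking
	 * the socket lock; the userspace API is unchanged.
	 */
	if (setsockopt(fd, IPPROTO_TCP, TCP_SYNCNT, &syncnt, sizeof(syncnt)) < 0)
		perror("setsockopt(TCP_SYNCNT)");

	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(80),	/* placeholder destination */
	};
	inet_pton(AF_INET, "192.0.2.1", &addr.sin_addr); /* TEST-NET-1 placeholder */
	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		perror("connect");
	close(fd);
	return 0;
}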

From d58f2e15aa0c07f6f03ec71f64d7697ca43d04a1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 4 Aug 2023 14:46:12 +0000
Subject: [PATCH 2/6] tcp: set TCP_USER_TIMEOUT locklessly

icsk->icsk_user_timeout can be set locklessly, provided all read
sides use READ_ONCE().

Signed-off-by: Eric Dumazet
Acked-by: Soheil Hassas Yeganeh
Signed-off-by: David S. Miller
---
 include/linux/tcp.h  |  2 +-
 net/ipv4/tcp.c       | 23 ++++++++++-------------
 net/ipv4/tcp_timer.c | 37 ++++++++++++++++++++++---------------
 3 files changed, 33 insertions(+), 29 deletions(-)
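
For reference, TCP_USER_TIMEOUT bounds how long transmitted data may stay
unacknowledged before the connection is aborted with ETIMEDOUT; 0 keeps
the default retransmission-counter behavior. A minimal userspace sketch
(not part of the patch; the 30-second value is an arbitrary placeholder):

#include <stdio.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	/* Abort (ETIMEDOUT) if data stays unacknowledged for 30 s. */
	unsigned int timeout_ms = 30 * 1000;

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	if (setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT,
		       &timeout_ms, sizeof(timeout_ms)) < 0)
		perror("setsockopt(TCP_USER_TIMEOUT)");
	close(fd);
	return 0;
}

The semantics are unchanged by the patch; only the setter's prototype
moves from u32 to int so negative values can be rejected up front.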
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index d16abdb3541a6..3c5efeeb024f6 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -564,6 +564,6 @@ void __tcp_sock_set_nodelay(struct sock *sk, bool on);
 void tcp_sock_set_nodelay(struct sock *sk);
 void tcp_sock_set_quickack(struct sock *sk, int val);
 int tcp_sock_set_syncnt(struct sock *sk, int val);
-void tcp_sock_set_user_timeout(struct sock *sk, u32 val);
+int tcp_sock_set_user_timeout(struct sock *sk, int val);
 
 #endif /* _LINUX_TCP_H */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bcbb33a8c152a..34c2a40b02477 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3296,11 +3296,16 @@ int tcp_sock_set_syncnt(struct sock *sk, int val)
 }
 EXPORT_SYMBOL(tcp_sock_set_syncnt);
 
-void tcp_sock_set_user_timeout(struct sock *sk, u32 val)
+int tcp_sock_set_user_timeout(struct sock *sk, int val)
 {
-	lock_sock(sk);
+	/* Cap the max time in ms TCP will retry or probe the window
+	 * before giving up and aborting (ETIMEDOUT) a connection.
+	 */
+	if (val < 0)
+		return -EINVAL;
+
 	WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val);
-	release_sock(sk);
+	return 0;
 }
 EXPORT_SYMBOL(tcp_sock_set_user_timeout);
 
@@ -3464,6 +3469,8 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
 	switch (optname) {
 	case TCP_SYNCNT:
 		return tcp_sock_set_syncnt(sk, val);
+	case TCP_USER_TIMEOUT:
+		return tcp_sock_set_user_timeout(sk, val);
 	}
 
 	sockopt_lock_sock(sk);
@@ -3611,16 +3618,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
 		err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
 		break;
 #endif
-	case TCP_USER_TIMEOUT:
-		/* Cap the max time in ms TCP will retry or probe the window
-		 * before giving up and aborting (ETIMEDOUT) a connection.
-		 */
-		if (val < 0)
-			err = -EINVAL;
-		else
-			WRITE_ONCE(icsk->icsk_user_timeout, val);
-		break;
-
 	case TCP_FASTOPEN:
 		if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
 		    TCPF_LISTEN))) {
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 66040ab457d46..f99e2d06ae7ca 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -26,14 +26,15 @@
 static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
-	u32 elapsed, start_ts;
+	u32 elapsed, start_ts, user_timeout;
 	s32 remaining;
 
 	start_ts = tcp_sk(sk)->retrans_stamp;
-	if (!icsk->icsk_user_timeout)
+	user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+	if (!user_timeout)
 		return icsk->icsk_rto;
 	elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts;
-	remaining = icsk->icsk_user_timeout - elapsed;
+	remaining = user_timeout - elapsed;
 	if (remaining <= 0)
 		return 1; /* user timeout has passed; fire ASAP */
 
@@ -43,16 +44,17 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
 u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
-	u32 remaining;
+	u32 remaining, user_timeout;
 	s32 elapsed;
 
-	if (!icsk->icsk_user_timeout || !icsk->icsk_probes_tstamp)
+	user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+	if (!user_timeout || !icsk->icsk_probes_tstamp)
 		return when;
 
 	elapsed = tcp_jiffies32 - icsk->icsk_probes_tstamp;
 	if (unlikely(elapsed < 0))
 		elapsed = 0;
-	remaining = msecs_to_jiffies(icsk->icsk_user_timeout) - elapsed;
+	remaining = msecs_to_jiffies(user_timeout) - elapsed;
 	remaining = max_t(u32, remaining, TCP_TIMEOUT_MIN);
 
 	return min_t(u32, remaining, when);
@@ -270,7 +272,7 @@ static int tcp_write_timeout(struct sock *sk)
 	}
 	if (!expired)
 		expired = retransmits_timed_out(sk, retry_until,
-						icsk->icsk_user_timeout);
+						READ_ONCE(icsk->icsk_user_timeout));
 	tcp_fastopen_active_detect_blackhole(sk, expired);
 
 	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))
@@ -384,13 +386,16 @@ static void tcp_probe_timer(struct sock *sk)
 	 * corresponding system limit. We also implement similar policy when
 	 * we use RTO to probe window in tcp_retransmit_timer().
 	 */
-	if (!icsk->icsk_probes_tstamp)
+	if (!icsk->icsk_probes_tstamp) {
 		icsk->icsk_probes_tstamp = tcp_jiffies32;
-	else if (icsk->icsk_user_timeout &&
-		 (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >=
-		 msecs_to_jiffies(icsk->icsk_user_timeout))
-		goto abort;
+	} else {
+		u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+
+		if (user_timeout &&
+		    (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >=
+		    msecs_to_jiffies(user_timeout))
+			goto abort;
+	}
 
 	max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2);
 	if (sock_flag(sk, SOCK_DEAD)) {
 		const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
@@ -734,13 +739,15 @@ static void tcp_keepalive_timer (struct timer_list *t)
 	elapsed = keepalive_time_elapsed(tp);
 
 	if (elapsed >= keepalive_time_when(tp)) {
+		u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+
 		/* If the TCP_USER_TIMEOUT option is enabled, use that
 		 * to determine when to timeout instead.
 		 */
-		if ((icsk->icsk_user_timeout != 0 &&
-		    elapsed >= msecs_to_jiffies(icsk->icsk_user_timeout) &&
+		if ((user_timeout != 0 &&
+		    elapsed >= msecs_to_jiffies(user_timeout) &&
 		    icsk->icsk_probes_out > 0) ||
-		    (icsk->icsk_user_timeout == 0 &&
+		    (user_timeout == 0 &&
 		    icsk->icsk_probes_out >= keepalive_probes(tp))) {
 			tcp_send_active_reset(sk, GFP_ATOMIC);
 			tcp_write_err(sk);
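
The pattern used throughout this series — one side publishes with
WRITE_ONCE() while lockless readers sample the field once with
READ_ONCE() — has a rough userspace analogue in C11 relaxed atomics.
An illustrative sketch only (the kernel macros compile to volatile
accesses, not C11 atomics, and the variable name is made up here):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* Stand-in for icsk->icsk_user_timeout: written by one thread,
 * read concurrently by another, with no lock held.
 */
static _Atomic unsigned int user_timeout_ms;

static void *timer_thread(void *arg)
{
	(void)arg;
	/* READ_ONCE() analogue: one single, non-torn load. */
	unsigned int t = atomic_load_explicit(&user_timeout_ms,
					      memory_order_relaxed);
	printf("timer sees %u ms\n", t);
	return NULL;
}

int main(void)
{
	pthread_t th;

	/* WRITE_ONCE() analogue: one single, non-torn store. */
	atomic_store_explicit(&user_timeout_ms, 30000, memory_order_relaxed);
	pthread_create(&th, NULL, timer_thread, NULL);
	pthread_join(th, NULL);
	return 0;
}

The point of loading into a local (user_timeout in the patch) is that
the value cannot change between the test and its later uses.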
From 6fd70a6b4e6f58482a43da46d12323795bbc5f68 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 4 Aug 2023 14:46:13 +0000
Subject: [PATCH 3/6] tcp: set TCP_KEEPINTVL locklessly

tp->keepalive_intvl can be set locklessly; readers already take care
of this field being potentially set by other threads.

Signed-off-by: Eric Dumazet
Acked-by: Soheil Hassas Yeganeh
Signed-off-by: David S. Miller
---
 net/ipv4/tcp.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 34c2a40b02477..75d6359ee5750 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3348,9 +3348,7 @@ int tcp_sock_set_keepintvl(struct sock *sk, int val)
 	if (val < 1 || val > MAX_TCP_KEEPINTVL)
 		return -EINVAL;
 
-	lock_sock(sk);
 	WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ);
-	release_sock(sk);
 	return 0;
 }
 EXPORT_SYMBOL(tcp_sock_set_keepintvl);
@@ -3471,6 +3469,8 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
 		return tcp_sock_set_syncnt(sk, val);
 	case TCP_USER_TIMEOUT:
 		return tcp_sock_set_user_timeout(sk, val);
+	case TCP_KEEPINTVL:
+		return tcp_sock_set_keepintvl(sk, val);
 	}
 
 	sockopt_lock_sock(sk);
@@ -3568,12 +3568,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
 	case TCP_KEEPIDLE:
 		err = tcp_sock_set_keepidle_locked(sk, val);
 		break;
-	case TCP_KEEPINTVL:
-		if (val < 1 || val > MAX_TCP_KEEPINTVL)
-			err = -EINVAL;
-		else
-			WRITE_ONCE(tp->keepalive_intvl, val * HZ);
-		break;
 	case TCP_KEEPCNT:
 		if (val < 1 || val > MAX_TCP_KEEPCNT)
 			err = -EINVAL;
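
For reference, TCP_KEEPINTVL is one of the three per-socket keepalive
knobs, together with TCP_KEEPIDLE and TCP_KEEPCNT (the next patch
converts TCP_KEEPCNT the same way). A typical userspace configuration,
as a sketch with arbitrary example values (not part of the series):

#include <stdio.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int on = 1;
	int idle = 60;	/* seconds of idle before the first probe */
	int intvl = 10;	/* seconds between probes (TCP_KEEPINTVL) */
	int cnt = 5;	/* unanswered probes before the connection dies */

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)) < 0 ||
	    setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle)) < 0 ||
	    setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl)) < 0 ||
	    setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt)) < 0)
		perror("setsockopt");
	close(fd);
	return 0;
}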
From 84485080cbc1e5a011e7549966739df4cec158b1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 4 Aug 2023 14:46:14 +0000
Subject: [PATCH 4/6] tcp: set TCP_KEEPCNT locklessly

tp->keepalive_probes can be set locklessly; readers already take care
of this field being potentially set by other threads.

Signed-off-by: Eric Dumazet
Acked-by: Soheil Hassas Yeganeh
Signed-off-by: David S. Miller
---
 net/ipv4/tcp.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 75d6359ee5750..e74a9593283c9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3358,10 +3358,8 @@ int tcp_sock_set_keepcnt(struct sock *sk, int val)
 	if (val < 1 || val > MAX_TCP_KEEPCNT)
 		return -EINVAL;
 
-	lock_sock(sk);
 	/* Paired with READ_ONCE() in keepalive_probes() */
 	WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val);
-	release_sock(sk);
 	return 0;
 }
 EXPORT_SYMBOL(tcp_sock_set_keepcnt);
@@ -3471,6 +3469,8 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
 		return tcp_sock_set_user_timeout(sk, val);
 	case TCP_KEEPINTVL:
 		return tcp_sock_set_keepintvl(sk, val);
+	case TCP_KEEPCNT:
+		return tcp_sock_set_keepcnt(sk, val);
 	}
 
 	sockopt_lock_sock(sk);
@@ -3568,12 +3568,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
 	case TCP_KEEPIDLE:
 		err = tcp_sock_set_keepidle_locked(sk, val);
 		break;
-	case TCP_KEEPCNT:
-		if (val < 1 || val > MAX_TCP_KEEPCNT)
-			err = -EINVAL;
-		else
-			WRITE_ONCE(tp->keepalive_probes, val);
-		break;
 	case TCP_SAVE_SYN:
 		/* 0: disable, 1: enable, 2: start from ether_header */
 		if (val < 0 || val > 2)

From a81722ddd7e4d76c9bbff078d29416e18c6d7f71 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 4 Aug 2023 14:46:15 +0000
Subject: [PATCH 5/6] tcp: set TCP_LINGER2 locklessly

tp->linger2 can be set locklessly as long as readers use READ_ONCE().

Signed-off-by: Eric Dumazet
Acked-by: Soheil Hassas Yeganeh
Signed-off-by: David S. Miller
---
 net/ipv4/tcp.c       | 19 +++++++++----------
 net/ipv4/tcp_input.c |  2 +-
 net/ipv4/tcp_timer.c |  2 +-
 3 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e74a9593283c9..5c71b4fe11d1c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2865,7 +2865,7 @@ void __tcp_close(struct sock *sk, long timeout)
 	if (sk->sk_state == TCP_FIN_WAIT2) {
 		struct tcp_sock *tp = tcp_sk(sk);
 
-		if (tp->linger2 < 0) {
+		if (READ_ONCE(tp->linger2) < 0) {
 			tcp_set_state(sk, TCP_CLOSE);
 			tcp_send_active_reset(sk, GFP_ATOMIC);
 			__NET_INC_STATS(sock_net(sk),
@@ -3471,6 +3471,14 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
 		return tcp_sock_set_keepintvl(sk, val);
 	case TCP_KEEPCNT:
 		return tcp_sock_set_keepcnt(sk, val);
+	case TCP_LINGER2:
+		if (val < 0)
+			WRITE_ONCE(tp->linger2, -1);
+		else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
+			WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX);
+		else
+			WRITE_ONCE(tp->linger2, val * HZ);
+		return 0;
 	}
 
 	sockopt_lock_sock(sk);
@@ -3576,15 +3584,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
 		tp->save_syn = val;
 		break;
 
-	case TCP_LINGER2:
-		if (val < 0)
-			WRITE_ONCE(tp->linger2, -1);
-		else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
-			WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX);
-		else
-			WRITE_ONCE(tp->linger2, val * HZ);
-		break;
-
 	case TCP_DEFER_ACCEPT:
 		/* Translate value in seconds to number of retransmits */
 		WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept,
 			   secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
 					   TCP_RTO_MAX / HZ));
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2995802126e4e..deac708250e68 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6624,7 +6624,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 			break;
 		}
 
-		if (tp->linger2 < 0) {
+		if (READ_ONCE(tp->linger2) < 0) {
 			tcp_done(sk);
 			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
 			return 1;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index f99e2d06ae7ca..d45c96c7f5a44 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -714,7 +714,7 @@ static void tcp_keepalive_timer (struct timer_list *t)
 	tcp_mstamp_refresh(tp);
 	if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
-		if (tp->linger2 >= 0) {
+		if (READ_ONCE(tp->linger2) >= 0) {
 			const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
 
 			if (tmo > 0) {
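
For reference, TCP_LINGER2 is the per-socket override of the
tcp_fin_timeout sysctl: it bounds how long an orphaned connection may
sit in FIN_WAIT2, and a negative value makes the close path skip
FIN_WAIT2 and send a reset instead, as in the __tcp_close() hunk above.
A short userspace sketch (not part of the series; the value is an
arbitrary placeholder):

#include <stdio.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	/* Keep FIN_WAIT2 state for at most 5 seconds after close();
	 * a negative value would drop the state immediately.
	 */
	int fin_wait2_secs = 5;

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	if (setsockopt(fd, IPPROTO_TCP, TCP_LINGER2,
		       &fin_wait2_secs, sizeof(fin_wait2_secs)) < 0)
		perror("setsockopt(TCP_LINGER2)");
	close(fd);
	return 0;
}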

From 6e97ba552b8d3dd074a28b8600740b8bed42267b Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 4 Aug 2023 14:46:16 +0000
Subject: [PATCH 6/6] tcp: set TCP_DEFER_ACCEPT locklessly

The rskq_defer_accept field can be read and written without holding
the socket lock.

Signed-off-by: Eric Dumazet
Acked-by: Soheil Hassas Yeganeh
Signed-off-by: David S. Miller
---
 net/ipv4/tcp.c           | 13 ++++++-------
 net/ipv4/tcp_input.c     |  2 +-
 net/ipv4/tcp_minisocks.c |  2 +-
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5c71b4fe11d1c..4fbc7ff8c53c0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3479,6 +3479,12 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
 		else
 			WRITE_ONCE(tp->linger2, val * HZ);
 		return 0;
+	case TCP_DEFER_ACCEPT:
+		/* Translate value in seconds to number of retransmits */
+		WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept,
+			   secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
+					   TCP_RTO_MAX / HZ));
+		return 0;
 	}
 
 	sockopt_lock_sock(sk);
@@ -3584,13 +3590,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
 		tp->save_syn = val;
 		break;
 
-	case TCP_DEFER_ACCEPT:
-		/* Translate value in seconds to number of retransmits */
-		WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept,
-			   secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
-					   TCP_RTO_MAX / HZ));
-		break;
-
 	case TCP_WINDOW_CLAMP:
 		err = tcp_set_window_clamp(sk, val);
 		break;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index deac708250e68..8e96ebe373d7e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6324,7 +6324,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		if (fastopen_fail)
 			return -1;
 		if (sk->sk_write_pending ||
-		    icsk->icsk_accept_queue.rskq_defer_accept ||
+		    READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept) ||
 		    inet_csk_in_pingpong_mode(sk)) {
 			/* Save one ACK. Data will be ready after
 			 * several ticks, if write_pending is set.
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index ddba3b67f7c8e..13ee12983c429 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -792,7 +792,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 		return sk;
 
 	/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
-	if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
+	if (req->num_timeout < READ_ONCE(inet_csk(sk)->icsk_accept_queue.rskq_defer_accept) &&
 	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
 		inet_rsk(req)->acked = 1;
 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
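
For reference, TCP_DEFER_ACCEPT applies to listening sockets: the value
in seconds is translated by secs_to_retrans() into a number of SYN-ACK
retransmissions during which bare ACKs are dropped, as the
tcp_check_req() hunk shows, so accept() only sees connections once data
has arrived (or the window expires). A listener-side sketch (not part
of the series; the port and timeout are arbitrary placeholders):

#include <stdio.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int defer_secs = 3;	/* wait roughly 3 s for data before waking accept() */
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
		.sin_port = htons(8080),	/* placeholder port */
	};

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	if (setsockopt(fd, IPPROTO_TCP, TCP_DEFER_ACCEPT,
		       &defer_secs, sizeof(defer_secs)) < 0)
		perror("setsockopt(TCP_DEFER_ACCEPT)");
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    listen(fd, 16) < 0)
		perror("bind/listen");
	close(fd);
	return 0;
}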