From 225d9ddbacb102621af6d28ff7bf5a0b4ce249d8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:36 +0000 Subject: [PATCH 01/13] chtls: fix tp->rcv_tstamp initialization tp->rcv_tstamp should be set to tcp_jiffies32, not tcp_time_stamp(). Fixes: cc35c88ae4db ("crypto : chtls - CPL handler definition") Signed-off-by: Eric Dumazet Cc: Ayush Sawal Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c index 7750702900fa6..6f6525983130e 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c @@ -2259,7 +2259,7 @@ static void chtls_rx_ack(struct sock *sk, struct sk_buff *skb) if (tp->snd_una != snd_una) { tp->snd_una = snd_una; - tp->rcv_tstamp = tcp_time_stamp(tp); + tp->rcv_tstamp = tcp_jiffies32; if (tp->snd_una == tp->snd_nxt && !csk_flag_nochk(csk, CSK_TX_FAILOVER)) csk_reset_flag(csk, CSK_TX_WAIT_IDLE); From 73ed8e03388d16c12fc577e5c700b58a29045a15 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:37 +0000 Subject: [PATCH 02/13] tcp: fix cookie_init_timestamp() overflows cookie_init_timestamp() is supposed to return a 64bit timestamp suitable for both TSval determination and setting of skb->tstamp. Unfortunately it uses 32bit fields and overflows after 2^32 * 10^6 nsec (~49 days) of uptime. Generated TSvals are still correct, but skb->tstamp might be set far away in the past, potentially confusing other layers. tcp_ns_to_ts() is changed to return a full 64bit value, ts and ts_now variables are changed to u64 type, and TSMASK is removed in favor of shift operations. While we are at it, change this sequence: ts >>= TSBITS; ts--; ts <<= TSBITS; ts |= options; to: ts -= (1UL << TSBITS); Fixes: 9a568de4818d ("tcp: switch TCP TS option (RFC 7323) to 1ms clock") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- net/ipv4/syncookies.c | 20 +++++++------------- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index bad304d173a56..d47a57a47b50b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -805,7 +805,7 @@ static inline u32 tcp_time_stamp(const struct tcp_sock *tp) } /* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */ -static inline u32 tcp_ns_to_ts(u64 ns) +static inline u64 tcp_ns_to_ts(u64 ns) { return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ); } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index dc478a0574cbe..3b4dafefb4b03 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -41,7 +41,6 @@ static siphash_aligned_key_t syncookie_secret[2]; * requested/supported by the syn/synack exchange. */ #define TSBITS 6 -#define TSMASK (((__u32)1 << TSBITS) - 1) static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) @@ -62,27 +61,22 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, */ u64 cookie_init_timestamp(struct request_sock *req, u64 now) { - struct inet_request_sock *ireq; - u32 ts, ts_now = tcp_ns_to_ts(now); + const struct inet_request_sock *ireq = inet_rsk(req); + u64 ts, ts_now = tcp_ns_to_ts(now); u32 options = 0; - ireq = inet_rsk(req); - options = ireq->wscale_ok ?
ireq->snd_wscale : TS_OPT_WSCALE_MASK; if (ireq->sack_ok) options |= TS_OPT_SACK; if (ireq->ecn_ok) options |= TS_OPT_ECN; - ts = ts_now & ~TSMASK; + ts = (ts_now >> TSBITS) << TSBITS; ts |= options; - if (ts > ts_now) { - ts >>= TSBITS; - ts--; - ts <<= TSBITS; - ts |= options; - } - return (u64)ts * (NSEC_PER_SEC / TCP_TS_HZ); + if (ts > ts_now) + ts -= (1UL << TSBITS); + + return ts * (NSEC_PER_SEC / TCP_TS_HZ); } From 99d679556d737a14391c68e562d94076c2983252 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:38 +0000 Subject: [PATCH 03/13] tcp: add tcp_time_stamp_ms() helper In preparation of adding usec TCP TS values, add tcp_time_stamp_ms() for contexts needing ms based values. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 5 +++++ net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_timer.c | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index d47a57a47b50b..9fc6dc4ba9e2e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -804,6 +804,11 @@ static inline u32 tcp_time_stamp(const struct tcp_sock *tp) return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ); } +static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp) +{ + return div_u64(tp->tcp_mstamp, USEC_PER_MSEC); +} + /* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */ static inline u64 tcp_ns_to_ts(u64 ns) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ab87f0285b728..ffce17545b62c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2856,7 +2856,7 @@ void tcp_enter_recovery(struct sock *sk, bool ece_ack) static void tcp_update_rto_time(struct tcp_sock *tp) { if (tp->rto_stamp) { - tp->total_rto_time += tcp_time_stamp(tp) - tp->rto_stamp; + tp->total_rto_time += tcp_time_stamp_ms(tp) - tp->rto_stamp; tp->rto_stamp = 0; } } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 0862b73dd3b52..63247c78dc13d 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -422,7 +422,7 @@ static void tcp_update_rto_stats(struct sock *sk) if (!icsk->icsk_retransmits) { tp->total_rto_recoveries++; - tp->rto_stamp = tcp_time_stamp(tp); + tp->rto_stamp = tcp_time_stamp_ms(tp); } icsk->icsk_retransmits++; tp->total_rto++; From 2a7c8d291ffeba69a47d8528987156f625cc05b0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:39 +0000 Subject: [PATCH 04/13] tcp: introduce tcp_clock_ms() It delivers the current TCP time stamp in ms units, and is used in place of the confusing tcp_time_stamp_raw(). It is in the same family as tcp_clock_ns() and tcp_clock_us(). tcp_time_stamp_raw() will be replaced later for TSval contexts with a more descriptive name. Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- include/net/tcp.h | 5 +++++ net/ipv4/tcp.c | 6 ++---- net/ipv4/tcp_minisocks.c | 4 ++-- net/netfilter/nf_synproxy_core.c | 2 +- tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c | 4 ++-- 5 files changed, 12 insertions(+), 9 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 9fc6dc4ba9e2e..3bdf1141f5a2c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -798,6 +798,11 @@ static inline u64 tcp_clock_us(void) return div_u64(tcp_clock_ns(), NSEC_PER_USEC); } +static inline u64 tcp_clock_ms(void) +{ + return div_u64(tcp_clock_ns(), NSEC_PER_MSEC); +} + /* This should only be used in contexts where tp->tcp_mstamp is up to date */ static inline u32 tcp_time_stamp(const struct tcp_sock *tp) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 56a8d936000f6..5b034b0356ecb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3817,10 +3817,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_total_rto = tp->total_rto; info->tcpi_total_rto_recoveries = tp->total_rto_recoveries; info->tcpi_total_rto_time = tp->total_rto_time; - if (tp->rto_stamp) { - info->tcpi_total_rto_time += tcp_time_stamp_raw() - - tp->rto_stamp; - } + if (tp->rto_stamp) + info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; unlock_sock_fast(sk, slow); } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 3f87611077ef2..a9fdba897a28f 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -567,8 +567,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, USEC_PER_SEC / TCP_TS_HZ); newtp->total_rto = req->num_timeout; newtp->total_rto_recoveries = 1; - newtp->total_rto_time = tcp_time_stamp_raw() - - newtp->retrans_stamp; + newtp->total_rto_time = tcp_clock_ms() - + newtp->retrans_stamp; } newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 16915f8eef2b1..467671f2d42f7 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -153,7 +153,7 @@ void synproxy_init_timestamp_cookie(const struct nf_synproxy_info *info, struct synproxy_options *opts) { opts->tsecr = opts->tsval; - opts->tsval = tcp_time_stamp_raw() & ~0x3f; + opts->tsval = tcp_clock_ms() & ~0x3f; if (opts->options & NF_SYNPROXY_OPT_WSCALE) { opts->tsval |= opts->wscale; diff --git a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c index 07d786329105d..e959336c7a730 100644 --- a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c +++ b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c @@ -177,7 +177,7 @@ static __always_inline __u32 tcp_ns_to_ts(__u64 ns) return ns / (NSEC_PER_SEC / TCP_TS_HZ); } -static __always_inline __u32 tcp_time_stamp_raw(void) +static __always_inline __u32 tcp_clock_ms(void) { return tcp_ns_to_ts(tcp_clock_ns()); } @@ -274,7 +274,7 @@ static __always_inline bool tscookie_init(struct tcphdr *tcp_header, if (!loop_ctx.option_timestamp) return false; - cookie = tcp_time_stamp_raw() & ~TSMASK; + cookie = tcp_clock_ms() & ~TSMASK; cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK; if (loop_ctx.option_sack) cookie |= TS_OPT_SACK; From 16cf6477741bdaa287d5e4531a1a503618a41a22 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:40 +0000 Subject: [PATCH 05/13] tcp: replace tcp_time_stamp_raw() In preparation of usec TCP TS support, remove tcp_time_stamp_raw() in favor of tcp_clock_ts() helper. 
This helper will return a suitable 32bit result to feed TS values, depending on a socket field. Also add tcp_tw_tsval() and tcp_rsk_tsval() helpers to factorize the details. We do not yet support usec timestamps. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 25 +++++++++++++++++++------ net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv6/tcp_ipv6.c | 4 ++-- 4 files changed, 25 insertions(+), 12 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 3bdf1141f5a2c..0534526a535da 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -803,6 +803,16 @@ static inline u64 tcp_clock_ms(void) return div_u64(tcp_clock_ns(), NSEC_PER_MSEC); } +/* TCP Timestamp included in TS option (RFC 1323) can either use ms + * or usec resolution. Each socket carries a flag to select one or other + * resolution, as the route attribute could change anytime. + * Each flow must stick to initial resolution. + */ +static inline u32 tcp_clock_ts(bool usec_ts) +{ + return usec_ts ? tcp_clock_us() : tcp_clock_ms(); +} + /* This should only be used in contexts where tp->tcp_mstamp is up to date */ static inline u32 tcp_time_stamp(const struct tcp_sock *tp) { @@ -820,12 +830,6 @@ static inline u64 tcp_ns_to_ts(u64 ns) return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ); } -/* Could use tcp_clock_us() / 1000, but this version uses a single divide */ -static inline u32 tcp_time_stamp_raw(void) -{ - return tcp_ns_to_ts(tcp_clock_ns()); -} - void tcp_mstamp_refresh(struct tcp_sock *tp); static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) @@ -844,6 +848,15 @@ static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC); } +static inline u32 tcp_tw_tsval(const struct tcp_timewait_sock *tcptw) +{ + return tcp_clock_ts(false) + tcptw->tw_ts_offset; +} + +static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) +{ + return tcp_clock_ts(false) + treq->ts_off; +} #define tcp_flag_byte(th) (((u_int8_t *)th)[13]) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5b034b0356ecb..805f8341064fe 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3632,7 +3632,7 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, if (!tp->repair) err = -EPERM; else - WRITE_ONCE(tp->tsoffset, val - tcp_time_stamp_raw()); + WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(false)); break; case TCP_REPAIR_WINDOW: err = tcp_repair_set_window(tp, optval, optlen); @@ -4143,7 +4143,7 @@ int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_TIMESTAMP: - val = tcp_time_stamp_raw() + READ_ONCE(tp->tsoffset); + val = tcp_clock_ts(false) + READ_ONCE(tp->tsoffset); break; case TCP_NOTSENT_LOWAT: val = READ_ONCE(tp->notsent_lowat); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a67a5de86253b..cdd65cc594bc4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -954,7 +954,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) tcp_v4_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, - tcp_time_stamp_raw() + tcptw->tw_ts_offset, + tcp_tw_tsval(tcptw), tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), @@ -988,7 +988,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_v4_send_ack(sk, skb, seq, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, - tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, + tcp_rsk_tsval(tcp_rsk(req)), READ_ONCE(req->ts_recent), 0, tcp_md5_do_lookup(sk, 
l3index, addr, AF_INET), diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d410703bb5a1e..1ee6517e9b2f9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1096,7 +1096,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, - tcp_time_stamp_raw() + tcptw->tw_ts_offset, + tcp_tw_tsval(tcptw), tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority, tw->tw_txhash); @@ -1123,7 +1123,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, - tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, + tcp_rsk_tsval(tcp_rsk(req)), READ_ONCE(req->ts_recent), sk->sk_bound_dev_if, tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr, l3index), ipv6_get_dsfield(ipv6_hdr(skb)), 0, From d1a02ed66fe62aa2edd77bd54e270ebc33bd12ff Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:41 +0000 Subject: [PATCH 06/13] tcp: rename tcp_skb_timestamp() This helper returns a 32bit TCP TSval from skb->tstamp. As we are going to support usec or ms units soon, rename it to tcp_skb_timestamp_ts() and add a boolean to select the unit. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 14 +++++++++----- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_output.c | 8 ++++---- net/ipv4/tcp_timer.c | 4 ++-- 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 0534526a535da..493f8550055bc 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -837,17 +837,21 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) return max_t(s64, t1 - t0, 0); } -static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) -{ - return tcp_ns_to_ts(skb->skb_mstamp_ns); -} - /* provide the departure time in us unit */ static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) { return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC); } +/* Provide skb TSval in usec or ms unit */ +static inline u32 tcp_skb_timestamp_ts(bool usec_ts, const struct sk_buff *skb) +{ + if (usec_ts) + return tcp_skb_timestamp_us(skb); + + return div_u64(skb->skb_mstamp_ns, NSEC_PER_MSEC); +} + static inline u32 tcp_tw_tsval(const struct tcp_timewait_sock *tcptw) { return tcp_clock_ts(false) + tcptw->tw_ts_offset; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ffce17545b62c..de68cad82d19e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2442,7 +2442,7 @@ static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp, const struct sk_buff *skb) { return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) && - tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb)); + tcp_tsopt_ecr_before(tp, tcp_skb_timestamp_ts(false, skb)); } /* Nothing was retransmitted or returned timestamp is less diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 909f85aefd740..03a2a9fc0dc19 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -799,7 +799,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps) && !*md5)) { opts->options |= OPTION_TS; - opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset; + opts->tsval = tcp_skb_timestamp_ts(false, skb) + tp->tsoffset; opts->tsecr = tp->rx_opt.ts_recent; remaining -= 
TCPOLEN_TSTAMP_ALIGNED; } @@ -884,7 +884,7 @@ static unsigned int tcp_synack_options(const struct sock *sk, } if (likely(ireq->tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off; + opts->tsval = tcp_skb_timestamp_ts(false, skb) + tcp_rsk(req)->ts_off; opts->tsecr = READ_ONCE(req->ts_recent); remaining -= TCPOLEN_TSTAMP_ALIGNED; } @@ -943,7 +943,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (likely(tp->rx_opt.tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0; + opts->tsval = skb ? tcp_skb_timestamp_ts(false, skb) + tp->tsoffset : 0; opts->tsecr = tp->rx_opt.ts_recent; size += TCPOLEN_TSTAMP_ALIGNED; } @@ -3379,7 +3379,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) /* Save stamp of the first (attempted) retransmit. */ if (!tp->retrans_stamp) - tp->retrans_stamp = tcp_skb_timestamp(skb); + tp->retrans_stamp = tcp_skb_timestamp_ts(false, skb); if (tp->undo_retrans < 0) tp->undo_retrans = 0; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 63247c78dc13d..8764a9a2dc213 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -479,7 +479,7 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk, return false; rtx_delta = (u32)msecs_to_jiffies(tcp_time_stamp(tp) - - (tp->retrans_stamp ?: tcp_skb_timestamp(skb))); + (tp->retrans_stamp ?: tcp_skb_timestamp_ts(false, skb))); return rtx_delta > timeout; } @@ -534,7 +534,7 @@ void tcp_retransmit_timer(struct sock *sk) struct inet_sock *inet = inet_sk(sk); u32 rtx_delta; - rtx_delta = tcp_time_stamp(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp(skb)); + rtx_delta = tcp_time_stamp(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp_ts(false, skb)); if (sk->sk_family == AF_INET) { net_dbg_ratelimited("Probing zero-window on %pI4:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n", &inet->inet_daddr, ntohs(inet->inet_dport), From 003e07a1e48e9423647d2fef1c86b4caab3a94be Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:42 +0000 Subject: [PATCH 07/13] tcp: move tcp_ns_to_ts() to net/ipv4/syncookies.c tcp_ns_to_ts() is only used once from cookie_init_timestamp(). Also add the 'bool usec_ts' parameter to enable usec TS later. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/tcp.h | 6 ------ net/ipv4/syncookies.c | 10 +++++++++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 493f8550055bc..b86abf1fbe460 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -824,12 +824,6 @@ static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp) return div_u64(tp->tcp_mstamp, USEC_PER_MSEC); } -/* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */ -static inline u64 tcp_ns_to_ts(u64 ns) -{ - return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ); -} - void tcp_mstamp_refresh(struct tcp_sock *tp); static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 3b4dafefb4b03..62395fdb0ca55 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -51,6 +51,14 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, count, &syncookie_secret[c]); } +/* Convert one nsec 64bit timestamp to ts (ms or usec resolution) */ +static u64 tcp_ns_to_ts(bool usec_ts, u64 val) +{ + if (usec_ts) + return div_u64(val, NSEC_PER_USEC); + + return div_u64(val, NSEC_PER_MSEC); +} /* * when syncookies are in effect and tcp timestamps are enabled we encode @@ -62,7 +70,7 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u64 cookie_init_timestamp(struct request_sock *req, u64 now) { const struct inet_request_sock *ireq = inet_rsk(req); - u64 ts, ts_now = tcp_ns_to_ts(now); + u64 ts, ts_now = tcp_ns_to_ts(false, now); u32 options = 0; options = ireq->wscale_ok ? ireq->snd_wscale : TS_OPT_WSCALE_MASK; From 9d0c00f5ca05be9e89649c156f9d5b9421fc534e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:43 +0000 Subject: [PATCH 08/13] tcp: rename tcp_time_stamp() to tcp_time_stamp_ts() This helper returns a TSval from a TCP socket. It currently calls tcp_time_stamp_ms() but will soon be able to return a usec based TSval, depending on an upcoming tp->tcp_usec_ts field. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 9 ++++----- net/ipv4/tcp_input.c | 6 +++--- net/ipv4/tcp_lp.c | 2 +- net/ipv4/tcp_output.c | 2 +- net/ipv4/tcp_timer.c | 10 +++++----- 5 files changed, 14 insertions(+), 15 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index b86abf1fbe460..af72c1dc37f3d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -813,15 +813,14 @@ static inline u32 tcp_clock_ts(bool usec_ts) return usec_ts ? 
tcp_clock_us() : tcp_clock_ms(); } -/* This should only be used in contexts where tp->tcp_mstamp is up to date */ -static inline u32 tcp_time_stamp(const struct tcp_sock *tp) +static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp) { - return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ); + return div_u64(tp->tcp_mstamp, USEC_PER_MSEC); } -static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp) +static inline u32 tcp_time_stamp_ts(const struct tcp_sock *tp) { - return div_u64(tp->tcp_mstamp, USEC_PER_MSEC); + return tcp_time_stamp_ms(tp); } void tcp_mstamp_refresh(struct tcp_sock *tp); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index de68cad82d19e..e7e38fc1d62ff 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -704,7 +704,7 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, if (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) { - u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; + u32 delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr; u32 delta_us; if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { @@ -3148,7 +3148,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag, */ if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED) { - u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; + u32 delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr; if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { if (!delta) @@ -6293,7 +6293,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, - tcp_time_stamp(tp))) { + tcp_time_stamp_ts(tp))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED); goto reset_and_undo; diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index ae36780977d27..52fe17167460f 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -272,7 +272,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample) { struct tcp_sock *tp = tcp_sk(sk); struct lp *lp = inet_csk_ca(sk); - u32 now = tcp_time_stamp(tp); + u32 now = tcp_time_stamp_ts(tp); u32 delta; if (sample->rtt_us > 0) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 03a2a9fc0dc19..a1fec8be9ac36 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3961,7 +3961,7 @@ int tcp_connect(struct sock *sk) tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); tcp_mstamp_refresh(tp); - tp->retrans_stamp = tcp_time_stamp(tp); + tp->retrans_stamp = tcp_time_stamp_ts(tp); tcp_connect_queue_skb(sk, buff); tcp_ecn_send_syn(sk, buff); tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 8764a9a2dc213..bfcf3fe44c724 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -33,7 +33,7 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) user_timeout = READ_ONCE(icsk->icsk_user_timeout); if (!user_timeout) return icsk->icsk_rto; - elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts; + elapsed = tcp_time_stamp_ts(tcp_sk(sk)) - start_ts; remaining = user_timeout - elapsed; if (remaining <= 0) return 1; /* user timeout has passed; fire ASAP */ @@ -226,7 +226,7 @@ static bool retransmits_timed_out(struct sock *sk, timeout = tcp_model_timeout(sk, boundary, rto_base); } - return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0; + return (s32)(tcp_time_stamp_ts(tcp_sk(sk)) - start_ts - timeout) >= 0; } /* A write 
timeout has occurred. Process the after effects. */ @@ -462,7 +462,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) req->num_timeout++; tcp_update_rto_stats(sk); if (!tp->retrans_stamp) - tp->retrans_stamp = tcp_time_stamp(tp); + tp->retrans_stamp = tcp_time_stamp_ts(tp); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, req->timeout << req->num_timeout, TCP_RTO_MAX); } @@ -478,7 +478,7 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk, if (rcv_delta <= timeout) return false; - rtx_delta = (u32)msecs_to_jiffies(tcp_time_stamp(tp) - + rtx_delta = (u32)msecs_to_jiffies(tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp_ts(false, skb))); return rtx_delta > timeout; @@ -534,7 +534,7 @@ void tcp_retransmit_timer(struct sock *sk) struct inet_sock *inet = inet_sk(sk); u32 rtx_delta; - rtx_delta = tcp_time_stamp(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp_ts(false, skb)); + rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp_ts(false, skb)); if (sk->sk_family == AF_INET) { net_dbg_ratelimited("Probing zero-window on %pI4:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n", &inet->inet_daddr, ntohs(inet->inet_dport), From b04c3320885a88a94e4bbb2f9dbc4871c9bc336f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:44 +0000 Subject: [PATCH 09/13] tcp: add tcp_rtt_tsopt_us() Before adding usec TS support, add tcp_rtt_tsopt_us() helper to factorize code. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e7e38fc1d62ff..5666f61371678 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -693,6 +693,21 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) tp->rcv_rtt_est.time = tp->tcp_mstamp; } +static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp) +{ + u32 delta, delta_us; + + delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr; + + if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { + if (!delta) + delta = 1; + delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); + return delta_us; + } + return -1; +} + static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb) { @@ -704,15 +719,10 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, if (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) { - u32 delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr; - u32 delta_us; - - if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { - if (!delta) - delta = 1; - delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); - tcp_rcv_rtt_update(tp, delta_us, 0); - } + s32 delta = tcp_rtt_tsopt_us(tp); + + if (delta >= 0) + tcp_rcv_rtt_update(tp, delta, 0); } } @@ -3146,17 +3156,10 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag, * left edge of the send window. * See draft-ietf-tcplw-high-performance-00, section 3.3. 
*/ - if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && - flag & FLAG_ACKED) { - u32 delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr; - - if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { - if (!delta) - delta = 1; - seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ); - ca_rtt_us = seq_rtt_us; - } - } + if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && + tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED) + seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp); + rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */ if (seq_rtt_us < 0) return false; From 3d44de9a10ea2b1658dfaed8ea6d3d7b6e0defbb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:45 +0000 Subject: [PATCH 10/13] tcp: add RTAX_FEATURE_TCP_USEC_TS This new dst feature flag will be used to allow TCP to use usec based timestamps instead of msec ones. ip route .... feature tcp_usec_ts Also document that RTAX_FEATURE_SACK and RTAX_FEATURE_TIMESTAMP are unused. RTAX_FEATURE_ALLFRAG is also going away soon. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 5 +++++ include/uapi/linux/rtnetlink.h | 18 +++++++++++------- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index e15452df9804f..04a0e647ef747 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -576,4 +576,9 @@ void tcp_sock_set_quickack(struct sock *sk, int val); int tcp_sock_set_syncnt(struct sock *sk, int val); int tcp_sock_set_user_timeout(struct sock *sk, int val); +static inline bool dst_tcp_usec_ts(const struct dst_entry *dst) +{ + return dst_feature(dst, RTAX_FEATURE_TCP_USEC_TS); +} + #endif /* _LINUX_TCP_H */ diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 51c13cf9c5aee..aa2482a0614aa 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -502,13 +502,17 @@ enum { #define RTAX_MAX (__RTAX_MAX - 1) -#define RTAX_FEATURE_ECN (1 << 0) -#define RTAX_FEATURE_SACK (1 << 1) -#define RTAX_FEATURE_TIMESTAMP (1 << 2) -#define RTAX_FEATURE_ALLFRAG (1 << 3) - -#define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | \ - RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG) +#define RTAX_FEATURE_ECN (1 << 0) +#define RTAX_FEATURE_SACK (1 << 1) /* unused */ +#define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ +#define RTAX_FEATURE_ALLFRAG (1 << 3) +#define RTAX_FEATURE_TCP_USEC_TS (1 << 4) + +#define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ + RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG | \ + RTAX_FEATURE_TCP_USEC_TS) struct rta_session { __u8 proto; From af7721448a609d1912b57c825194ef6e17fc71a4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:46 +0000 Subject: [PATCH 11/13] tcp: introduce TCP_PAWS_WRAP tcp_paws_check() uses TCP_PAWS_24DAYS constant to detect if TCP TS values might have wrapped after a long idle period. This mechanism is described in RFC 7323 5.5 (Outdated Timestamps) TCP_PAWS_24DAYS value was based on the assumption of a clock of 1 Khz. As we want to adopt a 1 Mhz clock in the future, we reduce this constant. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/tcp.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index af72c1dc37f3d..0ab577869d7ac 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -166,7 +166,12 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define MAX_TCP_KEEPCNT 127 #define MAX_TCP_SYNCNT 127 -#define TCP_PAWS_24DAYS (60 * 60 * 24 * 24) +/* Ensure that TCP PAWS checks are relaxed after ~2147 seconds + * to avoid overflows. This assumes a clock smaller than 1 Mhz. + * Default clock is 1 Khz, tcp_usec_ts uses 1 Mhz. + */ +#define TCP_PAWS_WRAP (INT_MAX / USEC_PER_SEC) + #define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated * after this time. It should be equal * (or greater than) TCP_TIMEWAIT_LEN @@ -1619,7 +1624,7 @@ static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt, if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win) return true; if (unlikely(!time_before32(ktime_get_seconds(), - rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS))) + rx_opt->ts_recent_stamp + TCP_PAWS_WRAP))) return true; /* * Some OSes send SYN and SYNACK messages with tsval=0 tsecr=0, From 614e8316aa4cafba3e204cb8ee48bd12b92f3d93 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:47 +0000 Subject: [PATCH 12/13] tcp: add support for usec resolution in TCP TS values Back in 2015, Van Jacobson suggested to use usec resolution in TCP TS values. This has been implemented in our private kernels. Goals were: 1) better observability of delays in networking stacks. 2) better disambiguation of events based on TSval/ecr values. 3) building block for congestion control modules needing usec resolution. Back then we implemented a scheme based on private SYN options to negotiate the feature. For upstream submission, we chose to use a route attribute, because this feature is probably going to be used in private networks [1] [2]. ip route add 10/8 ... features tcp_usec_ts Note that RFC 7323 recommends a "timestamp clock frequency in the range 1 ms to 1 sec per tick.", but also mentions "the maximum acceptable clock frequency is one tick every 59 ns." [1] Unfortunately RFC 7323 5.5 (Outdated Timestamps) suggests to invalidate TS.Recent values after a flow was idle for more than 24 days. This is the part making usec_ts a problem for peers following this recommendation for long living idle flows. [2] Attempts to standardize usec ts went nowhere: https://www.ietf.org/proceedings/97/slides/slides-97-tcpm-tcp-options-for-low-latency-00.pdf https://datatracker.ietf.org/doc/draft-wang-tcpm-low-latency-opt/ Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- include/linux/tcp.h | 4 +++- include/net/inet_timewait_sock.h | 3 ++- include/net/tcp.h | 6 +++-- net/ipv4/syncookies.c | 6 ++++- net/ipv4/tcp.c | 18 ++++++++++---- net/ipv4/tcp_input.c | 5 +++- net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_minisocks.c | 19 +++++++++++---- net/ipv4/tcp_output.c | 12 ++++++---- net/ipv4/tcp_timer.c | 40 +++++++++++++++++++++----------- net/ipv6/tcp_ipv6.c | 1 + 11 files changed, 82 insertions(+), 33 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 04a0e647ef747..6df715b6e51d4 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -152,6 +152,7 @@ struct tcp_request_sock { u64 snt_synack; /* first SYNACK sent time */ bool tfo_listener; bool is_mptcp; + s8 req_usec_ts; #if IS_ENABLED(CONFIG_MPTCP) bool drop_req; #endif @@ -257,7 +258,8 @@ struct tcp_sock { u8 compressed_ack; u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */ - unused:5; + tcp_usec_ts:1, /* TSval values in usec */ + unused:4; u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u8 chrono_type:2, /* current chronograph type */ diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 4a8e578405cb3..b14999ff55db1 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -67,7 +67,8 @@ struct inet_timewait_sock { /* And these are ours. */ unsigned int tw_transparent : 1, tw_flowlabel : 20, - tw_pad : 3, /* 3 bits hole */ + tw_usec_ts : 1, + tw_pad : 2, /* 2 bits hole */ tw_tos : 8; u32 tw_txhash; u32 tw_priority; diff --git a/include/net/tcp.h b/include/net/tcp.h index 0ab577869d7ac..39b731c900dd5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -825,6 +825,8 @@ static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp) static inline u32 tcp_time_stamp_ts(const struct tcp_sock *tp) { + if (tp->tcp_usec_ts) + return tp->tcp_mstamp; return tcp_time_stamp_ms(tp); } @@ -852,12 +854,12 @@ static inline u32 tcp_skb_timestamp_ts(bool usec_ts, const struct sk_buff *skb) static inline u32 tcp_tw_tsval(const struct tcp_timewait_sock *tcptw) { - return tcp_clock_ts(false) + tcptw->tw_ts_offset; + return tcp_clock_ts(tcptw->tw_sk.tw_usec_ts) + tcptw->tw_ts_offset; } static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) { - return tcp_clock_ts(false) + treq->ts_off; + return tcp_clock_ts(treq->req_usec_ts) + treq->ts_off; } #define tcp_flag_byte(th) (((u_int8_t *)th)[13]) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 62395fdb0ca55..c643343632309 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -84,7 +84,9 @@ u64 cookie_init_timestamp(struct request_sock *req, u64 now) if (ts > ts_now) ts -= (1UL << TSBITS); - return ts * (NSEC_PER_SEC / TCP_TS_HZ); + if (tcp_rsk(req)->req_usec_ts) + return ts * NSEC_PER_USEC; + return ts * NSEC_PER_MSEC; } @@ -304,6 +306,8 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, treq->af_specific = af_ops; treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; + treq->req_usec_ts = -1; + #if IS_ENABLED(CONFIG_MPTCP) treq->is_mptcp = sk_is_mptcp(sk); if (treq->is_mptcp) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 805f8341064fe..b961364b4961c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3629,10 +3629,16 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, tp->fastopen_no_cookie = val; break; case TCP_TIMESTAMP: - if (!tp->repair) + if (!tp->repair) { err = -EPERM; - else - 
WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(false)); + break; + } + /* val is an opaque field, + * and low order bit contains usec_ts enable bit. + * Its a best effort, and we do not care if user makes an error. + */ + tp->tcp_usec_ts = val & 1; + WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(tp->tcp_usec_ts)); break; case TCP_REPAIR_WINDOW: err = tcp_repair_set_window(tp, optval, optlen); @@ -4143,7 +4149,11 @@ int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_TIMESTAMP: - val = tcp_clock_ts(false) + READ_ONCE(tp->tsoffset); + val = tcp_clock_ts(tp->tcp_usec_ts) + READ_ONCE(tp->tsoffset); + if (tp->tcp_usec_ts) + val |= 1; + else + val &= ~1; break; case TCP_NOTSENT_LOWAT: val = READ_ONCE(tp->notsent_lowat); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5666f61371678..18b858597af4e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -698,6 +698,8 @@ static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp) u32 delta, delta_us; delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr; + if (tp->tcp_usec_ts) + return delta; if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { if (!delta) @@ -2452,7 +2454,7 @@ static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp, const struct sk_buff *skb) { return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) && - tcp_tsopt_ecr_before(tp, tcp_skb_timestamp_ts(false, skb)); + tcp_tsopt_ecr_before(tp, tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb)); } /* Nothing was retransmitted or returned timestamp is less @@ -7045,6 +7047,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, req->syncookie = want_cookie; tcp_rsk(req)->af_specific = af_ops; tcp_rsk(req)->ts_off = 0; + tcp_rsk(req)->req_usec_ts = -1; #if IS_ENABLED(CONFIG_MPTCP) tcp_rsk(req)->is_mptcp = 0; #endif diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index cdd65cc594bc4..7583d4e34c8c5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -296,6 +296,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) rt = NULL; goto failure; } + tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst); /* OK, now commit destination to socket. 
*/ sk->sk_gso_type = SKB_GSO_TCPV4; sk_setup_caps(sk, &rt->dst); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index a9fdba897a28f..ace806c5bd0cd 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -300,6 +300,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tcptw->tw_ts_recent = tp->rx_opt.ts_recent; tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; tcptw->tw_ts_offset = tp->tsoffset; + tw->tw_usec_ts = tp->tcp_usec_ts; tcptw->tw_last_oow_ack_time = 0; tcptw->tw_tx_delay = tp->tcp_tx_delay; tw->tw_txhash = sk->sk_txhash; @@ -554,21 +555,29 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->max_window = newtp->snd_wnd; if (newtp->rx_opt.tstamp_ok) { + newtp->tcp_usec_ts = treq->req_usec_ts; newtp->rx_opt.ts_recent = READ_ONCE(req->ts_recent); newtp->rx_opt.ts_recent_stamp = ktime_get_seconds(); newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; } else { + newtp->tcp_usec_ts = 0; newtp->rx_opt.ts_recent_stamp = 0; newtp->tcp_header_len = sizeof(struct tcphdr); } if (req->num_timeout) { - newtp->undo_marker = treq->snt_isn; - newtp->retrans_stamp = div_u64(treq->snt_synack, - USEC_PER_SEC / TCP_TS_HZ); newtp->total_rto = req->num_timeout; + newtp->undo_marker = treq->snt_isn; + if (newtp->tcp_usec_ts) { + newtp->retrans_stamp = treq->snt_synack; + newtp->total_rto_time = (u32)(tcp_clock_us() - + newtp->retrans_stamp) / USEC_PER_MSEC; + } else { + newtp->retrans_stamp = div_u64(treq->snt_synack, + USEC_PER_SEC / TCP_TS_HZ); + newtp->total_rto_time = tcp_clock_ms() - + newtp->retrans_stamp; + } newtp->total_rto_recoveries = 1; - newtp->total_rto_time = tcp_clock_ms() - - newtp->retrans_stamp; } newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a1fec8be9ac36..2866ccbccde07 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -799,7 +799,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps) && !*md5)) { opts->options |= OPTION_TS; - opts->tsval = tcp_skb_timestamp_ts(false, skb) + tp->tsoffset; + opts->tsval = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) + tp->tsoffset; opts->tsecr = tp->rx_opt.ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } @@ -884,7 +884,8 @@ static unsigned int tcp_synack_options(const struct sock *sk, } if (likely(ireq->tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = tcp_skb_timestamp_ts(false, skb) + tcp_rsk(req)->ts_off; + opts->tsval = tcp_skb_timestamp_ts(tcp_rsk(req)->req_usec_ts, skb) + + tcp_rsk(req)->ts_off; opts->tsecr = READ_ONCE(req->ts_recent); remaining -= TCPOLEN_TSTAMP_ALIGNED; } @@ -943,7 +944,8 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (likely(tp->rx_opt.tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = skb ? tcp_skb_timestamp_ts(false, skb) + tp->tsoffset : 0; + opts->tsval = skb ? tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) + + tp->tsoffset : 0; opts->tsecr = tp->rx_opt.ts_recent; size += TCPOLEN_TSTAMP_ALIGNED; } @@ -3379,7 +3381,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) /* Save stamp of the first (attempted) retransmit. 
*/ if (!tp->retrans_stamp) - tp->retrans_stamp = tcp_skb_timestamp_ts(false, skb); + tp->retrans_stamp = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb); if (tp->undo_retrans < 0) tp->undo_retrans = 0; @@ -3665,6 +3667,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); memset(&opts, 0, sizeof(opts)); + if (tcp_rsk(req)->req_usec_ts < 0) + tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst); now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok)) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index bfcf3fe44c724..1f9f6c1c196b2 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -26,14 +26,18 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - u32 elapsed, start_ts, user_timeout; + const struct tcp_sock *tp = tcp_sk(sk); + u32 elapsed, user_timeout; s32 remaining; - start_ts = tcp_sk(sk)->retrans_stamp; user_timeout = READ_ONCE(icsk->icsk_user_timeout); if (!user_timeout) return icsk->icsk_rto; - elapsed = tcp_time_stamp_ts(tcp_sk(sk)) - start_ts; + + elapsed = tcp_time_stamp_ts(tp) - tp->retrans_stamp; + if (tp->tcp_usec_ts) + elapsed /= USEC_PER_MSEC; + remaining = user_timeout - elapsed; if (remaining <= 0) return 1; /* user timeout has passed; fire ASAP */ @@ -212,12 +216,13 @@ static bool retransmits_timed_out(struct sock *sk, unsigned int boundary, unsigned int timeout) { - unsigned int start_ts; + struct tcp_sock *tp = tcp_sk(sk); + unsigned int start_ts, delta; if (!inet_csk(sk)->icsk_retransmits) return false; - start_ts = tcp_sk(sk)->retrans_stamp; + start_ts = tp->retrans_stamp; if (likely(timeout == 0)) { unsigned int rto_base = TCP_RTO_MIN; @@ -226,7 +231,12 @@ static bool retransmits_timed_out(struct sock *sk, timeout = tcp_model_timeout(sk, boundary, rto_base); } - return (s32)(tcp_time_stamp_ts(tcp_sk(sk)) - start_ts - timeout) >= 0; + if (tp->tcp_usec_ts) { + /* delta maybe off up to a jiffy due to timer granularity. */ + delta = tp->tcp_mstamp - start_ts + jiffies_to_usecs(1); + return (s32)(delta - timeout * USEC_PER_MSEC) >= 0; + } + return (s32)(tcp_time_stamp_ts(tp) - start_ts - timeout) >= 0; } /* A write timeout has occurred. Process the after effects. 
*/ @@ -468,20 +478,18 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) } static bool tcp_rtx_probe0_timed_out(const struct sock *sk, - const struct sk_buff *skb) + const struct sk_buff *skb, + u32 rtx_delta) { const struct tcp_sock *tp = tcp_sk(sk); const int timeout = TCP_RTO_MAX * 2; - u32 rcv_delta, rtx_delta; + u32 rcv_delta; rcv_delta = inet_csk(sk)->icsk_timeout - tp->rcv_tstamp; if (rcv_delta <= timeout) return false; - rtx_delta = (u32)msecs_to_jiffies(tcp_time_stamp_ts(tp) - - (tp->retrans_stamp ?: tcp_skb_timestamp_ts(false, skb))); - - return rtx_delta > timeout; + return msecs_to_jiffies(rtx_delta) > timeout; } /** @@ -534,7 +542,11 @@ void tcp_retransmit_timer(struct sock *sk) struct inet_sock *inet = inet_sk(sk); u32 rtx_delta; - rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp_ts(false, skb)); + rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: + tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb)); + if (tp->tcp_usec_ts) + rtx_delta /= USEC_PER_MSEC; + if (sk->sk_family == AF_INET) { net_dbg_ratelimited("Probing zero-window on %pI4:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n", &inet->inet_daddr, ntohs(inet->inet_dport), @@ -551,7 +563,7 @@ void tcp_retransmit_timer(struct sock *sk) rtx_delta); } #endif - if (tcp_rtx_probe0_timed_out(sk, skb)) { + if (tcp_rtx_probe0_timed_out(sk, skb, rtx_delta)) { tcp_write_err(sk); goto out; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 1ee6517e9b2f9..0c8a14ba104f2 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -286,6 +286,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, goto failure; } + tp->tcp_usec_ts = dst_tcp_usec_ts(dst); tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (!saddr) { From a77a0f5c7f23a8a4981a2a3ff47baa91ceaf1f53 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:48 +0000 Subject: [PATCH 13/13] tcp: add TCPI_OPT_USEC_TS Add the ability to report in tcp_info.tcpi_options if a flow is using usec resolution in TCP TS val. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index d1d08da6331ab..8aa3916e14f6d 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -170,6 +170,7 @@ enum tcp_fastopen_client_fail { #define TCPI_OPT_ECN 8 /* ECN was negociated at TCP session init */ #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ +#define TCPI_OPT_USEC_TS 64 /* usec timestamps */ /* * Sender's congestion state indicating normal or abnormal situations diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b961364b4961c..a86d8200a1e86 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3760,6 +3760,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_options |= TCPI_OPT_ECN_SEEN; if (tp->syn_data_acked) info->tcpi_options |= TCPI_OPT_SYN_DATA; + if (tp->tcp_usec_ts) + info->tcpi_options |= TCPI_OPT_USEC_TS; info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); info->tcpi_ato = jiffies_to_usecs(min_t(u32, icsk->icsk_ack.ato,