Skip to content

Commit

Permalink
Merge branch 'tcp-ts-usec-resolution'
Browse files Browse the repository at this point in the history
Eric Dumazet says:

====================
tcp: add optional usec resolution to TCP TS

As discussed in various public places in 2016, Google adopted
usec resolution in RFC 7323 TS values, at Van Jacobson suggestion.

Goals were :

1) better observability of delays in networking stacks/fabrics.

2) better disambiguation of events based on TSval/ecr values.

3) building block for congestion control modules needing usec resolution.

Back then we implemented a schem based on private SYN options
to safely negotiate the feature.

For upstream submission, we chose to use a much simpler route
attribute because this feature is probably going to be used
in private networks.

ip route add 10/8 ... features tcp_usec_ts

References:

https://www.ietf.org/proceedings/97/slides/slides-97-tcpm-tcp-options-for-low-latency-00.pdf
https://datatracker.ietf.org/doc/draft-wang-tcpm-low-latency-opt/

First two patches are fixing old minor bugs and might be taken
by stable teams (thanks to appropriate Fixes: tags)
====================

Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Oct 23, 2023
2 parents 35c1b27 + a77a0f5 commit bdf24b4
Show file tree
Hide file tree
Showing 17 changed files with 193 additions and 104 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2259,7 +2259,7 @@ static void chtls_rx_ack(struct sock *sk, struct sk_buff *skb)

if (tp->snd_una != snd_una) {
tp->snd_una = snd_una;
tp->rcv_tstamp = tcp_time_stamp(tp);
tp->rcv_tstamp = tcp_jiffies32;
if (tp->snd_una == tp->snd_nxt &&
!csk_flag_nochk(csk, CSK_TX_FAILOVER))
csk_reset_flag(csk, CSK_TX_WAIT_IDLE);
Expand Down
9 changes: 8 additions & 1 deletion include/linux/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ struct tcp_request_sock {
u64 snt_synack; /* first SYNACK sent time */
bool tfo_listener;
bool is_mptcp;
s8 req_usec_ts;
#if IS_ENABLED(CONFIG_MPTCP)
bool drop_req;
#endif
Expand Down Expand Up @@ -257,7 +258,8 @@ struct tcp_sock {
u8 compressed_ack;
u8 dup_ack_counter:2,
tlp_retrans:1, /* TLP is a retransmission */
unused:5;
tcp_usec_ts:1, /* TSval values in usec */
unused:4;
u32 chrono_start; /* Start time in jiffies of a TCP chrono */
u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */
u8 chrono_type:2, /* current chronograph type */
Expand Down Expand Up @@ -576,4 +578,9 @@ void tcp_sock_set_quickack(struct sock *sk, int val);
int tcp_sock_set_syncnt(struct sock *sk, int val);
int tcp_sock_set_user_timeout(struct sock *sk, int val);

static inline bool dst_tcp_usec_ts(const struct dst_entry *dst)
{
return dst_feature(dst, RTAX_FEATURE_TCP_USEC_TS);
}

#endif /* _LINUX_TCP_H */
3 changes: 2 additions & 1 deletion include/net/inet_timewait_sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ struct inet_timewait_sock {
/* And these are ours. */
unsigned int tw_transparent : 1,
tw_flowlabel : 20,
tw_pad : 3, /* 3 bits hole */
tw_usec_ts : 1,
tw_pad : 2, /* 2 bits hole */
tw_tos : 8;
u32 tw_txhash;
u32 tw_priority;
Expand Down
59 changes: 43 additions & 16 deletions include/net/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,12 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
#define MAX_TCP_KEEPCNT 127
#define MAX_TCP_SYNCNT 127

#define TCP_PAWS_24DAYS (60 * 60 * 24 * 24)
/* Ensure that TCP PAWS checks are relaxed after ~2147 seconds
* to avoid overflows. This assumes a clock smaller than 1 Mhz.
* Default clock is 1 Khz, tcp_usec_ts uses 1 Mhz.
*/
#define TCP_PAWS_WRAP (INT_MAX / USEC_PER_SEC)

#define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated
* after this time. It should be equal
* (or greater than) TCP_TIMEWAIT_LEN
Expand Down Expand Up @@ -798,22 +803,31 @@ static inline u64 tcp_clock_us(void)
return div_u64(tcp_clock_ns(), NSEC_PER_USEC);
}

/* This should only be used in contexts where tp->tcp_mstamp is up to date */
static inline u32 tcp_time_stamp(const struct tcp_sock *tp)
static inline u64 tcp_clock_ms(void)
{
return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ);
return div_u64(tcp_clock_ns(), NSEC_PER_MSEC);
}

/* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */
static inline u32 tcp_ns_to_ts(u64 ns)
/* TCP Timestamp included in TS option (RFC 1323) can either use ms
* or usec resolution. Each socket carries a flag to select one or other
* resolution, as the route attribute could change anytime.
* Each flow must stick to initial resolution.
*/
static inline u32 tcp_clock_ts(bool usec_ts)
{
return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ);
return usec_ts ? tcp_clock_us() : tcp_clock_ms();
}

/* Could use tcp_clock_us() / 1000, but this version uses a single divide */
static inline u32 tcp_time_stamp_raw(void)
static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp)
{
return tcp_ns_to_ts(tcp_clock_ns());
return div_u64(tp->tcp_mstamp, USEC_PER_MSEC);
}

static inline u32 tcp_time_stamp_ts(const struct tcp_sock *tp)
{
if (tp->tcp_usec_ts)
return tp->tcp_mstamp;
return tcp_time_stamp_ms(tp);
}

void tcp_mstamp_refresh(struct tcp_sock *tp);
Expand All @@ -823,17 +837,30 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
return max_t(s64, t1 - t0, 0);
}

static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
{
return tcp_ns_to_ts(skb->skb_mstamp_ns);
}

/* provide the departure time in us unit */
static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb)
{
return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC);
}

/* Provide skb TSval in usec or ms unit */
static inline u32 tcp_skb_timestamp_ts(bool usec_ts, const struct sk_buff *skb)
{
if (usec_ts)
return tcp_skb_timestamp_us(skb);

return div_u64(skb->skb_mstamp_ns, NSEC_PER_MSEC);
}

static inline u32 tcp_tw_tsval(const struct tcp_timewait_sock *tcptw)
{
return tcp_clock_ts(tcptw->tw_sk.tw_usec_ts) + tcptw->tw_ts_offset;
}

static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq)
{
return tcp_clock_ts(treq->req_usec_ts) + treq->ts_off;
}

#define tcp_flag_byte(th) (((u_int8_t *)th)[13])

Expand Down Expand Up @@ -1599,7 +1626,7 @@ static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt,
if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win)
return true;
if (unlikely(!time_before32(ktime_get_seconds(),
rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS)))
rx_opt->ts_recent_stamp + TCP_PAWS_WRAP)))
return true;
/*
* Some OSes send SYN and SYNACK messages with tsval=0 tsecr=0,
Expand Down
18 changes: 11 additions & 7 deletions include/uapi/linux/rtnetlink.h
Original file line number Diff line number Diff line change
Expand Up @@ -502,13 +502,17 @@ enum {

#define RTAX_MAX (__RTAX_MAX - 1)

#define RTAX_FEATURE_ECN (1 << 0)
#define RTAX_FEATURE_SACK (1 << 1)
#define RTAX_FEATURE_TIMESTAMP (1 << 2)
#define RTAX_FEATURE_ALLFRAG (1 << 3)

#define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | \
RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG)
#define RTAX_FEATURE_ECN (1 << 0)
#define RTAX_FEATURE_SACK (1 << 1) /* unused */
#define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */
#define RTAX_FEATURE_ALLFRAG (1 << 3)
#define RTAX_FEATURE_TCP_USEC_TS (1 << 4)

#define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \
RTAX_FEATURE_SACK | \
RTAX_FEATURE_TIMESTAMP | \
RTAX_FEATURE_ALLFRAG | \
RTAX_FEATURE_TCP_USEC_TS)

struct rta_session {
__u8 proto;
Expand Down
1 change: 1 addition & 0 deletions include/uapi/linux/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ enum tcp_fastopen_client_fail {
#define TCPI_OPT_ECN 8 /* ECN was negociated at TCP session init */
#define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */
#define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */
#define TCPI_OPT_USEC_TS 64 /* usec timestamps */

/*
* Sender's congestion state indicating normal or abnormal situations
Expand Down
32 changes: 19 additions & 13 deletions net/ipv4/syncookies.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ static siphash_aligned_key_t syncookie_secret[2];
* requested/supported by the syn/synack exchange.
*/
#define TSBITS 6
#define TSMASK (((__u32)1 << TSBITS) - 1)

static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
u32 count, int c)
Expand All @@ -52,6 +51,14 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
count, &syncookie_secret[c]);
}

/* Convert one nsec 64bit timestamp to ts (ms or usec resolution) */
static u64 tcp_ns_to_ts(bool usec_ts, u64 val)
{
if (usec_ts)
return div_u64(val, NSEC_PER_USEC);

return div_u64(val, NSEC_PER_MSEC);
}

/*
* when syncookies are in effect and tcp timestamps are enabled we encode
Expand All @@ -62,27 +69,24 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
*/
u64 cookie_init_timestamp(struct request_sock *req, u64 now)
{
struct inet_request_sock *ireq;
u32 ts, ts_now = tcp_ns_to_ts(now);
const struct inet_request_sock *ireq = inet_rsk(req);
u64 ts, ts_now = tcp_ns_to_ts(false, now);
u32 options = 0;

ireq = inet_rsk(req);

options = ireq->wscale_ok ? ireq->snd_wscale : TS_OPT_WSCALE_MASK;
if (ireq->sack_ok)
options |= TS_OPT_SACK;
if (ireq->ecn_ok)
options |= TS_OPT_ECN;

ts = ts_now & ~TSMASK;
ts = (ts_now >> TSBITS) << TSBITS;
ts |= options;
if (ts > ts_now) {
ts >>= TSBITS;
ts--;
ts <<= TSBITS;
ts |= options;
}
return (u64)ts * (NSEC_PER_SEC / TCP_TS_HZ);
if (ts > ts_now)
ts -= (1UL << TSBITS);

if (tcp_rsk(req)->req_usec_ts)
return ts * NSEC_PER_USEC;
return ts * NSEC_PER_MSEC;
}


Expand Down Expand Up @@ -302,6 +306,8 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
treq->af_specific = af_ops;

treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
treq->req_usec_ts = -1;

#if IS_ENABLED(CONFIG_MPTCP)
treq->is_mptcp = sk_is_mptcp(sk);
if (treq->is_mptcp) {
Expand Down
26 changes: 18 additions & 8 deletions net/ipv4/tcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -3629,10 +3629,16 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
tp->fastopen_no_cookie = val;
break;
case TCP_TIMESTAMP:
if (!tp->repair)
if (!tp->repair) {
err = -EPERM;
else
WRITE_ONCE(tp->tsoffset, val - tcp_time_stamp_raw());
break;
}
/* val is an opaque field,
* and low order bit contains usec_ts enable bit.
* Its a best effort, and we do not care if user makes an error.
*/
tp->tcp_usec_ts = val & 1;
WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(tp->tcp_usec_ts));
break;
case TCP_REPAIR_WINDOW:
err = tcp_repair_set_window(tp, optval, optlen);
Expand Down Expand Up @@ -3754,6 +3760,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_options |= TCPI_OPT_ECN_SEEN;
if (tp->syn_data_acked)
info->tcpi_options |= TCPI_OPT_SYN_DATA;
if (tp->tcp_usec_ts)
info->tcpi_options |= TCPI_OPT_USEC_TS;

info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
info->tcpi_ato = jiffies_to_usecs(min_t(u32, icsk->icsk_ack.ato,
Expand Down Expand Up @@ -3817,10 +3825,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_total_rto = tp->total_rto;
info->tcpi_total_rto_recoveries = tp->total_rto_recoveries;
info->tcpi_total_rto_time = tp->total_rto_time;
if (tp->rto_stamp) {
info->tcpi_total_rto_time += tcp_time_stamp_raw() -
tp->rto_stamp;
}
if (tp->rto_stamp)
info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp;

unlock_sock_fast(sk, slow);
}
Expand Down Expand Up @@ -4145,7 +4151,11 @@ int do_tcp_getsockopt(struct sock *sk, int level,
break;

case TCP_TIMESTAMP:
val = tcp_time_stamp_raw() + READ_ONCE(tp->tsoffset);
val = tcp_clock_ts(tp->tcp_usec_ts) + READ_ONCE(tp->tsoffset);
if (tp->tcp_usec_ts)
val |= 1;
else
val &= ~1;
break;
case TCP_NOTSENT_LOWAT:
val = READ_ONCE(tp->notsent_lowat);
Expand Down
Loading

0 comments on commit bdf24b4

Please sign in to comment.