Merge branch 'tcp-do-not-use-tcp_time_stamp-for-rcv-autotuning'
Eric Dumazet says:

====================
tcp: do not use tcp_time_stamp for rcv autotuning

Some devices or Linux distributions use HZ=100 or HZ=250.

TCP receive buffer autotuning behaves poorly with those settings.
Since autotuning can only happen after 4 ms or 10 ms, short-distance flows
eventually get their receive buffer tuned to a very high value, but only after
an initial period during which it stays frozen at the (too small) initial value.

With BBR (or other congestion controls that allow the BDP to grow), we are
willing to increase tcp_rmem[2], but this receive-autotuning defect is a blocker
for hosts dealing with gazillions of TCP flows in the data centers,
since many of them end up with an inflated RCVBUF. The risk of OOM is too high.

Note that TSO autodefer, TCP CUBIC, and the TCP timestamps option (RFC 7323)
also suffer from our dependency on jiffies (via tcp_time_stamp).

We have ongoing efforts to improve all that in the future.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
David S. Miller committed Apr 26, 2017
2 parents 038a3e8 + 645f4c6 commit 4629498
Showing 6 changed files with 55 additions and 61 deletions.
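
To make the clock-granularity point in the cover letter concrete, here is a small standalone userspace sketch (illustrative only, not part of the patch; the helper and the 2 ms figure are assumptions). With HZ=100 or HZ=250, a jiffies-based delta reads a short data-center RTT as zero ticks, so the receive RTT estimate stays frozen, while a microsecond timestamp such as the skb_mstamp deltas introduced in these patches resolves it directly:

```c
/*
 * Standalone illustration (userspace, not kernel code) of the clock
 * granularity problem described in the cover letter. The helper and the
 * numbers are assumptions chosen for the example: a flow with a true RTT
 * of 2 ms, observed through clocks with different tick periods.
 */
#include <stdio.h>

/* Coarsest value a clock with the given tick period can report for a
 * delta of true_rtt_us: the floor to a whole number of ticks (a real
 * sample would be this value or one tick more, depending on phase). */
static unsigned int observed_rtt_us(unsigned int true_rtt_us,
				    unsigned int tick_us)
{
	return (true_rtt_us / tick_us) * tick_us;
}

int main(void)
{
	const unsigned int true_rtt_us = 2000;	/* 2 ms data-center flow */
	const struct {
		const char *clock;
		unsigned int tick_us;
	} cases[] = {
		{ "HZ=100 jiffies (10 ms tick)", 10000 },
		{ "HZ=250 jiffies (4 ms tick)",   4000 },
		{ "HZ=1000 jiffies (1 ms tick)",  1000 },
		{ "microsecond timestamp",           1 },
	};

	for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
		printf("%-30s sees ~%5u us (true RTT %u us)\n",
		       cases[i].clock,
		       observed_rtt_us(true_rtt_us, cases[i].tick_us),
		       true_rtt_us);
	return 0;
}
```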
13 changes: 7 additions & 6 deletions include/linux/tcp.h
@@ -240,6 +240,7 @@ struct tcp_sock {
u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */

/* RTT measurement */
+ struct skb_mstamp tcp_mstamp; /* most recent packet received/sent */
u32 srtt_us; /* smoothed round trip time << 3 in usecs */
u32 mdev_us; /* medium deviation */
u32 mdev_max_us; /* maximal mdev for the last rtt period */
@@ -332,16 +333,16 @@ struct tcp_sock {

/* Receiver side RTT estimation */
struct {
- u32 rtt;
- u32 seq;
- u32 time;
+ u32 rtt_us;
+ u32 seq;
+ struct skb_mstamp time;
} rcv_rtt_est;

/* Receiver queue space */
struct {
- int space;
- u32 seq;
- u32 time;
+ int space;
+ u32 seq;
+ struct skb_mstamp time;
} rcvq_space;

/* TCP-specific MTU probe information. */
7 changes: 3 additions & 4 deletions include/net/tcp.h
@@ -1004,7 +1004,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
struct rate_sample *rs);
void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
- struct skb_mstamp *now, struct rate_sample *rs);
+ struct rate_sample *rs);
void tcp_rate_check_app_limited(struct sock *sk);

/* These functions determine how the current flow behaves in respect of SACK
@@ -1853,10 +1853,9 @@ void tcp_v4_init(void);
void tcp_init(void);

/* tcp_recovery.c */
- extern void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now);
+ extern void tcp_rack_mark_lost(struct sock *sk);
extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
- const struct skb_mstamp *xmit_time,
- const struct skb_mstamp *ack_time);
+ const struct skb_mstamp *xmit_time);
extern void tcp_rack_reo_timeout(struct sock *sk);

/*
2 changes: 1 addition & 1 deletion net/ipv4/tcp.c
@@ -2853,7 +2853,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_snd_ssthresh = tp->snd_ssthresh;
info->tcpi_advmss = tp->advmss;

- info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
+ info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3;
info->tcpi_rcv_space = tp->rcvq_space.space;

info->tcpi_total_retrans = tp->total_retrans;
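
The tcpi_rcv_rtt value exported here stays in microseconds for userspace; only the kernel-internal estimator gains resolution. As a hedged illustration (the setup is assumed; only the getsockopt(TCP_INFO) call and the tcp_info fields are standard), a userspace reader of these estimates for an already-connected TCP socket could look like:

```c
/*
 * Illustrative userspace helper (not part of the patch): read the
 * receiver-side RTT estimate exported by tcp_get_info() above.
 * tcpi_rcv_rtt is reported in microseconds both before and after this
 * change. Assumes `fd` is an already-connected TCP socket.
 */
#include <stdio.h>
#include <string.h>
#include <netinet/in.h>    /* IPPROTO_TCP */
#include <netinet/tcp.h>   /* TCP_INFO, struct tcp_info */
#include <sys/socket.h>

static void print_rcv_estimates(int fd)
{
	struct tcp_info info;
	socklen_t len = sizeof(info);

	memset(&info, 0, sizeof(info));
	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
		printf("rcv_rtt=%u us, rcv_space=%u bytes\n",
		       info.tcpi_rcv_rtt, info.tcpi_rcv_space);
}
```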
69 changes: 33 additions & 36 deletions net/ipv4/tcp_input.c
@@ -442,7 +442,8 @@ void tcp_init_buffer_space(struct sock *sk)
tcp_sndbuf_expand(sk);

tp->rcvq_space.space = tp->rcv_wnd;
- tp->rcvq_space.time = tcp_time_stamp;
+ skb_mstamp_get(&tp->tcp_mstamp);
+ tp->rcvq_space.time = tp->tcp_mstamp;
tp->rcvq_space.seq = tp->copied_seq;

maxwin = tcp_full_space(sk);
@@ -518,7 +519,7 @@ EXPORT_SYMBOL(tcp_initialize_rcv_mss);
*/
static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
{
- u32 new_sample = tp->rcv_rtt_est.rtt;
+ u32 new_sample = tp->rcv_rtt_est.rtt_us;
long m = sample;

if (m == 0)
@@ -548,21 +549,23 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
new_sample = m << 3;
}

- if (tp->rcv_rtt_est.rtt != new_sample)
- tp->rcv_rtt_est.rtt = new_sample;
+ tp->rcv_rtt_est.rtt_us = new_sample;
}

static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
{
- if (tp->rcv_rtt_est.time == 0)
+ u32 delta_us;
+
+ if (tp->rcv_rtt_est.time.v64 == 0)
goto new_measure;
if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
return;
- tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1);
+ delta_us = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcv_rtt_est.time);
+ tcp_rcv_rtt_update(tp, delta_us, 1);

new_measure:
tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
- tp->rcv_rtt_est.time = tcp_time_stamp;
+ tp->rcv_rtt_est.time = tp->tcp_mstamp;
}

static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
@@ -572,7 +575,10 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
if (tp->rx_opt.rcv_tsecr &&
(TCP_SKB_CB(skb)->end_seq -
TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
- tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
+ tcp_rcv_rtt_update(tp,
+ jiffies_to_usecs(tcp_time_stamp -
+ tp->rx_opt.rcv_tsecr),
+ 0);
}

/*
@@ -585,8 +591,8 @@ void tcp_rcv_space_adjust(struct sock *sk)
int time;
int copied;

- time = tcp_time_stamp - tp->rcvq_space.time;
- if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
+ time = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcvq_space.time);
+ if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
return;

/* Number of bytes copied to user in last RTT */
@@ -642,7 +648,7 @@ void tcp_rcv_space_adjust(struct sock *sk)

new_measure:
tp->rcvq_space.seq = tp->copied_seq;
- tp->rcvq_space.time = tcp_time_stamp;
+ tp->rcvq_space.time = tp->tcp_mstamp;
}

/* There is something which you must keep in mind when you analyze the
@@ -1131,7 +1137,6 @@ struct tcp_sacktag_state {
*/
struct skb_mstamp first_sackt;
struct skb_mstamp last_sackt;
- struct skb_mstamp ack_time; /* Timestamp when the S/ACK was received */
struct rate_sample *rate;
int flag;
};
@@ -1214,8 +1219,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
return sacked;

if (!(sacked & TCPCB_SACKED_ACKED)) {
- tcp_rack_advance(tp, sacked, end_seq,
- xmit_time, &state->ack_time);
+ tcp_rack_advance(tp, sacked, end_seq, xmit_time);

if (sacked & TCPCB_SACKED_RETRANS) {
/* If the segment is not tagged as lost,
@@ -2760,16 +2764,15 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked)
return false;
}

- static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag,
- const struct skb_mstamp *ack_time)
+ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag)
{
struct tcp_sock *tp = tcp_sk(sk);

/* Use RACK to detect loss */
if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
u32 prior_retrans = tp->retrans_out;

- tcp_rack_mark_lost(sk, ack_time);
+ tcp_rack_mark_lost(sk);
if (prior_retrans > tp->retrans_out)
*ack_flag |= FLAG_LOST_RETRANS;
}
@@ -2788,8 +2791,7 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag,
* tcp_xmit_retransmit_queue().
*/
static void tcp_fastretrans_alert(struct sock *sk, const int acked,
- bool is_dupack, int *ack_flag, int *rexmit,
- const struct skb_mstamp *ack_time)
+ bool is_dupack, int *ack_flag, int *rexmit)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -2857,11 +2859,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
tcp_try_keep_open(sk);
return;
}
- tcp_rack_identify_loss(sk, ack_flag, ack_time);
+ tcp_rack_identify_loss(sk, ack_flag);
break;
case TCP_CA_Loss:
tcp_process_loss(sk, flag, is_dupack, rexmit);
- tcp_rack_identify_loss(sk, ack_flag, ack_time);
+ tcp_rack_identify_loss(sk, ack_flag);
if (!(icsk->icsk_ca_state == TCP_CA_Open ||
(*ack_flag & FLAG_LOST_RETRANS)))
return;
@@ -2877,7 +2879,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
if (icsk->icsk_ca_state <= TCP_CA_Disorder)
tcp_try_undo_dsack(sk);

- tcp_rack_identify_loss(sk, ack_flag, ack_time);
+ tcp_rack_identify_loss(sk, ack_flag);
if (!tcp_time_to_recover(sk, flag)) {
tcp_try_to_open(sk, flag);
return;
@@ -3059,8 +3061,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct skb_mstamp first_ackt, last_ackt;
- struct skb_mstamp *now = &sack->ack_time;
struct tcp_sock *tp = tcp_sk(sk);
+ struct skb_mstamp *now = &tp->tcp_mstamp;
u32 prior_sacked = tp->sacked_out;
u32 reord = tp->packets_out;
bool fully_acked = true;
@@ -3120,8 +3122,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->delivered += acked_pcount;
if (!tcp_skb_spurious_retrans(tp, skb))
tcp_rack_advance(tp, sacked, scb->end_seq,
- &skb->skb_mstamp,
- &sack->ack_time);
+ &skb->skb_mstamp);
}
if (sacked & TCPCB_LOST)
tp->lost_out -= acked_pcount;
@@ -3576,8 +3577,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, tp->snd_nxt))
goto invalid_ack;

- skb_mstamp_get(&sack_state.ack_time);
-
if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);

@@ -3647,8 +3646,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)

if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
- tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
- &sack_state.ack_time);
+ tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
}
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
@@ -3660,17 +3658,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_schedule_loss_probe(sk);
delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
lost = tp->lost - lost; /* freshly marked lost */
- tcp_rate_gen(sk, delivered, lost, &sack_state.ack_time,
- sack_state.rate);
+ tcp_rate_gen(sk, delivered, lost, sack_state.rate);
tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
tcp_xmit_recovery(sk, rexmit);
return 1;

no_queue:
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK)
- tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
- &sack_state.ack_time);
+ tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
@@ -3691,11 +3687,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
* If data was DSACKed, see if we can undo a cwnd reduction.
*/
if (TCP_SKB_CB(skb)->sacked) {
- skb_mstamp_get(&sack_state.ack_time);
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_state);
- tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
- &sack_state.ack_time);
+ tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
tcp_xmit_recovery(sk, rexmit);
}

@@ -5362,6 +5356,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
{
struct tcp_sock *tp = tcp_sk(sk);

+ skb_mstamp_get(&tp->tcp_mstamp);
if (unlikely(!sk->sk_rx_dst))
inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
/*
@@ -5922,6 +5917,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)

case TCP_SYN_SENT:
tp->rx_opt.saw_tstamp = 0;
+ skb_mstamp_get(&tp->tcp_mstamp);
queued = tcp_rcv_synsent_state_process(sk, skb, th);
if (queued >= 0)
return queued;
@@ -5933,6 +5929,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
return 0;
}

+ skb_mstamp_get(&tp->tcp_mstamp);
tp->rx_opt.saw_tstamp = 0;
req = tp->fastopen_rsk;
if (req) {
7 changes: 4 additions & 3 deletions net/ipv4/tcp_rate.c
@@ -106,7 +106,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,

/* Update the connection delivery information and generate a rate sample. */
void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
- struct skb_mstamp *now, struct rate_sample *rs)
+ struct rate_sample *rs)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 snd_us, ack_us;
@@ -120,7 +120,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
* to carry current time, flags, stats like "tcp_sacktag_state".
*/
if (delivered)
- tp->delivered_mstamp = *now;
+ tp->delivered_mstamp = tp->tcp_mstamp;

rs->acked_sacked = delivered; /* freshly ACKed or SACKed */
rs->losses = lost; /* freshly marked lost */
@@ -138,7 +138,8 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
* longer phase.
*/
snd_us = rs->interval_us; /* send phase */
- ack_us = skb_mstamp_us_delta(now, &rs->prior_mstamp); /* ack phase */
+ ack_us = skb_mstamp_us_delta(&tp->tcp_mstamp,
+ &rs->prior_mstamp); /* ack phase */
rs->interval_us = max(snd_us, ack_us);

/* Normally we expect interval_us >= min-rtt.