Skip to content

Commit

Permalink
Merge branch 'tcp-switch-to-Early-Departure-Time-model'
Browse files Browse the repository at this point in the history
Eric Dumazet says:

====================
tcp: switch to Early Departure Time model

In the early days, pacing was implemented in sch_fq (FQ)
in a generic way:

- SO_MAX_PACING_RATE could be used by any sockets.

- TCP would vary effective pacing rate based on CWND*MSS/SRTT

- FQ would ensure delays between packets based on current
  sk->sk_pacing_rate, but with some quantum based artifacts.
  (inflating RPC tail latencies)

- BBR then tweaked the pacing rate in its various phases
  (PROBE, DRAIN, ...)

This worked reasonably well, but had the side effect that TCP RTT
samples would be inflated by the sojourn time of the packets in FQ.

Also note that when FQ is not used and TCP wants pacing, the
internal pacing fallback has very different behavior, since TCP
emits packets at the time they should be sent (with unreasonable
assumptions about scheduling costs)

Van Jacobson gave a talk at Netdev 0x12 in Montreal, about letting
TCP (or applications for UDP messages) decide the Earliest
Departure Time, instead of letting packet schedulers derive it
from pacing rate.

https://www.netdevconf.org/0x12/session.html?evolving-from-afap-teaching-nics-about-time
https://www.files.netdevconf.org/d/46def75c2ef345809bbe/files/?p=/Evolving%20from%20AFAP%20%E2%80%93%20Teaching%20NICs%20about%20time.pdf

Recent additions in Linux provided SO_TXTIME and a new ETF qdisc
supporting the new skb->tstamp role.

This patch series converts TCP and FQ to the same model.

This might in the future allow us to relax tight TSQ limits
(if FQ is present in the output path), and thus lower the
number of callbacks to tcp_write_xmit(), thanks to batching.

This will be followed by an FQ change allowing SO_TXTIME support
so that QUIC servers can let the pacing be done in FQ (or
offloaded if the network device permits).

For example, a TCP flow rated at 24Mbps now shows a more meaningful RTT

Before :

ESTAB  0  211408 10.246.7.151:41558   10.246.7.152:33723
	 cubic wscale:8,8 rto:203 rtt:2.195/0.084 mss:1448 rcvmss:536
  advmss:1448 cwnd:20 ssthresh:20 bytes_acked:36897937
  segs_out:25488 segs_in:12454 data_segs_out:25486
  send 105.5Mbps lastsnd:1 lastrcv:12851 lastack:1
  pacing_rate 24.0Mbps/24.0Mbps delivery_rate 22.9Mbps
  busy:12851ms unacked:4 rcv_space:29200 notsent:205616 minrtt:0.026

After :

ESTAB  0  192584 10.246.7.151:61612   10.246.7.152:34375
	 cubic wscale:8,8 rto:201 rtt:0.165/0.129 mss:1448 rcvmss:536
  advmss:1448 cwnd:20 ssthresh:20 bytes_acked:170755401
  segs_out:117931 segs_in:57651 data_segs_out:117929
  send 1404.1Mbps lastsnd:1 lastrcv:56915 lastack:1
  pacing_rate 24.0Mbps/24.0Mbps delivery_rate 24.2Mbps
  busy:56915ms unacked:4 rcv_space:29200 notsent:186792 minrtt:0.054

A nice side effect of this patch series is a reduction of max/p99
latencies of RPC workloads, since the FQ quantum no longer adds
artifacts.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Sep 22, 2018
2 parents 4f4b93a + 90caf67 commit a88e24f
Show file tree
Hide file tree
Showing 13 changed files with 103 additions and 128 deletions.
2 changes: 1 addition & 1 deletion include/linux/skbuff.h
Original file line number Diff line number Diff line change
Expand Up @@ -689,7 +689,7 @@ struct sk_buff {

union {
ktime_t tstamp;
u64 skb_mstamp;
u64 skb_mstamp_ns; /* earliest departure time */
};
/*
* This is the control buffer. It is free to use for every
Expand Down
2 changes: 2 additions & 0 deletions include/linux/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,8 @@ struct tcp_sock {
syn_smc:1; /* SYN includes SMC */
u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */

u64 tcp_wstamp_ns; /* departure time for next sent data packet */

/* RTT measurement */
u64 tcp_mstamp; /* most recent packet received/sent */
u32 srtt_us; /* smoothed round trip time << 3 in usecs */
Expand Down
26 changes: 11 additions & 15 deletions include/net/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -732,7 +732,7 @@ void tcp_send_window_probe(struct sock *sk);

static inline u64 tcp_clock_ns(void)
{
return local_clock();
return ktime_get_tai_ns();
}

static inline u64 tcp_clock_us(void)
Expand All @@ -752,17 +752,7 @@ static inline u32 tcp_time_stamp_raw(void)
return div_u64(tcp_clock_ns(), NSEC_PER_SEC / TCP_TS_HZ);
}


/* Refresh 1us clock of a TCP socket,
* ensuring monotonically increasing values.
*/
static inline void tcp_mstamp_refresh(struct tcp_sock *tp)
{
u64 val = tcp_clock_us();

if (val > tp->tcp_mstamp)
tp->tcp_mstamp = val;
}
void tcp_mstamp_refresh(struct tcp_sock *tp);

static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
{
Expand All @@ -771,7 +761,13 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)

static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
{
return div_u64(skb->skb_mstamp, USEC_PER_SEC / TCP_TS_HZ);
return div_u64(skb->skb_mstamp_ns, NSEC_PER_SEC / TCP_TS_HZ);
}

/* Provide the departure time of @skb in microseconds.
* skb->skb_mstamp_ns holds the value in nanoseconds, so it is divided
* by NSEC_PER_USEC before being returned.
*/
static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb)
{
return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC);
}


Expand Down Expand Up @@ -817,7 +813,7 @@ struct tcp_skb_cb {
#define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */
#define TCPCB_LOST 0x04 /* SKB is lost */
#define TCPCB_TAGBITS 0x07 /* All tag bits */
#define TCPCB_REPAIRED 0x10 /* SKB repaired (no skb_mstamp) */
#define TCPCB_REPAIRED 0x10 /* SKB repaired (no skb_mstamp_ns) */
#define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */
#define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \
TCPCB_REPAIRED)
Expand Down Expand Up @@ -1940,7 +1936,7 @@ static inline s64 tcp_rto_delta_us(const struct sock *sk)
{
const struct sk_buff *skb = tcp_rtx_queue_head(sk);
u32 rto = inet_csk(sk)->icsk_rto;
u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto);
u64 rto_time_stamp_us = tcp_skb_timestamp_us(skb) + jiffies_to_usecs(rto);

return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp;
}
Expand Down
2 changes: 1 addition & 1 deletion net/ipv4/syncookies.c
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ u64 cookie_init_timestamp(struct request_sock *req)
ts <<= TSBITS;
ts |= options;
}
return (u64)ts * (USEC_PER_SEC / TCP_TS_HZ);
return (u64)ts * (NSEC_PER_SEC / TCP_TS_HZ);
}


Expand Down
2 changes: 1 addition & 1 deletion net/ipv4/tcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -1295,7 +1295,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
copy = size_goal;

/* All packets are restored as if they have
* already been sent. skb_mstamp isn't set to
* already been sent. skb_mstamp_ns isn't set to
* avoid wrong rtt estimation.
*/
if (tp->repair)
Expand Down
7 changes: 4 additions & 3 deletions net/ipv4/tcp_bbr.c
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,9 @@ static const u32 bbr_probe_rtt_mode_ms = 200;
/* Skip TSO below the following bandwidth (bits/sec): */
static const int bbr_min_tso_rate = 1200000;

/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. */
static const int bbr_pacing_marging_percent = 1;

/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
* that will allow a smoothly increasing pacing rate that will double each RTT
* and send the same number of packets per RTT that an un-paced, slow-starting
Expand Down Expand Up @@ -208,12 +211,10 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
{
unsigned int mss = tcp_sk(sk)->mss_cache;

if (!tcp_needs_internal_pacing(sk))
mss = tcp_mss_to_mtu(sk, mss);
rate *= mss;
rate *= gain;
rate >>= BBR_SCALE;
rate *= USEC_PER_SEC;
rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_marging_percent);
return rate >> BW_SCALE;
}

Expand Down
11 changes: 6 additions & 5 deletions net/ipv4/tcp_input.c
Original file line number Diff line number Diff line change
Expand Up @@ -1305,7 +1305,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
*/
tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
start_seq, end_seq, dup_sack, pcount,
skb->skb_mstamp);
tcp_skb_timestamp_us(skb));
tcp_rate_skb_delivered(sk, skb, state->rate);

if (skb == tp->lost_skb_hint)
Expand Down Expand Up @@ -1580,7 +1580,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
TCP_SKB_CB(skb)->end_seq,
dup_sack,
tcp_skb_pcount(skb),
skb->skb_mstamp);
tcp_skb_timestamp_us(skb));
tcp_rate_skb_delivered(sk, skb, state->rate);
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
list_del_init(&skb->tcp_tsorted_anchor);
Expand Down Expand Up @@ -3103,7 +3103,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
tp->retrans_out -= acked_pcount;
flag |= FLAG_RETRANS_DATA_ACKED;
} else if (!(sacked & TCPCB_SACKED_ACKED)) {
last_ackt = skb->skb_mstamp;
last_ackt = tcp_skb_timestamp_us(skb);
WARN_ON_ONCE(last_ackt == 0);
if (!first_ackt)
first_ackt = last_ackt;
Expand All @@ -3121,7 +3121,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
tp->delivered += acked_pcount;
if (!tcp_skb_spurious_retrans(tp, skb))
tcp_rack_advance(tp, sacked, scb->end_seq,
skb->skb_mstamp);
tcp_skb_timestamp_us(skb));
}
if (sacked & TCPCB_LOST)
tp->lost_out -= acked_pcount;
Expand Down Expand Up @@ -3215,7 +3215,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
}
} else if (skb && rtt_update && sack_rtt_us >= 0 &&
sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) {
sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
tcp_skb_timestamp_us(skb))) {
/* Do not re-arm RTO if the sack RTT is measured from data sent
* after when the head was last (re)transmitted. Otherwise the
* timeout may continue to extend in loss recovery.
Expand Down
2 changes: 1 addition & 1 deletion net/ipv4/tcp_ipv4.c
Original file line number Diff line number Diff line change
Expand Up @@ -544,7 +544,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
BUG_ON(!skb);

tcp_mstamp_refresh(tp);
delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
remaining = icsk->icsk_rto -
usecs_to_jiffies(delta_us);

Expand Down
68 changes: 44 additions & 24 deletions net/ipv4/tcp_output.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,22 @@

#include <trace/events/tcp.h>

/* Refresh clocks of a TCP socket,
* ensuring monotonically increasing values.
*
* A single tcp_clock_ns() sample is used to advance both
* tp->tcp_wstamp_ns (nanoseconds) and tp->tcp_mstamp (microseconds).
* Each field is only moved forward: a sample that is not greater than
* the stored value leaves that field unchanged.
*/
void tcp_mstamp_refresh(struct tcp_sock *tp)
{
u64 val = tcp_clock_ns();

/* departure time for next data packet */
if (val > tp->tcp_wstamp_ns)
tp->tcp_wstamp_ns = val;

/* same sample, converted from nanoseconds to microseconds */
val = div_u64(val, NSEC_PER_USEC);
if (val > tp->tcp_mstamp)
tp->tcp_mstamp = val;
}

static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp);

Expand Down Expand Up @@ -977,28 +993,34 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
return HRTIMER_NORESTART;
}

static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
static void tcp_internal_pacing(struct sock *sk)
{
u64 len_ns;
u32 rate;

if (!tcp_needs_internal_pacing(sk))
return;
rate = sk->sk_pacing_rate;
if (!rate || rate == ~0U)
return;

len_ns = (u64)skb->len * NSEC_PER_SEC;
do_div(len_ns, rate);
hrtimer_start(&tcp_sk(sk)->pacing_timer,
ktime_add_ns(ktime_get(), len_ns),
ns_to_ktime(tcp_sk(sk)->tcp_wstamp_ns),
HRTIMER_MODE_ABS_PINNED_SOFT);
sock_hold(sk);
}

static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb)
{
skb->skb_mstamp = tp->tcp_mstamp;
struct tcp_sock *tp = tcp_sk(sk);

skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
if (sk->sk_pacing_status != SK_PACING_NONE) {
u32 rate = sk->sk_pacing_rate;

/* Original sch_fq does not pace first 10 MSS
* Note that tp->data_segs_out overflows after 2^32 packets,
* this is a minor annoyance.
*/
if (rate != ~0U && rate && tp->data_segs_out >= 10) {
tp->tcp_wstamp_ns += div_u64((u64)skb->len * NSEC_PER_SEC, rate);

tcp_internal_pacing(sk);
}
}
list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
}

Expand Down Expand Up @@ -1045,7 +1067,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
if (unlikely(!skb))
return -ENOBUFS;
}
skb->skb_mstamp = tp->tcp_mstamp;
skb->skb_mstamp_ns = tp->tcp_wstamp_ns;

inet = inet_sk(sk);
tcb = TCP_SKB_CB(skb);
Expand Down Expand Up @@ -1137,7 +1159,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
tcp_event_data_sent(tp, sk);
tp->data_segs_out += tcp_skb_pcount(skb);
tp->bytes_sent += skb->len - tcp_header_size;
tcp_internal_pacing(sk, skb);
}

if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
Expand All @@ -1149,8 +1170,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);

/* Our usage of tstamp should remain private */
skb->tstamp = 0;
/* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */

/* Cleanup our debris for IP stacks */
memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
Expand All @@ -1163,7 +1183,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
err = net_xmit_eval(err);
}
if (!err && oskb) {
tcp_update_skb_after_send(tp, oskb);
tcp_update_skb_after_send(sk, oskb);
tcp_rate_skb_sent(sk, oskb);
}
return err;
Expand Down Expand Up @@ -1966,7 +1986,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
head = tcp_rtx_queue_head(sk);
if (!head)
goto send_now;
age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
age = tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(head));
/* If next ACK is likely to come too late (half srtt), do not defer */
if (age < (tp->srtt_us >> 4))
goto send_now;
Expand Down Expand Up @@ -2312,7 +2332,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,

if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
/* "skb_mstamp_ns" is used as a start point for the retransmit timer */
tcp_update_skb_after_send(tp, skb);
tcp_update_skb_after_send(sk, skb);
goto repair; /* Skip network transmission */
}

Expand Down Expand Up @@ -2887,7 +2907,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
} tcp_skb_tsorted_restore(skb);

if (!err) {
tcp_update_skb_after_send(tp, skb);
tcp_update_skb_after_send(sk, skb);
tcp_rate_skb_sent(sk, skb);
}
} else {
Expand Down Expand Up @@ -3205,10 +3225,10 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
memset(&opts, 0, sizeof(opts));
#ifdef CONFIG_SYN_COOKIES
if (unlikely(req->cookie_ts))
skb->skb_mstamp = cookie_init_timestamp(req);
skb->skb_mstamp_ns = cookie_init_timestamp(req);
else
#endif
skb->skb_mstamp = tcp_clock_us();
skb->skb_mstamp_ns = tcp_clock_ns();

#ifdef CONFIG_TCP_MD5SIG
rcu_read_lock();
Expand Down Expand Up @@ -3424,7 +3444,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)

err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);

syn->skb_mstamp = syn_data->skb_mstamp;
syn->skb_mstamp_ns = syn_data->skb_mstamp_ns;

/* Now full SYN+DATA was cloned and sent (or not),
* remove the SYN from the original skb (syn_data)
Expand Down
15 changes: 8 additions & 7 deletions net/ipv4/tcp_rate.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,10 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
* bandwidth estimate.
*/
if (!tp->packets_out) {
tp->first_tx_mstamp = skb->skb_mstamp;
tp->delivered_mstamp = skb->skb_mstamp;
u64 tstamp_us = tcp_skb_timestamp_us(skb);

tp->first_tx_mstamp = tstamp_us;
tp->delivered_mstamp = tstamp_us;
}

TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp;
Expand Down Expand Up @@ -88,13 +90,12 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
rs->is_app_limited = scb->tx.is_app_limited;
rs->is_retrans = scb->sacked & TCPCB_RETRANS;

/* Record send time of most recently ACKed packet: */
tp->first_tx_mstamp = tcp_skb_timestamp_us(skb);
/* Find the duration of the "send phase" of this window: */
rs->interval_us = tcp_stamp_us_delta(
skb->skb_mstamp,
scb->tx.first_tx_mstamp);
rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
scb->tx.first_tx_mstamp);

/* Record send time of most recently ACKed packet: */
tp->first_tx_mstamp = skb->skb_mstamp;
}
/* Mark off the skb delivered once it's sacked to avoid being
* used again when it's cumulatively acked. For acked packets
Expand Down
5 changes: 3 additions & 2 deletions net/ipv4/tcp_recovery.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ static u32 tcp_rack_reo_wnd(const struct sock *sk)
s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd)
{
return tp->rack.rtt_us + reo_wnd -
tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(skb));
}

/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
Expand Down Expand Up @@ -91,7 +91,8 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
!(scb->sacked & TCPCB_SACKED_RETRANS))
continue;

if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
if (!tcp_rack_sent_after(tp->rack.mstamp,
tcp_skb_timestamp_us(skb),
tp->rack.end_seq, scb->end_seq))
break;

Expand Down
4 changes: 2 additions & 2 deletions net/ipv4/tcp_timer.c
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,7 @@ static void tcp_probe_timer(struct sock *sk)
*/
start_ts = tcp_skb_timestamp(skb);
if (!start_ts)
skb->skb_mstamp = tp->tcp_mstamp;
skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
else if (icsk->icsk_user_timeout &&
(s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout)
goto abort;
Expand Down Expand Up @@ -758,7 +758,7 @@ void tcp_init_xmit_timers(struct sock *sk)
{
inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
&tcp_keepalive_timer);
hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC,
hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_TAI,
HRTIMER_MODE_ABS_PINNED_SOFT);
tcp_sk(sk)->pacing_timer.function = tcp_pace_kick;

Expand Down
Loading

0 comments on commit a88e24f

Please sign in to comment.