From bef5767f37cfe42b1cf1ae60f01cfb4e16a7a2b8 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 17 Apr 2018 23:18:46 -0700 Subject: [PATCH 1/4] tcp: better delivery accounting for SYN-ACK and SYN-data the tcp_sock:delivered has inconsistent accounting for SYN and FIN. 1. it counts pure FIN 2. it counts pure SYN 3. it counts SYN-data twice 4. it does not count SYN-ACK For congestion control perspective it does not matter much as C.C. only cares about the difference not the aboslute value. But the next patch would export this field to user-space so it's better to report the absolute value w/o these caveats. This patch counts SYN, SYN-ACK, or SYN-data delivery once always in the "delivered" field. Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f93687f97d805..2499248d4a679 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5567,9 +5567,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, return true; } tp->syn_data_acked = tp->syn_data; - if (tp->syn_data_acked) - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPFASTOPENACTIVE); + if (tp->syn_data_acked) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); + /* SYN-data is counted as two separate packets in tcp_ack() */ + if (tp->delivered > 1) + --tp->delivered; + } tcp_fastopen_add_skb(sk, synack); @@ -5901,6 +5904,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) } switch (sk->sk_state) { case TCP_SYN_RECV: + tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */ if (!tp->srtt_us) tcp_synack_rtt_meas(sk, req); From a77fa0104abd4348b9ae06ab0afe218ceb2455ea Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 17 Apr 2018 23:18:47 -0700 Subject: [PATCH 2/4] tcp: new helper to calculate newly delivered Add new helper tcp_newly_delivered() to prepare the ECN accounting change. Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2499248d4a679..01cce28f90caf 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3496,6 +3496,16 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit) tcp_xmit_retransmit_queue(sk); } +/* Returns the number of packets newly acked or sacked by the current ACK */ +static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 delivered; + + delivered = tp->delivered - prior_delivered; + return delivered; +} + /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { @@ -3619,7 +3629,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) sk_dst_confirm(sk); - delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ + delivered = tcp_newly_delivered(sk, delivered, flag); lost = tp->lost - lost; /* freshly marked lost */ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); @@ -3629,9 +3639,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ - if (flag & FLAG_DSACKING_ACK) + if (flag & FLAG_DSACKING_ACK) { tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, &rexmit); + tcp_newly_delivered(sk, delivered, flag); + } /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. @@ -3655,6 +3667,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) &sack_state); tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, &rexmit); + tcp_newly_delivered(sk, delivered, flag); tcp_xmit_recovery(sk, rexmit); } From e21db6f69a95b846ff04e31fe0a86004cbd000d7 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 17 Apr 2018 23:18:48 -0700 Subject: [PATCH 3/4] tcp: track total bytes delivered with ECN CE marks Introduce a new delivered_ce stat in tcp socket to estimate number of packets being marked with CE bits. The estimation is done via ACKs with ECE bit. Depending on the actual receiver behavior, the estimation could have biases. Since the TCP sender can't really see the CE bit in the data path, so the sender is technically counting packets marked delivered with the "ECE / ECN-Echo" flag set. With RFC3168 ECN, because the ECE bit is sticky, this count can drastically overestimate the nummber of CE-marked data packets With DCTCP-style ECN this should be reasonably precise unless there is loss in the ACK path, in which case it's not precise. With AccECN proposal this can be made still more precise, even in the case some degree of ACK loss. However this is sender's best estimate of CE information. Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 1 + net/ipv4/tcp_input.c | 2 ++ 3 files changed, 4 insertions(+) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 8f4c54986f974..20585d5c4e1c3 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -281,6 +281,7 @@ struct tcp_sock { * receiver in Recovery. */ u32 prr_out; /* Total number of pkts sent during Recovery. */ u32 delivered; /* Total data packets delivered incl. rexmits */ + u32 delivered_ce; /* Like the above but only ECE marked packets */ u32 lost; /* Total data packets lost incl. rexmits */ u32 app_limited; /* limited until "delivered" reaches this val */ u64 first_tx_mstamp; /* start of window send phase */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 438fbca96cd31..5a5ce6da4792b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2559,6 +2559,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_cnt = 0; tp->window_clamp = 0; + tp->delivered_ce = 0; tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; tcp_clear_retrans(tp); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 01cce28f90caf..b3bff9c206061 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3503,6 +3503,8 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) u32 delivered; delivered = tp->delivered - prior_delivered; + if (flag & FLAG_ECE) + tp->delivered_ce += delivered; return delivered; } From feb5f2ec646483fb66f9ad7218b1aad2a93a2a5c Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 17 Apr 2018 23:18:49 -0700 Subject: [PATCH 4/4] tcp: export packets delivery info Export data delivered and delivered with CE marks to 1) SNMP TCPDelivered and TCPDeliveredCE 2) getsockopt(TCP_INFO) 3) Timestamping API SOF_TIMESTAMPING_OPT_STATS Note that for SCM_TSTAMP_ACK, the delivery info in SOF_TIMESTAMPING_OPT_STATS is reported before the info was fully updated on the ACK. These stats help application monitor TCP delivery and ECN status on per host, per connection, even per message level. Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 2 ++ include/uapi/linux/tcp.h | 5 +++++ net/ipv4/proc.c | 2 ++ net/ipv4/tcp.c | 7 ++++++- net/ipv4/tcp_input.c | 6 +++++- 5 files changed, 20 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 33a70ece462f0..d02e859301ff4 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -276,6 +276,8 @@ enum LINUX_MIB_TCPKEEPALIVE, /* TCPKeepAlive */ LINUX_MIB_TCPMTUPFAIL, /* TCPMTUPFail */ LINUX_MIB_TCPMTUPSUCCESS, /* TCPMTUPSuccess */ + LINUX_MIB_TCPDELIVERED, /* TCPDelivered */ + LINUX_MIB_TCPDELIVEREDCE, /* TCPDeliveredCE */ __LINUX_MIB_MAX }; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 560374c978f90..379b08700a542 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -224,6 +224,9 @@ struct tcp_info { __u64 tcpi_busy_time; /* Time (usec) busy sending data */ __u64 tcpi_rwnd_limited; /* Time (usec) limited by receive window */ __u64 tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */ + + __u32 tcpi_delivered; + __u32 tcpi_delivered_ce; }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ @@ -244,6 +247,8 @@ enum { TCP_NLA_SNDQ_SIZE, /* Data (bytes) pending in send queue */ TCP_NLA_CA_STATE, /* ca_state of socket */ TCP_NLA_SND_SSTHRESH, /* Slow start size threshold */ + TCP_NLA_DELIVERED, /* Data pkts delivered incl. out-of-order */ + TCP_NLA_DELIVERED_CE, /* Like above but only ones w/ CE marks */ }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index a058de677e947..261b71d0ccc5c 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -296,6 +296,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE), SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL), SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS), + SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED), + SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5a5ce6da4792b..4022073b0aeea 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3167,6 +3167,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) rate64 = tcp_compute_delivery_rate(tp); if (rate64) info->tcpi_delivery_rate = rate64; + info->tcpi_delivered = tp->delivered; + info->tcpi_delivered_ce = tp->delivered_ce; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -3180,7 +3182,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) u32 rate; stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + - 5 * nla_total_size(sizeof(u32)) + + 7 * nla_total_size(sizeof(u32)) + 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC); if (!stats) return NULL; @@ -3211,9 +3213,12 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh); + nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered); + nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce); nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); + return stats; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b3bff9c206061..0396fb919b5d5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3499,12 +3499,16 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit) /* Returns the number of packets newly acked or sacked by the current ACK */ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) { + const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); u32 delivered; delivered = tp->delivered - prior_delivered; - if (flag & FLAG_ECE) + NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); + if (flag & FLAG_ECE) { tp->delivered_ce += delivered; + NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); + } return delivered; }