From dcd8fb8533ceb493146ce030d15f7965b82d0c27 Mon Sep 17 00:00:00 2001 From: Fan Du Date: Fri, 6 Mar 2015 11:18:22 +0800 Subject: [PATCH 1/4] ipv4: Raise tcp PMTU probe mss base size Quotes from RFC4821 7.2. Selecting Initial Values It is RECOMMENDED that search_low be initially set to an MTU size that is likely to work over a very wide range of environments. Given today's technologies, a value of 1024 bytes is probably safe enough. The initial value for search_low SHOULD be configurable. Moreover, set a small value will introduce extra time for the search to converge. So set the initial probe base mss size to 1024 Bytes. Signed-off-by: Fan Du Acked-by: John Heffner Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index f87599d5af823..834089b0cffc7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -65,7 +65,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCP_MIN_MSS 88U /* The least MTU to use for probing */ -#define TCP_BASE_MSS 512 +#define TCP_BASE_MSS 1024 /* After receiving this amount of duplicate ACKs fast retransmit starts. */ #define TCP_FASTRETRANS_THRESH 3 From 6b58e0a5f32dedb609438bb9c9c82aa6e23381f2 Mon Sep 17 00:00:00 2001 From: Fan Du Date: Fri, 6 Mar 2015 11:18:23 +0800 Subject: [PATCH 2/4] ipv4: Use binary search to choose tcp PMTU probe_size Current probe_size is chosen by doubling mss_cache, the probing process will end shortly with a sub-optimal mss size, and the link mtu will not be taken full advantage of, in return, this will make user to tweak tcp_base_mss with care. Use binary search to choose probe_size in a fine granularity manner, an optimal mss will be found to boost performance as its maxmium. In addition, introduce a sysctl_tcp_probe_threshold to control when probing will stop in respect to the width of search range. Test env: Docker instance with vxlan encapuslation(82599EB) iperf -c 10.0.0.24 -t 60 before this patch: 1.26 Gbits/sec After this patch: increase 26% 1.59 Gbits/sec Signed-off-by: Fan Du Acked-by: John Heffner Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 3 +++ net/ipv4/sysctl_net_ipv4.c | 7 +++++++ net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_output.c | 14 +++++++++++--- 5 files changed, 23 insertions(+), 3 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 1085e12f940f0..e051d399fa170 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -87,6 +87,7 @@ struct netns_ipv4 { int sysctl_tcp_fwmark_accept; int sysctl_tcp_mtu_probing; int sysctl_tcp_base_mss; + int sysctl_tcp_probe_threshold; struct ping_group_range ping_group_range; diff --git a/include/net/tcp.h b/include/net/tcp.h index 834089b0cffc7..1ad82e334e276 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -67,6 +67,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* The least MTU to use for probing */ #define TCP_BASE_MSS 1024 +/* Specify interval when tcp mtu probing will stop */ +#define TCP_PROBE_THRESHOLD 8 + /* After receiving this amount of duplicate ACKs fast retransmit starts. */ #define TCP_FASTRETRANS_THRESH 3 diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d151539da8e69..d3c09c12ee815 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -883,6 +883,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tcp_probe_threshold", + .data = &init_net.ipv4.sysctl_tcp_probe_threshold, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5a2dfed4783b6..35790d977a2b9 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2460,6 +2460,7 @@ static int __net_init tcp_sk_init(struct net *net) } net->ipv4.sysctl_tcp_ecn = 2; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; + net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; return 0; fail: diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8bbd86cd81c82..ed024cbb097f6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1842,11 +1842,13 @@ static int tcp_mtu_probe(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb, *nskb, *next; + struct net *net = sock_net(sk); int len; int probe_size; int size_needed; int copy; int mss_now; + int interval; /* Not currently probing/verifying, * not in recovery, @@ -1859,11 +1861,17 @@ static int tcp_mtu_probe(struct sock *sk) tp->rx_opt.num_sacks || tp->rx_opt.dsack) return -1; - /* Very simple search strategy: just double the MSS. */ + /* Use binary search for probe_size between tcp_mss_base, + * and current mss_clamp. if (search_high - search_low) + * smaller than a threshold, backoff from probing. + */ mss_now = tcp_current_mss(sk); - probe_size = 2 * tp->mss_cache; + probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high + + icsk->icsk_mtup.search_low) >> 1); size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache; - if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) { + interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low; + if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) || + interval < max(1, net->ipv4.sysctl_tcp_probe_threshold)) { /* TODO: set timer for probe_converge_event */ return -1; } From 05cbc0db03e82128f2e7e353d4194dd24a1627fe Mon Sep 17 00:00:00 2001 From: Fan Du Date: Fri, 6 Mar 2015 11:18:24 +0800 Subject: [PATCH 3/4] ipv4: Create probe timer for tcp PMTU as per RFC4821 As per RFC4821 7.3. Selecting Probe Size, a probe timer should be armed once probing has converged. Once this timer expired, probing again to take advantage of any path PMTU change. The recommended probing interval is 10 minutes per RFC1981. Probing interval could be sysctled by sysctl_tcp_probe_interval. Eric Dumazet suggested to implement pseudo timer based on 32bits jiffies tcp_time_stamp instead of using classic timer for such rare event. Signed-off-by: Fan Du Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 2 ++ include/net/netns/ipv4.h | 1 + include/net/tcp.h | 3 +++ net/ipv4/sysctl_net_ipv4.c | 7 ++++++ net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_output.c | 38 ++++++++++++++++++++++++++++-- net/ipv4/tcp_timer.c | 1 + 7 files changed, 51 insertions(+), 2 deletions(-) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 5976bdecf58b0..b9a6b0a94cc6b 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -126,6 +126,8 @@ struct inet_connection_sock { /* Information on the current probe. */ int probe_size; + + u32 probe_timestamp; } icsk_mtup; u32 icsk_ca_priv[16]; u32 icsk_user_timeout; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e051d399fa170..8f3a1a1a5a94e 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -88,6 +88,7 @@ struct netns_ipv4 { int sysctl_tcp_mtu_probing; int sysctl_tcp_base_mss; int sysctl_tcp_probe_threshold; + u32 sysctl_tcp_probe_interval; struct ping_group_range ping_group_range; diff --git a/include/net/tcp.h b/include/net/tcp.h index 1ad82e334e276..2e11e38205c22 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -67,6 +67,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* The least MTU to use for probing */ #define TCP_BASE_MSS 1024 +/* probing interval, default to 10 minutes as per RFC4821 */ +#define TCP_PROBE_INTERVAL 600 + /* Specify interval when tcp mtu probing will stop */ #define TCP_PROBE_THRESHOLD 8 diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d3c09c12ee815..fdf899163d441 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -890,6 +890,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tcp_probe_interval", + .data = &init_net.ipv4.sysctl_tcp_probe_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 35790d977a2b9..f0c6fc32bfa83 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2461,6 +2461,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_ecn = 2; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; + net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; return 0; fail: diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ed024cbb097f6..5a73ad5afaf72 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1354,6 +1354,8 @@ void tcp_mtup_init(struct sock *sk) icsk->icsk_af_ops->net_header_len; icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss); icsk->icsk_mtup.probe_size = 0; + if (icsk->icsk_mtup.enabled) + icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; } EXPORT_SYMBOL(tcp_mtup_init); @@ -1828,6 +1830,31 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, return false; } +static inline void tcp_mtu_check_reprobe(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); + u32 interval; + s32 delta; + + interval = net->ipv4.sysctl_tcp_probe_interval; + delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp; + if (unlikely(delta >= interval * HZ)) { + int mss = tcp_current_mss(sk); + + /* Update current search range */ + icsk->icsk_mtup.probe_size = 0; + icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + + sizeof(struct tcphdr) + + icsk->icsk_af_ops->net_header_len; + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); + + /* Update probe time stamp */ + icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; + } +} + /* Create a new MTU probe if we are ready. * MTU probe is regularly attempting to increase the path MTU by * deliberately sending larger packets. This discovers routing @@ -1870,9 +1897,16 @@ static int tcp_mtu_probe(struct sock *sk) icsk->icsk_mtup.search_low) >> 1); size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache; interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low; + /* When misfortune happens, we are reprobing actively, + * and then reprobe timer has expired. We stick with current + * probing process by not resetting search range to its orignal. + */ if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) || - interval < max(1, net->ipv4.sysctl_tcp_probe_threshold)) { - /* TODO: set timer for probe_converge_event */ + interval < net->ipv4.sysctl_tcp_probe_threshold) { + /* Check whether enough time has elaplased for + * another round of probing. + */ + tcp_mtu_check_reprobe(sk); return -1; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 0732b787904ed..15505936511d4 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -107,6 +107,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) if (net->ipv4.sysctl_tcp_mtu_probing) { if (!icsk->icsk_mtup.enabled) { icsk->icsk_mtup.enabled = 1; + icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { struct net *net = sock_net(sk); From fab42760843734a82b6b2d1241ca44f375a686eb Mon Sep 17 00:00:00 2001 From: Fan Du Date: Fri, 6 Mar 2015 11:18:25 +0800 Subject: [PATCH 4/4] ipv4: Documenting two sysctls for tcp PMTU probe Namely tcp_probe_interval to control how often to restart a probe. And tcp_probe_threshold to control when stop the probing in respect to the width of search range in bytes Signed-off-by: Fan Du Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 1b8c964b0d175..4412f695a62f4 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -388,6 +388,16 @@ tcp_mtu_probing - INTEGER 1 - Disabled by default, enabled when an ICMP black hole detected 2 - Always enabled, use initial MSS of tcp_base_mss. +tcp_probe_interval - INTEGER + Controls how often to start TCP Packetization-Layer Path MTU + Discovery reprobe. The default is reprobing every 10 minutes as + per RFC4821. + +tcp_probe_threshold - INTEGER + Controls when TCP Packetization-Layer Path MTU Discovery probing + will stop in respect to the width of search range in bytes. Default + is 8 bytes. + tcp_no_metrics_save - BOOLEAN By default, TCP saves various connection metrics in the route cache when the connection closes, so that connections established in the