diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 1b8c964b0d175..4412f695a62f4 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -388,6 +388,16 @@ tcp_mtu_probing - INTEGER
 	  1 - Disabled by default, enabled when an ICMP black hole detected
 	  2 - Always enabled, use initial MSS of tcp_base_mss.
 
+tcp_probe_interval - INTEGER
+	Controls how often to start TCP Packetization-Layer Path MTU
+	Discovery reprobe. The default is reprobing every 10 minutes as
+	per RFC4821.
+
+tcp_probe_threshold - INTEGER
+	Controls when TCP Packetization-Layer Path MTU Discovery probing
+	will stop with respect to the width of the search range in bytes.
+	Default is 8 bytes.
+
 tcp_no_metrics_save - BOOLEAN
 	By default, TCP saves various connection metrics in the route cache
 	when the connection closes, so that connections established in the
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 5976bdecf58b0..b9a6b0a94cc6b 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -126,6 +126,8 @@ struct inet_connection_sock {
 
 		/* Information on the current probe. */
 		int		  probe_size;
+
+		u32		  probe_timestamp;
 	} icsk_mtup;
 	u32			  icsk_ca_priv[16];
 	u32			  icsk_user_timeout;
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 1085e12f940f0..8f3a1a1a5a94e 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -87,6 +87,8 @@ struct netns_ipv4 {
 	int sysctl_tcp_fwmark_accept;
 	int sysctl_tcp_mtu_probing;
 	int sysctl_tcp_base_mss;
+	int sysctl_tcp_probe_threshold;
+	u32 sysctl_tcp_probe_interval;
 
 	struct ping_group_range ping_group_range;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f87599d5af823..2e11e38205c22 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -65,7 +65,13 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCP_MIN_MSS		88U
 
 /* The least MTU to use for probing */
-#define TCP_BASE_MSS		512
+#define TCP_BASE_MSS		1024
+
+/* probing interval, defaults to 10 minutes as per RFC4821 */
+#define TCP_PROBE_INTERVAL	600
+
+/* Search range threshold (in bytes) below which tcp mtu probing will stop */
+#define TCP_PROBE_THRESHOLD	8
 
 /* After receiving this amount of duplicate ACKs fast retransmit starts. */
 #define TCP_FASTRETRANS_THRESH 3
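The new defaults interact as follows: probing now starts from a 1024-byte base MSS and binary-searches up toward the advertised clamp, giving up once the search window is narrower than TCP_PROBE_THRESHOLD bytes. The standalone sketch below is not part of the patch; the 1500-byte ceiling and the 1400-byte "true" path MTU are made-up example values, and probe_succeeds() is a hypothetical stand-in for the kernel's probe ACK/loss handling. It only walks the same midpoint-and-threshold arithmetic to show how quickly the search converges.

#include <stdio.h>

#define TCP_PROBE_THRESHOLD	8	/* same default the patch introduces */

/* Hypothetical stand-in for the kernel's probe outcome handling:
 * pretend the real path MTU is 1400 bytes, so probes at or below
 * that size are ACKed and larger ones are lost.
 */
static int probe_succeeds(int probe_mtu)
{
	return probe_mtu <= 1400;
}

int main(void)
{
	int search_low = 1024;		/* example starting point near the new base MSS */
	int search_high = 1500;		/* mss_clamp plus headers, example value */
	int probes = 0;

	/* Same midpoint/threshold arithmetic tcp_mtu_probe() now uses */
	while (search_high - search_low >= TCP_PROBE_THRESHOLD) {
		int probe_mtu = (search_high + search_low) >> 1;

		probes++;
		if (probe_succeeds(probe_mtu))
			search_low = probe_mtu;		/* probe ACKed: raise the floor */
		else
			search_high = probe_mtu;	/* probe lost: lower the ceiling */
	}
	printf("converged to %d after %d probes\n", search_low, probes);
	return 0;
}

With these example values the search settles at 1398, within the 8-byte threshold of the real MTU, after seven probes; the threshold is what bounds how long the connection keeps probing.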
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d151539da8e69..fdf899163d441 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -883,6 +883,20 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "tcp_probe_threshold",
+		.data		= &init_net.ipv4.sysctl_tcp_probe_threshold,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "tcp_probe_interval",
+		.data		= &init_net.ipv4.sysctl_tcp_probe_interval,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5a2dfed4783b6..f0c6fc32bfa83 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2460,6 +2460,8 @@ static int __net_init tcp_sk_init(struct net *net)
 	}
 	net->ipv4.sysctl_tcp_ecn = 2;
 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
+	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
+	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
 	return 0;
 
 fail:
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8bbd86cd81c82..5a73ad5afaf72 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1354,6 +1354,8 @@ void tcp_mtup_init(struct sock *sk)
 			       icsk->icsk_af_ops->net_header_len;
 	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
 	icsk->icsk_mtup.probe_size = 0;
+	if (icsk->icsk_mtup.enabled)
+		icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
 }
 EXPORT_SYMBOL(tcp_mtup_init);
 
@@ -1828,6 +1830,31 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 	return false;
 }
 
+static inline void tcp_mtu_check_reprobe(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
+	u32 interval;
+	s32 delta;
+
+	interval = net->ipv4.sysctl_tcp_probe_interval;
+	delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp;
+	if (unlikely(delta >= interval * HZ)) {
+		int mss = tcp_current_mss(sk);
+
+		/* Update current search range */
+		icsk->icsk_mtup.probe_size = 0;
+		icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
+			sizeof(struct tcphdr) +
+			icsk->icsk_af_ops->net_header_len;
+		icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
+
+		/* Update probe time stamp */
+		icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
+	}
+}
+
 /* Create a new MTU probe if we are ready.
  * MTU probe is regularly attempting to increase the path MTU by
  * deliberately sending larger packets. This discovers routing
@@ -1842,11 +1869,13 @@ static int tcp_mtu_probe(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct sk_buff *skb, *nskb, *next;
+	struct net *net = sock_net(sk);
 	int len;
 	int probe_size;
 	int size_needed;
 	int copy;
 	int mss_now;
+	int interval;
 
 	/* Not currently probing/verifying,
 	 * not in recovery,
@@ -1859,12 +1888,25 @@ static int tcp_mtu_probe(struct sock *sk)
 	    tp->rx_opt.num_sacks || tp->rx_opt.dsack)
 		return -1;
 
-	/* Very simple search strategy: just double the MSS. */
+	/* Use binary search for probe_size between tcp_base_mss
+	 * and the current mss_clamp. If (search_high - search_low) is
+	 * smaller than a threshold, back off from probing.
+	 */
 	mss_now = tcp_current_mss(sk);
-	probe_size = 2 * tp->mss_cache;
+	probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
+					 icsk->icsk_mtup.search_low) >> 1);
 	size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
-	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
-		/* TODO: set timer for probe_converge_event */
+	interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
+	/* When misfortune happens, we are reprobing actively,
+	 * and the reprobe timer has expired. Stick with the current
+	 * probing process by not resetting the search range to its original.
+	 */
+	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
+	    interval < net->ipv4.sysctl_tcp_probe_threshold) {
+		/* Check whether enough time has elapsed for
+		 * another round of probing.
+		 */
+		tcp_mtu_check_reprobe(sk);
 		return -1;
 	}
 
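tcp_mtu_check_reprobe() above compares a 32-bit timestamp delta against interval * HZ, and the u32 subtraction is modulo 2^32, so the elapsed-tick count stays meaningful even after tcp_time_stamp wraps. A minimal userspace sketch of the same arithmetic follows; it is not kernel code, and the HZ value and the two timestamps are made-up example numbers.

#include <stdio.h>
#include <stdint.h>

#define HZ 1000	/* assumed tick rate, only for the example */

/* Same shape as the check in tcp_mtu_check_reprobe(): modular u32
 * subtraction gives the elapsed ticks, then a signed compare against
 * interval * HZ decides whether a new probing round is due.
 */
static int reprobe_due(uint32_t now, uint32_t probe_timestamp, uint32_t interval)
{
	int32_t delta = now - probe_timestamp;

	return delta >= (int32_t)(interval * HZ);
}

int main(void)
{
	uint32_t interval = 600;	/* tcp_probe_interval default, in seconds */
	uint32_t stamp = 0xffffff00u;	/* stamped shortly before the counter wraps */

	/* Only 0x200 ticks later, even though 'now' is numerically smaller */
	printf("%d\n", reprobe_due(0x00000100u, stamp, interval));	/* prints 0 */

	/* Far more than 600 * HZ ticks later, well past the wrap */
	printf("%d\n", reprobe_due(0x000fff00u, stamp, interval));	/* prints 1 */
	return 0;
}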
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 0732b787904ed..15505936511d4 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -107,6 +107,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
 	if (net->ipv4.sysctl_tcp_mtu_probing) {
 		if (!icsk->icsk_mtup.enabled) {
 			icsk->icsk_mtup.enabled = 1;
+			icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
 			tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 		} else {
 			struct net *net = sock_net(sk);
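With the patch applied, the two knobs show up per network namespace under /proc/sys/net/ipv4/, following from the ctl_table entries added in sysctl_net_ipv4.c. The snippet below is a purely illustrative way to read the defaults back from procfs; it assumes the patched kernel is running.

#include <stdio.h>

/* Read one sysctl value back from procfs */
static long read_sysctl(const char *path)
{
	FILE *f = fopen(path, "r");
	long val = -1;

	if (f) {
		if (fscanf(f, "%ld", &val) != 1)
			val = -1;
		fclose(f);
	}
	return val;
}

int main(void)
{
	printf("tcp_probe_interval  = %ld\n",
	       read_sysctl("/proc/sys/net/ipv4/tcp_probe_interval"));	/* 600 by default */
	printf("tcp_probe_threshold = %ld\n",
	       read_sysctl("/proc/sys/net/ipv4/tcp_probe_threshold"));	/* 8 by default */
	return 0;
}

The values can be tuned the usual way through sysctl; for example, a shorter tcp_probe_interval makes a connection rediscover an increased path MTU sooner, at the cost of more frequent probing.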