Skip to content

Commit

Permalink
[TCP]: MTU probing
Browse files Browse the repository at this point in the history
Implementation of packetization layer path mtu discovery for TCP, based on
the internet-draft currently found at
<http://www.ietf.org/internet-drafts/draft-ietf-pmtud-method-05.txt>.

Signed-off-by: John Heffner <jheffner@psc.edu>
Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
John Heffner authored and David S. Miller committed Mar 21, 2006
1 parent 1d60290 commit 5d424d5
Show file tree
Hide file tree
Showing 9 changed files with 326 additions and 37 deletions.
2 changes: 2 additions & 0 deletions include/linux/sysctl.h
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,8 @@ enum
NET_TCP_CONG_CONTROL=110,
NET_TCP_ABC=111,
NET_IPV4_IPFRAG_MAX_DIST=112,
NET_TCP_MTU_PROBING=113,
NET_TCP_BASE_MSS=114,
};

enum {
Expand Down
13 changes: 13 additions & 0 deletions include/net/inet_connection_sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ struct inet_connection_sock_af_ops {
* @icsk_probes_out: unanswered 0 window probes
* @icsk_ext_hdr_len: Network protocol overhead (IP/IPv6 options)
* @icsk_ack: Delayed ACK control data
* @icsk_mtup: MTU probing control data
*/
struct inet_connection_sock {
/* inet_sock has to be the first member! */
Expand Down Expand Up @@ -104,6 +105,18 @@ struct inet_connection_sock {
__u16 last_seg_size; /* Size of last incoming segment */
__u16 rcv_mss; /* MSS used for delayed ACK decisions */
} icsk_ack;
/* Per-connection state for TCP MTU probing. */
struct {
/* NOTE(review): presumably the per-socket on/off switch for probing;
 * it is not read or written in the code visible here -- confirm.
 */
int enabled;

/* Range of MTUs to search.  A failed probe lowers search_high to
 * probe_size - 1; a successful probe raises search_low to probe_size.
 */
int search_high;
int search_low;

/* Information on the current probe.
 * probe_size is tested for non-zero before any probe handling and is
 * reset to 0 once the probe has either succeeded or failed.
 * probe_seq_start is compared with snd_una to recognise a lost probe;
 * probe_seq_end is compared with an ACKed skb's end_seq to recognise
 * a delivered one.
 */
int probe_size;
__u32 probe_seq_start;
__u32 probe_seq_end;
} icsk_mtup;
u32 icsk_ca_priv[16];
#define ICSK_CA_PRIV_SIZE (16 * sizeof(u32))
};
Expand Down
9 changes: 9 additions & 0 deletions include/net/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
/* Minimal RCV_MSS. */
#define TCP_MIN_RCVMSS 536U

/* The least MTU to use for probing */
#define TCP_BASE_MSS 512

/* After receiving this amount of duplicate ACKs fast retransmit starts. */
#define TCP_FASTRETRANS_THRESH 3

Expand Down Expand Up @@ -219,6 +222,8 @@ extern int sysctl_tcp_nometrics_save;
extern int sysctl_tcp_moderate_rcvbuf;
extern int sysctl_tcp_tso_win_divisor;
extern int sysctl_tcp_abc;
extern int sysctl_tcp_mtu_probing;
extern int sysctl_tcp_base_mss;

extern atomic_t tcp_memory_allocated;
extern atomic_t tcp_sockets_allocated;
Expand Down Expand Up @@ -447,6 +452,10 @@ extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,

extern void tcp_initialize_rcv_mss(struct sock *sk);

extern int tcp_mtu_to_mss(struct sock *sk, int pmtu);
extern int tcp_mss_to_mtu(struct sock *sk, int mss);
extern void tcp_mtup_init(struct sock *sk);

static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
{
tp->pred_flags = htonl((tp->tcp_header_len << 26) |
Expand Down
16 changes: 16 additions & 0 deletions net/ipv4/sysctl_net_ipv4.c
Original file line number Diff line number Diff line change
Expand Up @@ -664,6 +664,22 @@ ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = NET_TCP_MTU_PROBING,
.procname = "tcp_mtu_probing",
.data = &sysctl_tcp_mtu_probing,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = NET_TCP_BASE_MSS,
.procname = "tcp_base_mss",
.data = &sysctl_tcp_base_mss,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},

{ .ctl_name = 0 }
};
Expand Down
49 changes: 49 additions & 0 deletions net/ipv4/tcp_input.c
Original file line number Diff line number Diff line change
Expand Up @@ -1891,6 +1891,34 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
}
}

/* Record that the outstanding MTU probe was lost.
 *
 * The size that just failed is excluded from the search range by moving
 * the upper bound to one below it, and the "probe in flight" marker
 * (probe_size) is cleared.
 *
 * @sk: socket whose probe failed
 */
static void tcp_mtup_probe_failed(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const int failed_size = icsk->icsk_mtup.probe_size;

	icsk->icsk_mtup.probe_size = 0;
	icsk->icsk_mtup.search_high = failed_size - 1;
}

/* Record that the outstanding MTU probe was acknowledged.
 *
 * The sender's congestion state is rescaled for the larger segment size,
 * the probed size becomes the new lower bound of the search range, the
 * "probe in flight" marker (probe_size) is cleared and the MSS is
 * re-synchronised.
 *
 * @sk:  socket whose probe succeeded
 * @skb: ACKed buffer that covered the end of the probe (currently unused)
 */
static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	/* FIXME: breaks with very large cwnd */
	tp->prior_ssthresh = tcp_current_ssthresh(sk);
	/* cwnd is counted in segments: scale it by (MTU for the current
	 * MSS) / (probed size) now that each segment will be larger.
	 */
	tp->snd_cwnd = tp->snd_cwnd *
		       tcp_mss_to_mtu(sk, tp->mss_cache) /
		       icsk->icsk_mtup.probe_size;
	/* The integer division truncates; never leave the sender with a
	 * zero congestion window.
	 */
	if (!tp->snd_cwnd)
		tp->snd_cwnd = 1;
	tp->snd_cwnd_cnt = 0;
	tp->snd_cwnd_stamp = tcp_time_stamp;
	/* This is sender-side congestion state, so it belongs in
	 * snd_ssthresh; rcv_ssthresh is a receive-window field and must
	 * not be overwritten here.
	 */
	tp->snd_ssthresh = tcp_current_ssthresh(sk);

	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
	icsk->icsk_mtup.probe_size = 0;
	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
}


/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and
Expand Down Expand Up @@ -2023,6 +2051,17 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
return;
}

/* MTU probe failure: don't reduce cwnd */
if (icsk->icsk_ca_state < TCP_CA_CWR &&
icsk->icsk_mtup.probe_size &&
tp->snd_una == icsk->icsk_mtup.probe_seq_start) {
tcp_mtup_probe_failed(sk);
/* Restores the reduction we did in tcp_mtup_probe() */
tp->snd_cwnd++;
tcp_simple_retransmit(sk);
return;
}

/* Otherwise enter Recovery state */

if (IsReno(tp))
Expand Down Expand Up @@ -2243,6 +2282,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
tp->retrans_stamp = 0;
}

/* MTU probing checks */
if (icsk->icsk_mtup.probe_size) {
if (!after(icsk->icsk_mtup.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) {
tcp_mtup_probe_success(sk, skb);
}
}

if (sacked) {
if (sacked & TCPCB_RETRANS) {
if(sacked & TCPCB_SACKED_RETRANS)
Expand Down Expand Up @@ -4101,6 +4147,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
if (tp->rx_opt.sack_ok && sysctl_tcp_fack)
tp->rx_opt.sack_ok |= 2;

tcp_mtup_init(sk);
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
tcp_initialize_rcv_mss(sk);

Expand Down Expand Up @@ -4211,6 +4258,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
if (tp->ecn_flags&TCP_ECN_OK)
sock_set_flag(sk, SOCK_NO_LARGESEND);

tcp_mtup_init(sk);
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
tcp_initialize_rcv_mss(sk);

Expand Down Expand Up @@ -4399,6 +4447,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
*/
tp->lsndtime = tcp_time_stamp;

tcp_mtup_init(sk);
tcp_initialize_rcv_mss(sk);
tcp_init_buffer_space(sk);
tcp_fast_path_on(tp);
Expand Down
1 change: 1 addition & 0 deletions net/ipv4/tcp_ipv4.c
Original file line number Diff line number Diff line change
Expand Up @@ -900,6 +900,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
newinet->id = newtp->write_seq ^ jiffies;

tcp_mtup_init(newsk);
tcp_sync_mss(newsk, dst_mtu(dst));
newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
tcp_initialize_rcv_mss(newsk);
Expand Down
Loading

0 comments on commit 5d424d5

Please sign in to comment.