Skip to content

Commit

Permalink
Merge branch 'tcp-implement-SACK-compression'
Browse files Browse the repository at this point in the history
Eric Dumazet says:

====================
tcp: implement SACK compression

When TCP receives an out-of-order packet, it immediately sends
a SACK packet, generating network load but also forcing the
receiver to send 1-MSS pathological packets, increasing its
RTX queue length/depth, and thus processing time.

Wifi networks suffer from this aggressive behavior, but generally
speaking, all these SACK packets add fuel to the fire when networks
are under congestion.

This patch series adds SACK compression, but the infrastructure
could be leveraged to also compress ACK in the future.

v2: Addressed Neal feedback.
    Added two sysctls to allow fine tuning, or even disabling the feature.

v3: take rtt = min(srtt, rcv_rtt) as Yuchung suggested, because rcv_rtt
    can be over estimated for RPC (or sender limited)
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed May 18, 2018
2 parents 64a2658 + 9c21d2f commit 2c47a65
Show file tree
Hide file tree
Showing 12 changed files with 107 additions and 9 deletions.
13 changes: 13 additions & 0 deletions Documentation/networking/ip-sysctl.txt
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,19 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max
tcp_sack - BOOLEAN
Enable select acknowledgments (SACKS).

tcp_comp_sack_delay_ns - LONG INTEGER
TCP tries to reduce the number of SACKs sent, using a timer
based on 5% of SRTT, capped by this sysctl, in nanoseconds.
The default is 1ms, based on TSO autosizing period.

Default : 1,000,000 ns (1 ms)

tcp_comp_sack_nr - INTEGER
Max number of SACKs that can be compressed.
Using 0 disables SACK compression.

Default : 44

tcp_slow_start_after_idle - BOOLEAN
If set, provide RFC2861 behavior and time out the congestion
window after an idle period. An idle period is defined at
Expand Down
2 changes: 2 additions & 0 deletions include/linux/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ struct tcp_sock {
reord:1; /* reordering detected */
} rack;
u16 advmss; /* Advertised MSS */
u8 compressed_ack;
u32 chrono_start; /* Start time in jiffies of a TCP chrono */
u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */
u8 chrono_type:2, /* current chronograph type */
Expand Down Expand Up @@ -297,6 +298,7 @@ struct tcp_sock {
u32 sacked_out; /* SACK'd packets */

struct hrtimer pacing_timer;
struct hrtimer compressed_ack_timer;

/* from STCP, retrans queue hinting */
struct sk_buff* lost_skb_hint;
Expand Down
2 changes: 2 additions & 0 deletions include/net/netns/ipv4.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,8 @@ struct netns_ipv4 {
int sysctl_tcp_pacing_ca_ratio;
int sysctl_tcp_wmem[3];
int sysctl_tcp_rmem[3];
int sysctl_tcp_comp_sack_nr;
unsigned long sysctl_tcp_comp_sack_delay_ns;
struct inet_timewait_death_row tcp_death_row;
int sysctl_max_syn_backlog;
int sysctl_tcp_fastopen;
Expand Down
5 changes: 4 additions & 1 deletion include/net/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -559,7 +559,10 @@ void tcp_init_xmit_timers(struct sock *);
/* Stop the transmit-side timers of @sk: the pacing hrtimer, the
 * compressed-ACK hrtimer, and whatever inet_csk_clear_xmit_timers() handles.
 *
 * NOTE(review): this listing is a diff view with the +/- markers stripped.
 * The hunk header reports a single deletion, so the bare sock_put(sk) below
 * is presumably the removed line and the __sock_put(sk) right after it its
 * replacement -- confirm against the actual tree before relying on it.
 */
static inline void tcp_clear_xmit_timers(struct sock *sk)
{
/* hrtimer_try_to_cancel() == 1: a pending pacing timer was cancelled.
 * NOTE(review): presumably the timer owned a socket reference that must
 * be dropped here -- the arming site is not visible in this view.
 */
if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1)
sock_put(sk);
__sock_put(sk);

/* Same for the compressed-ACK timer: __tcp_ack_snd_check() does
 * sock_hold() before arming it, so a successful cancel drops that hold.
 */
if (hrtimer_try_to_cancel(&tcp_sk(sk)->compressed_ack_timer) == 1)
__sock_put(sk);

inet_csk_clear_xmit_timers(sk);
}
Expand Down
1 change: 1 addition & 0 deletions include/uapi/linux/snmp.h
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,7 @@ enum
LINUX_MIB_TCPMTUPSUCCESS, /* TCPMTUPSuccess */
LINUX_MIB_TCPDELIVERED, /* TCPDelivered */
LINUX_MIB_TCPDELIVEREDCE, /* TCPDeliveredCE */
LINUX_MIB_TCPACKCOMPRESSED, /* TCPAckCompressed */
__LINUX_MIB_MAX
};

Expand Down
1 change: 1 addition & 0 deletions net/ipv4/proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS),
SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED),
SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE),
SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED),
SNMP_MIB_SENTINEL
};

Expand Down
17 changes: 17 additions & 0 deletions net/ipv4/sysctl_net_ipv4.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ static int tcp_syn_retries_min = 1;
static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
static int ip_ping_group_range_min[] = { 0, 0 };
static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
static int comp_sack_nr_max = 255;

/* obsolete */
static int sysctl_tcp_low_latency __read_mostly;
Expand Down Expand Up @@ -1151,6 +1152,22 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = &one,
},
{
.procname = "tcp_comp_sack_delay_ns",
.data = &init_net.ipv4.sysctl_tcp_comp_sack_delay_ns,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
{
.procname = "tcp_comp_sack_nr",
.data = &init_net.ipv4.sysctl_tcp_comp_sack_nr,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &comp_sack_nr_max,
},
{
.procname = "udp_rmem_min",
.data = &init_net.ipv4.sysctl_udp_rmem_min,
Expand Down
1 change: 1 addition & 0 deletions net/ipv4/tcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -2595,6 +2595,7 @@ int tcp_disconnect(struct sock *sk, int flags)
dst_release(sk->sk_rx_dst);
sk->sk_rx_dst = NULL;
tcp_saved_syn_free(tp);
tp->compressed_ack = 0;

/* Clean up fastopen related fields */
tcp_free_fastopen_req(tp);
Expand Down
38 changes: 30 additions & 8 deletions net/ipv4/tcp_input.c
Original file line number Diff line number Diff line change
Expand Up @@ -4249,6 +4249,8 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
* If the sack array is full, forget about the last one.
*/
if (this_sack >= TCP_NUM_SACKS) {
if (tp->compressed_ack)
tcp_send_ack(sk);
this_sack--;
tp->rx_opt.num_sacks--;
sp--;
Expand Down Expand Up @@ -4715,8 +4717,6 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
goto out_of_window;

tcp_enter_quickack_mode(sk);

if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
/* Partial packet, seq < rcv_next < end_seq */
SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
Expand Down Expand Up @@ -5083,6 +5083,7 @@ static inline void tcp_data_snd_check(struct sock *sk)
static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
{
struct tcp_sock *tp = tcp_sk(sk);
unsigned long rtt, delay;

/* More than one full frame received... */
if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
Expand All @@ -5094,15 +5095,36 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
(tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
__tcp_select_window(sk) >= tp->rcv_wnd)) ||
/* We ACK each frame or... */
tcp_in_quickack_mode(sk) ||
/* We have out of order data. */
(ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) {
/* Then ack it now */
tcp_in_quickack_mode(sk)) {
send_now:
tcp_send_ack(sk);
} else {
/* Else, send delayed ack. */
return;
}

if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
tcp_send_delayed_ack(sk);
return;
}

if (!tcp_is_sack(tp) ||
tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)
goto send_now;
tp->compressed_ack++;

if (hrtimer_is_queued(&tp->compressed_ack_timer))
return;

/* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */

rtt = tp->rcv_rtt_est.rtt_us;
if (tp->srtt_us && tp->srtt_us < rtt)
rtt = tp->srtt_us;

delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns,
rtt * (NSEC_PER_USEC >> 3)/20);
sock_hold(sk);
hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay),
HRTIMER_MODE_REL_PINNED_SOFT);
}

static inline void tcp_ack_snd_check(struct sock *sk)
Expand Down
2 changes: 2 additions & 0 deletions net/ipv4/tcp_ipv4.c
Original file line number Diff line number Diff line change
Expand Up @@ -2572,6 +2572,8 @@ static int __net_init tcp_sk_init(struct net *net)
init_net.ipv4.sysctl_tcp_wmem,
sizeof(init_net.ipv4.sysctl_tcp_wmem));
}
net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
net->ipv4.sysctl_tcp_comp_sack_nr = 44;
net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
Expand Down
9 changes: 9 additions & 0 deletions net/ipv4/tcp_output.c
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,15 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
/* Account for an ACK we sent. */
static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
{
struct tcp_sock *tp = tcp_sk(sk);

if (unlikely(tp->compressed_ack)) {
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
tp->compressed_ack);
tp->compressed_ack = 0;
if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
__sock_put(sk);
}
tcp_dec_quickack_mode(sk, pkts);
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
Expand Down
25 changes: 25 additions & 0 deletions net/ipv4/tcp_timer.c
Original file line number Diff line number Diff line change
Expand Up @@ -708,11 +708,36 @@ static void tcp_keepalive_timer (struct timer_list *t)
sock_put(sk);
}

/* hrtimer callback for tcp_sock::compressed_ack_timer.
 *
 * With the socket bh-locked: if a user context currently owns the socket,
 * set TCP_DELACK_TIMER_DEFERRED in sk_tsq_flags (taking a socket hold when
 * the bit was not already set); otherwise send an ACK right away if any
 * compressed ACKs are pending.
 *
 * The final sock_put() drops the hold taken when the timer was armed.
 * Always returns HRTIMER_NORESTART (one-shot timer).
 */
static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer)
{
	struct tcp_sock *tp = container_of(timer, struct tcp_sock,
					   compressed_ack_timer);
	struct sock *sk = (struct sock *)tp;

	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* NOTE(review): presumably the deferred flag makes the owner
		 * send the ACK once it releases the socket -- the consumer of
		 * this bit is not visible here.
		 */
		if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
				      &sk->sk_tsq_flags))
			sock_hold(sk);
	} else if (tp->compressed_ack) {
		tcp_send_ack(sk);
	}
	bh_unlock_sock(sk);

	sock_put(sk);

	return HRTIMER_NORESTART;
}

void tcp_init_xmit_timers(struct sock *sk)
{
inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
&tcp_keepalive_timer);
hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC,
HRTIMER_MODE_ABS_PINNED_SOFT);
tcp_sk(sk)->pacing_timer.function = tcp_pace_kick;

hrtimer_init(&tcp_sk(sk)->compressed_ack_timer, CLOCK_MONOTONIC,
HRTIMER_MODE_REL_PINNED_SOFT);
tcp_sk(sk)->compressed_ack_timer.function = tcp_compressed_ack_kick;
}

0 comments on commit 2c47a65

Please sign in to comment.