Skip to content

Commit

Permalink
Merge branch 'tcp_stretch_acks'
Browse files Browse the repository at this point in the history
Neal Cardwell says:

====================
fix stretch ACK bugs in TCP CUBIC and Reno

This patch series fixes the TCP CUBIC and Reno congestion control
modules to properly handle stretch ACKs in their respective additive
increase modes, and in the transitions from slow start to additive
increase.

This finishes the project started by commit 9f9843a ("tcp:
properly handle stretch acks in slow start"), which fixed behavior for
TCP congestion control when handling stretch ACKs in slow start mode.

Motivation: In the Jan 2015 netdev thread 'BW regression after "tcp:
refine TSO autosizing"', Eyal Perry documented a regression that Eric
Dumazet determined was caused by improper handling of TCP stretch
ACKs.

Background: LRO, GRO, delayed ACKs, and middleboxes can cause "stretch
ACKs" that cover more than the RFC-specified maximum of 2
packets. These stretch ACKs can cause serious performance shortfalls
in common congestion control algorithms, like Reno and CUBIC, which
were designed and tuned years ago with receiver hosts that were not
using LRO or GRO, and were instead ACKing every other packet.

Testing: at Google we have been using this approach for handling
stretch ACKs for CUBIC datacenter and Internet traffic for several
years, with good results.

v2:
 * fixed return type of tcp_slow_start() to be u32 instead of int
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Jan 29, 2015
2 parents 59343cd + d6b1a8a commit 95224ac
Show file tree
Hide file tree
Showing 7 changed files with 44 additions and 40 deletions.
4 changes: 2 additions & 2 deletions include/net/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -834,8 +834,8 @@ void tcp_get_available_congestion_control(char *buf, size_t len);
void tcp_get_allowed_congestion_control(char *buf, size_t len);
int tcp_set_allowed_congestion_control(char *allowed);
int tcp_set_congestion_control(struct sock *sk, const char *name);
void tcp_slow_start(struct tcp_sock *tp, u32 acked);
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w);
u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);

u32 tcp_reno_ssthresh(struct sock *sk);
void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
Expand Down
2 changes: 1 addition & 1 deletion net/ipv4/tcp_bic.c
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
tcp_slow_start(tp, acked);
else {
bictcp_update(ca, tp->snd_cwnd);
tcp_cong_avoid_ai(tp, ca->cnt);
tcp_cong_avoid_ai(tp, ca->cnt, 1);
}
}

Expand Down
32 changes: 20 additions & 12 deletions net/ipv4/tcp_cong.c
Original file line number Diff line number Diff line change
Expand Up @@ -291,26 +291,32 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
* ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and
* returns the leftover acks to adjust cwnd in congestion avoidance mode.
*/
void tcp_slow_start(struct tcp_sock *tp, u32 acked)
u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
u32 cwnd = tp->snd_cwnd + acked;

if (cwnd > tp->snd_ssthresh)
cwnd = tp->snd_ssthresh + 1;
acked -= cwnd - tp->snd_cwnd;
tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);

return acked;
}
EXPORT_SYMBOL_GPL(tcp_slow_start);

/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w)
/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w),
* for every packet that was ACKed.
*/
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
{
tp->snd_cwnd_cnt += acked;
if (tp->snd_cwnd_cnt >= w) {
if (tp->snd_cwnd < tp->snd_cwnd_clamp)
tp->snd_cwnd++;
tp->snd_cwnd_cnt = 0;
} else {
tp->snd_cwnd_cnt++;
u32 delta = tp->snd_cwnd_cnt / w;

tp->snd_cwnd_cnt -= delta * w;
tp->snd_cwnd += delta;
}
tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp);
}
EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);

Expand All @@ -329,11 +335,13 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
return;

/* In "safe" area, increase. */
if (tp->snd_cwnd <= tp->snd_ssthresh)
tcp_slow_start(tp, acked);
if (tp->snd_cwnd <= tp->snd_ssthresh) {
acked = tcp_slow_start(tp, acked);
if (!acked)
return;
}
/* In dangerous area, increase slowly. */
else
tcp_cong_avoid_ai(tp, tp->snd_cwnd);
tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
}
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);

Expand Down
39 changes: 17 additions & 22 deletions net/ipv4/tcp_cubic.c
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,7 @@ struct bictcp {
u32 epoch_start; /* beginning of an epoch */
u32 ack_cnt; /* number of acks */
u32 tcp_cwnd; /* estimated tcp cwnd */
#define ACK_RATIO_SHIFT 4
#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT)
u16 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
u16 unused;
u8 sample_cnt; /* number of samples to decide curr_rtt */
u8 found; /* the exit point is found? */
u32 round_start; /* beginning of each round */
Expand All @@ -114,7 +112,6 @@ static inline void bictcp_reset(struct bictcp *ca)
ca->bic_K = 0;
ca->delay_min = 0;
ca->epoch_start = 0;
ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
ca->ack_cnt = 0;
ca->tcp_cwnd = 0;
ca->found = 0;
Expand Down Expand Up @@ -205,23 +202,30 @@ static u32 cubic_root(u64 a)
/*
* Compute congestion window to use.
*/
static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)
{
u32 delta, bic_target, max_cnt;
u64 offs, t;

ca->ack_cnt++; /* count the number of ACKs */
ca->ack_cnt += acked; /* count the number of ACKed packets */

if (ca->last_cwnd == cwnd &&
(s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
return;

/* The CUBIC function can update ca->cnt at most once per jiffy.
* On all cwnd reduction events, ca->epoch_start is set to 0,
* which will force a recalculation of ca->cnt.
*/
if (ca->epoch_start && tcp_time_stamp == ca->last_time)
goto tcp_friendliness;

ca->last_cwnd = cwnd;
ca->last_time = tcp_time_stamp;

if (ca->epoch_start == 0) {
ca->epoch_start = tcp_time_stamp; /* record beginning */
ca->ack_cnt = 1; /* start counting */
ca->ack_cnt = acked; /* start counting */
ca->tcp_cwnd = cwnd; /* syn with cubic */

if (ca->last_max_cwnd <= cwnd) {
Expand Down Expand Up @@ -283,6 +287,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
if (ca->last_max_cwnd == 0 && ca->cnt > 20)
ca->cnt = 20; /* increase cwnd 5% per RTT */

tcp_friendliness:
/* TCP Friendly */
if (tcp_friendliness) {
u32 scale = beta_scale;
Expand All @@ -301,7 +306,6 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
}
}

ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
if (ca->cnt == 0) /* cannot be zero */
ca->cnt = 1;
}
Expand All @@ -317,11 +321,12 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (tp->snd_cwnd <= tp->snd_ssthresh) {
if (hystart && after(ack, ca->end_seq))
bictcp_hystart_reset(sk);
tcp_slow_start(tp, acked);
} else {
bictcp_update(ca, tp->snd_cwnd);
tcp_cong_avoid_ai(tp, ca->cnt);
acked = tcp_slow_start(tp, acked);
if (!acked)
return;
}
bictcp_update(ca, tp->snd_cwnd, acked);
tcp_cong_avoid_ai(tp, ca->cnt, acked);
}

static u32 bictcp_recalc_ssthresh(struct sock *sk)
Expand Down Expand Up @@ -411,20 +416,10 @@ static void hystart_update(struct sock *sk, u32 delay)
*/
static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
u32 delay;

if (icsk->icsk_ca_state == TCP_CA_Open) {
u32 ratio = ca->delayed_ack;

ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT;
ratio += cnt;

ca->delayed_ack = clamp(ratio, 1U, ACK_RATIO_LIMIT);
}

/* Some calls are for duplicates without timetamps */
if (rtt_us < 0)
return;
Expand Down
3 changes: 2 additions & 1 deletion net/ipv4/tcp_scalable.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (tp->snd_cwnd <= tp->snd_ssthresh)
tcp_slow_start(tp, acked);
else
tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT));
tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT),
1);
}

static u32 tcp_scalable_ssthresh(struct sock *sk)
Expand Down
2 changes: 1 addition & 1 deletion net/ipv4/tcp_veno.c
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
/* In the "non-congestive state", increase cwnd
* every rtt.
*/
tcp_cong_avoid_ai(tp, tp->snd_cwnd);
tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1);
} else {
/* In the "congestive state", increase cwnd
* every other rtt.
Expand Down
2 changes: 1 addition & 1 deletion net/ipv4/tcp_yeah.c
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)

} else {
/* Reno */
tcp_cong_avoid_ai(tp, tp->snd_cwnd);
tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1);
}

/* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt.
Expand Down

0 comments on commit 95224ac

Please sign in to comment.