Skip to content

Commit

Permalink
Merge branch 'tcp_cubic-various-fixes'
Browse files Browse the repository at this point in the history
Eric Dumazet says:

====================
tcp_cubic: various fixes

This patch series converts tcp_cubic to usec clock resolution
for Hystart logic.

This makes Hystart more relevant for data-center flows.
Prior to this series, Hystart was either not kicking in, or was
kicking in without good reason, since the 1ms clock was too coarse.

The last patch also fixes an issue with Hystart vs TCP pacing.

v2: removed a last-minute debug chunk from last patch
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Dec 28, 2019
2 parents 2bbc078 + ede656e commit 36a7886
Showing 1 changed file with 51 additions and 31 deletions.
82 changes: 51 additions & 31 deletions net/ipv4/tcp_cubic.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@

/* Number of delay samples for detecting the increase of delay */
#define HYSTART_MIN_SAMPLES 8
#define HYSTART_DELAY_MIN (4U<<3)
#define HYSTART_DELAY_MAX (16U<<3)
#define HYSTART_DELAY_MIN (4000U) /* 4 ms */
#define HYSTART_DELAY_MAX (16000U) /* 16 ms */
#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)

static int fast_convergence __read_mostly = 1;
Expand All @@ -53,7 +53,7 @@ static int tcp_friendliness __read_mostly = 1;
static int hystart __read_mostly = 1;
static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
static int hystart_low_window __read_mostly = 16;
static int hystart_ack_delta __read_mostly = 2;
static int hystart_ack_delta_us __read_mostly = 2000;

static u32 cube_rtt_scale __read_mostly;
static u32 beta_scale __read_mostly;
Expand All @@ -77,8 +77,8 @@ MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms"
" 1: packet-train 2: delay 3: both packet-train and delay");
module_param(hystart_low_window, int, 0644);
MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
module_param(hystart_ack_delta, int, 0644);
MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)");
module_param(hystart_ack_delta_us, int, 0644);
MODULE_PARM_DESC(hystart_ack_delta_us, "spacing between ack's indicating train (usecs)");

/* BIC TCP Parameters */
struct bictcp {
Expand All @@ -89,7 +89,7 @@ struct bictcp {
u32 bic_origin_point;/* origin point of bic function */
u32 bic_K; /* time to origin point
from the beginning of the current epoch */
u32 delay_min; /* min delay (msec << 3) */
u32 delay_min; /* min delay (usec) */
u32 epoch_start; /* beginning of an epoch */
u32 ack_cnt; /* number of acks */
u32 tcp_cwnd; /* estimated tcp cwnd */
Expand Down Expand Up @@ -117,23 +117,19 @@ static inline void bictcp_reset(struct bictcp *ca)
ca->found = 0;
}

static inline u32 bictcp_clock(void)
static inline u32 bictcp_clock_us(const struct sock *sk)
{
#if HZ < 1000
return ktime_to_ms(ktime_get_real());
#else
return jiffies_to_msecs(jiffies);
#endif
return tcp_sk(sk)->tcp_mstamp;
}

static inline void bictcp_hystart_reset(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);

ca->round_start = ca->last_ack = bictcp_clock();
ca->round_start = ca->last_ack = bictcp_clock_us(sk);
ca->end_seq = tp->snd_nxt;
ca->curr_rtt = 0;
ca->curr_rtt = ~0U;
ca->sample_cnt = 0;
}

Expand Down Expand Up @@ -276,7 +272,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)
*/

t = (s32)(tcp_jiffies32 - ca->epoch_start);
t += msecs_to_jiffies(ca->delay_min >> 3);
t += usecs_to_jiffies(ca->delay_min);
/* change the unit from HZ to bictcp_HZ */
t <<= BICTCP_HZ;
do_div(t, HZ);
Expand Down Expand Up @@ -380,18 +376,26 @@ static void hystart_update(struct sock *sk, u32 delay)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);

if (ca->found & hystart_detect)
return;
u32 threshold;

if (hystart_detect & HYSTART_ACK_TRAIN) {
u32 now = bictcp_clock();
u32 now = bictcp_clock_us(sk);

/* first detection parameter - ack-train detection */
if ((s32)(now - ca->last_ack) <= hystart_ack_delta) {
if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) {
ca->last_ack = now;
if ((s32)(now - ca->round_start) > ca->delay_min >> 4) {
ca->found |= HYSTART_ACK_TRAIN;

threshold = ca->delay_min;
/* Hystart ack train triggers if we get ack past
* ca->delay_min/2.
* Pacing might have delayed packets up to RTT/2
* during slow start.
*/
if (sk->sk_pacing_status == SK_PACING_NONE)
threshold >>= 1;

if ((s32)(now - ca->round_start) > threshold) {
ca->found = 1;
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTTRAINDETECT);
NET_ADD_STATS(sock_net(sk),
Expand All @@ -405,14 +409,14 @@ static void hystart_update(struct sock *sk, u32 delay)
if (hystart_detect & HYSTART_DELAY) {
/* obtain the minimum delay over the first HYSTART_MIN_SAMPLES samples */
if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
if (ca->curr_rtt == 0 || ca->curr_rtt > delay)
if (ca->curr_rtt > delay)
ca->curr_rtt = delay;

ca->sample_cnt++;
} else {
if (ca->curr_rtt > ca->delay_min +
HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
ca->found |= HYSTART_DELAY;
ca->found = 1;
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTDELAYDETECT);
NET_ADD_STATS(sock_net(sk),
Expand All @@ -424,9 +428,6 @@ static void hystart_update(struct sock *sk, u32 delay)
}
}

/* Track delayed acknowledgment ratio using sliding window
* ratio = (15*ratio + sample) / 16
*/
static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
{
const struct tcp_sock *tp = tcp_sk(sk);
Expand All @@ -441,16 +442,35 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ)
return;

delay = (sample->rtt_us << 3) / USEC_PER_MSEC;
delay = sample->rtt_us;
if (delay == 0)
delay = 1;

/* first time call or link delay decreases */
if (ca->delay_min == 0 || ca->delay_min > delay)
ca->delay_min = delay;
if (ca->delay_min == 0 || ca->delay_min > delay) {
unsigned long rate = READ_ONCE(sk->sk_pacing_rate);

/* Account for TSO/GRO delays.
* Otherwise short RTT flows could get too small ssthresh,
* since during slow start we begin with small TSO packets
* and could lower ca->delay_min too much.
* Ideally even with a very small RTT we would like to have
* at least one TSO packet being sent and received by GRO,
* and another one in qdisc layer.
* We apply another 100% factor because @rate is doubled at
* this point.
* We cap the cushion to 1ms.
*/
if (rate)
delay += min_t(u64, USEC_PER_MSEC,
div64_ul((u64)GSO_MAX_SIZE *
4 * USEC_PER_SEC, rate));
if (ca->delay_min == 0 || ca->delay_min > delay)
ca->delay_min = delay;
}

/* hystart triggers when cwnd is larger than some threshold */
if (hystart && tcp_in_slow_start(tp) &&
if (!ca->found && hystart && tcp_in_slow_start(tp) &&
tp->snd_cwnd >= hystart_low_window)
hystart_update(sk, delay);
}
Expand Down

0 comments on commit 36a7886

Please sign in to comment.