Merge branch 'tcp-improving-RACK-cpu-performance'
Yuchung Cheng says:

====================
tcp: improving RACK cpu performance

This patch set reduces the CPU consumption of the RACK TCP loss
recovery algorithm, in particular on high-speed networks. Currently,
for every ACK in recovery, RACK can potentially iterate over all sent
packets in the write queue. On large-BDP networks with non-trivial
losses, the CPU cost of this write queue walk becomes unreasonably high.

This patch set introduces a new queue in TCP that keeps only the skbs
sent and not yet (s)acked or marked lost, in time order instead of
sequence order.  With that, RACK can examine this time-sorted list and,
per ACK, check only the packets sent recently, within the reordering
window. This avoids write queue walks entirely, and the number of skbs
examined per ACK drops by orders of magnitude.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
David S. Miller committed Oct 6, 2017
2 parents b1fb67f + bef0622 commit cec451c
Showing 8 changed files with 93 additions and 49 deletions.
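
As a rough illustration of the idea in the cover letter, here is a self-contained user-space C sketch (all names and numbers are hypothetical, and the sequence-number tie-breaking and SACK handling of the real tcp_rack_detect_loss() are omitted). Because the new list is kept in order of last transmit time, oldest first, the per-ACK scan can stop as soon as it reaches packets sent at or after the most recently (s)acked one, instead of walking the whole write queue:

#include <stdio.h>

/* Simplified model of tp->tsorted_sent_queue: packets in last-sent order. */
struct pkt {
	unsigned int end_seq;
	long long sent_us;	/* time of last transmission */
};

static void detect_loss(const struct pkt *q, int n, long long now_us,
			long long rack_sent_us, long long rtt_us, long long reo_wnd_us)
{
	for (int i = 0; i < n; i++) {
		/* Everything from here on was sent no earlier than the packet
		 * whose (s)ack triggered this pass, so no verdict is possible yet.
		 */
		if (q[i].sent_us >= rack_sent_us)
			break;

		long long remaining = rtt_us + reo_wnd_us - (now_us - q[i].sent_us);

		if (remaining < 0)
			printf("seq %u: mark lost\n", q[i].end_seq);
		else
			printf("seq %u: re-check in %lld us\n", q[i].end_seq, remaining);
	}
}

int main(void)
{
	const struct pkt q[] = {
		{ 1000, 100 }, { 2000, 200 }, { 3000, 5000 }, { 4000, 6000 },
	};

	/* now = 60ms; the (s)acked packet was sent at 4ms; RTT 40ms; reo window 10ms */
	detect_loss(q, 4, 60000, 4000, 40000, 10000);
	return 0;
}

With these numbers only the two oldest packets are examined and marked lost; the loop breaks before touching the rest, which is the bounded per-ACK cost the series aims for.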
11 changes: 9 additions & 2 deletions include/linux/skbuff.h
@@ -617,6 +617,7 @@ typedef unsigned char *sk_buff_data_t;
* @nf_trace: netfilter packet trace flag
* @protocol: Packet protocol from driver
* @destructor: Destruct function
* @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue)
* @_nfct: Associated connection, if any (with nfctinfo bits)
* @nf_bridge: Saved data about a bridged frame - see br_netfilter.c
* @skb_iif: ifindex of device we arrived on
@@ -686,8 +687,14 @@ struct sk_buff {
*/
char cb[48] __aligned(8);

unsigned long _skb_refdst;
void (*destructor)(struct sk_buff *skb);
union {
struct {
unsigned long _skb_refdst;
void (*destructor)(struct sk_buff *skb);
};
struct list_head tcp_tsorted_anchor;
};

#ifdef CONFIG_XFRM
struct sec_path *sp;
#endif
1 change: 1 addition & 0 deletions include/linux/tcp.h
@@ -191,6 +191,7 @@ struct tcp_sock {
u32 tsoffset; /* timestamp offset */

struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */

u32 snd_wl1; /* Sequence for window update */
u32 snd_wnd; /* The window we expect to receive */
24 changes: 23 additions & 1 deletion include/net/tcp.h
@@ -1589,14 +1589,34 @@ enum tcp_chrono {
void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type);
void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type);

/* This helper is needed, because skb->tcp_tsorted_anchor uses
* the same memory storage as skb->destructor/_skb_refdst
*/
static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb)
{
skb->destructor = NULL;
skb->_skb_refdst = 0UL;
}

#define tcp_skb_tsorted_save(skb) { \
unsigned long _save = skb->_skb_refdst; \
skb->_skb_refdst = 0UL;

#define tcp_skb_tsorted_restore(skb) \
skb->_skb_refdst = _save; \
}

/* write queue abstraction */
static inline void tcp_write_queue_purge(struct sock *sk)
{
struct sk_buff *skb;

tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL)
while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
tcp_skb_tsorted_anchor_cleanup(skb);
sk_wmem_free_skb(sk, skb);
}
INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
sk_mem_reclaim(sk);
tcp_clear_all_retrans_hints(tcp_sk(sk));
}
@@ -1711,6 +1731,8 @@ static inline void tcp_insert_write_queue_before(struct sk_buff *new,

static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
{
list_del(&skb->tcp_tsorted_anchor);
tcp_skb_tsorted_anchor_cleanup(skb);
__skb_unlink(skb, &sk->sk_write_queue);
}

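
As the helper comment above says, skb->tcp_tsorted_anchor shares its storage with skb->_skb_refdst and skb->destructor (the union added to struct sk_buff earlier in this series); a write-queue skb normally carries neither a dst reference nor a destructor, which is what makes the overlay possible. The two macros therefore look unbalanced in isolation: tcp_skb_tsorted_save() opens a block and stashes the overlapping word, and tcp_skb_tsorted_restore() closes it and puts the word back. A sketch of how a call site expands (the skb_clone() body is illustrative, mirroring the tcp_transmit_skb() hunk later in this series):

	tcp_skb_tsorted_save(oskb) {
		skb = skb_clone(oskb, gfp_mask);
	} tcp_skb_tsorted_restore(oskb);

becomes, roughly:

	{
		unsigned long _save = oskb->_skb_refdst;	/* stash the anchor word */
		oskb->_skb_refdst = 0UL;	/* don't let skb_clone() mistake it for a dst */
		{
			skb = skb_clone(oskb, gfp_mask);
		}
		oskb->_skb_refdst = _save;	/* restore the list anchor word */
	};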
2 changes: 2 additions & 0 deletions net/ipv4/tcp.c
@@ -415,6 +415,7 @@ void tcp_init_sock(struct sock *sk)
tp->out_of_order_queue = RB_ROOT;
tcp_init_xmit_timers(sk);
INIT_LIST_HEAD(&tp->tsq_node);
INIT_LIST_HEAD(&tp->tsorted_sent_queue);

icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
@@ -881,6 +882,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
* available to the caller, no more, no less.
*/
skb->reserved_tailroom = skb->end - skb->tail - size;
INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
return skb;
}
__kfree_skb(skb);
9 changes: 7 additions & 2 deletions net/ipv4/tcp_input.c
@@ -1593,6 +1593,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
tcp_skb_pcount(skb),
skb->skb_mstamp);
tcp_rate_skb_delivered(sk, skb, state->rate);
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
list_del_init(&skb->tcp_tsorted_anchor);

if (!before(TCP_SKB_CB(skb)->seq,
tcp_highest_sack_seq(tp)))
@@ -3054,8 +3056,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,

shinfo = skb_shinfo(skb);
if (!before(shinfo->tskey, prior_snd_una) &&
before(shinfo->tskey, tcp_sk(sk)->snd_una))
__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
before(shinfo->tskey, tcp_sk(sk)->snd_una)) {
tcp_skb_tsorted_save(skb) {
__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
} tcp_skb_tsorted_restore(skb);
}
}

/* Remove acknowledged frames from the retransmission queue. If our packet
1 change: 1 addition & 0 deletions net/ipv4/tcp_minisocks.c
@@ -446,6 +446,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;

INIT_LIST_HEAD(&newtp->tsq_node);
INIT_LIST_HEAD(&newtp->tsorted_sent_queue);

tcp_init_wl(newtp, treq->rcv_isn);

42 changes: 31 additions & 11 deletions net/ipv4/tcp_output.c
@@ -971,6 +971,12 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
HRTIMER_MODE_ABS_PINNED);
}

static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
{
skb->skb_mstamp = tp->tcp_mstamp;
list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
}

/* This routine actually transmits TCP packets queued in by
* tcp_do_sendmsg(). This is used by both the initial
* transmission and possible later retransmissions.
@@ -1003,10 +1009,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
- tp->snd_una;
oskb = skb;
if (unlikely(skb_cloned(skb)))
skb = pskb_copy(skb, gfp_mask);
else
skb = skb_clone(skb, gfp_mask);

tcp_skb_tsorted_save(oskb) {
if (unlikely(skb_cloned(oskb)))
skb = pskb_copy(oskb, gfp_mask);
else
skb = skb_clone(oskb, gfp_mask);
} tcp_skb_tsorted_restore(oskb);

if (unlikely(!skb))
return -ENOBUFS;
}
@@ -1127,7 +1137,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
err = net_xmit_eval(err);
}
if (!err && oskb) {
oskb->skb_mstamp = tp->tcp_mstamp;
tcp_update_skb_after_send(tp, oskb);
tcp_rate_skb_sent(sk, oskb);
}
return err;
@@ -1328,6 +1338,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
/* Link BUFF into the send queue. */
__skb_header_release(buff);
tcp_insert_write_queue_after(skb, buff, sk);
list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);

return 0;
}
@@ -2260,7 +2271,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,

if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
/* "skb_mstamp" is used as a start point for the retransmit timer */
skb->skb_mstamp = tp->tcp_mstamp;
tcp_update_skb_after_send(tp, skb);
goto repair; /* Skip network transmission */
}

@@ -2838,11 +2849,14 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
skb_headroom(skb) >= 0xFFFF)) {
struct sk_buff *nskb;

nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
-ENOBUFS;
tcp_skb_tsorted_save(skb) {
nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
-ENOBUFS;
} tcp_skb_tsorted_restore(skb);

if (!err)
skb->skb_mstamp = tp->tcp_mstamp;
tcp_update_skb_after_send(tp, skb);
} else {
err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
@@ -3023,6 +3037,7 @@ void tcp_send_fin(struct sock *sk)
goto coalesce;
return;
}
INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
skb_reserve(skb, MAX_TCP_HEADER);
sk_forced_mem_schedule(sk, skb->truesize);
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
@@ -3078,9 +3093,14 @@ int tcp_send_synack(struct sock *sk)
}
if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
if (skb_cloned(skb)) {
struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
struct sk_buff *nskb;

tcp_skb_tsorted_save(skb) {
nskb = skb_copy(skb, GFP_ATOMIC);
} tcp_skb_tsorted_restore(skb);
if (!nskb)
return -ENOMEM;
INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
tcp_unlink_write_queue(skb, sk);
__skb_header_release(nskb);
__tcp_add_write_queue_head(sk, nskb);
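
Taken together, tcp_update_skb_after_send() above (every transmission or retransmission moves the skb to the list tail) and the tcp_fragment() hunk (the new half is linked right next to the original, so relative order is preserved) keep tp->tsorted_sent_queue ordered by last transmit time, oldest first. A hypothetical debug-only helper, not part of this series, that would assert that invariant in kernel context:

	/* Hypothetical: check that skb_mstamp never decreases along the list. */
	static void tcp_tsorted_check_ordered(struct tcp_sock *tp)
	{
		struct sk_buff *skb;
		u64 prev = 0;

		list_for_each_entry(skb, &tp->tsorted_sent_queue, tcp_tsorted_anchor) {
			WARN_ON_ONCE(skb->skb_mstamp < prev);
			prev = skb->skb_mstamp;
		}
	}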
52 changes: 19 additions & 33 deletions net/ipv4/tcp_recovery.c
@@ -45,7 +45,7 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
struct sk_buff *skb, *n;
u32 reo_wnd;

*reo_timeout = 0;
@@ -58,45 +58,31 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);

tcp_for_write_queue(skb, sk) {
list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
tcp_tsorted_anchor) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
s32 remaining;

if (skb == tcp_send_head(sk))
break;

/* Skip ones already (s)acked */
if (!after(scb->end_seq, tp->snd_una) ||
scb->sacked & TCPCB_SACKED_ACKED)
/* Skip ones marked lost but not yet retransmitted */
if ((scb->sacked & TCPCB_LOST) &&
!(scb->sacked & TCPCB_SACKED_RETRANS))
continue;

if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
tp->rack.end_seq, scb->end_seq)) {
/* Step 3 in draft-cheng-tcpm-rack-00.txt:
* A packet is lost if its elapsed time is beyond
* the recent RTT plus the reordering window.
*/
u32 elapsed = tcp_stamp_us_delta(tp->tcp_mstamp,
skb->skb_mstamp);
s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed;

if (remaining < 0) {
tcp_rack_mark_skb_lost(sk, skb);
continue;
}

/* Skip ones marked lost but not yet retransmitted */
if ((scb->sacked & TCPCB_LOST) &&
!(scb->sacked & TCPCB_SACKED_RETRANS))
continue;
if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
tp->rack.end_seq, scb->end_seq))
break;

/* A packet is lost if it has not been s/acked beyond
* the recent RTT plus the reordering window.
*/
remaining = tp->rack.rtt_us + reo_wnd -
tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
if (remaining < 0) {
tcp_rack_mark_skb_lost(sk, skb);
list_del_init(&skb->tcp_tsorted_anchor);
} else {
/* Record maximum wait time (+1 to avoid 0) */
*reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);

} else if (!(scb->sacked & TCPCB_RETRANS)) {
/* Original data are sent sequentially so stop early
* b/c the rest are all sent after rack_sent
*/
break;
}
}
}
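
To summarize how an skb's membership on tp->tsorted_sent_queue evolves across the series, here is an informal trace assembled from the hunks above (function names are from the patches; the flow itself is a reading aid, not text from the commit):

	/* sk_stream_alloc_skb()          INIT_LIST_HEAD(): anchor starts empty
	 * tcp_transmit_skb()             tcp_update_skb_after_send(): moved to the tail
	 * tcp_sacktag_walk(), SACKed     list_del_init(): no loss verdict needed anymore
	 * tcp_rack_detect_loss(), lost   list_del_init(): off the list until resent
	 * __tcp_retransmit_skb()         tcp_update_skb_after_send(): back on the tail
	 * tcp_unlink_write_queue()       list_del() + tcp_skb_tsorted_anchor_cleanup()
	 */

So the list only ever holds packets that were sent but not yet (s)acked or marked lost, which is exactly the set RACK needs to examine per ACK.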
