Merge branch 'tcp-rbtree-retransmit-queue'
Eric Dumazet says:

====================
tcp: implement rb-tree based retransmit queue

This patch series implements an RB-tree based retransmit queue for TCP,
to better match modern BDP.

Tested:

 On receiver :
 netem on ingress : delay 150ms 200us loss 1
 GRO disabled to force stress and SACK storms.

for f in `seq 1 10`
do
 ./netperf -H lpaa6 -l30 -- -K bbr -o THROUGHPUT|tail -1
done | awk '{print $0} {sum += $0} END {printf "%7u\n",sum}'

Before patch :

323.87  351.48  339.59  338.62  306.72
204.07  304.93  291.88  202.47  176.88
->   2840

After patch:

1700.83 2207.98 2070.17 1544.26 2114.76
2124.89 1693.14 1080.91 2216.82 1299.94
->  18053

Average of 1805 Mbits instead of 284 Mbits.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
David S. Miller committed Oct 6, 2017
2 parents f5333f8 + 75c119a commit ca82214
Showing 10 changed files with 311 additions and 249 deletions.
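
For orientation before the diffs: prior to this series, every outstanding skb sat on the single sk_write_queue linked list, so SACK and loss processing had to walk it linearly, which gets expensive once a large BDP keeps thousands of skbs in flight. Keying sent-but-unacked skbs by sequence number in an rb-tree makes such lookups logarithmic. A minimal sketch of the idea follows; the helper name rtx_queue_find is hypothetical and not part of this series, while rb_to_skb, TCP_SKB_CB, before() and after() are existing or newly added kernel helpers.

/* Illustrative only: a hypothetical lookup showing the payoff of keying the
 * retransmit queue by sequence number: find the skb covering 'seq' in
 * O(log n) instead of walking a linked list.
 */
static struct sk_buff *rtx_queue_find(struct sock *sk, u32 seq)
{
	struct rb_node *node = sk->tcp_rtx_queue.rb_node;

	while (node) {
		struct sk_buff *skb = rb_to_skb(node);

		if (before(seq, TCP_SKB_CB(skb)->seq))
			node = node->rb_left;
		else if (!before(seq, TCP_SKB_CB(skb)->end_seq))
			node = node->rb_right;
		else
			return skb;	/* seq falls inside this skb */
	}
	return NULL;
}
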
18 changes: 18 additions & 0 deletions include/linux/skbuff.h
@@ -3158,6 +3158,12 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
return __skb_grow(skb, len);
}

#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
#define skb_rb_first(root) rb_to_skb(rb_first(root))
#define skb_rb_last(root) rb_to_skb(rb_last(root))
#define skb_rb_next(skb) rb_to_skb(rb_next(&(skb)->rbnode))
#define skb_rb_prev(skb) rb_to_skb(rb_prev(&(skb)->rbnode))

#define skb_queue_walk(queue, skb) \
for (skb = (queue)->next; \
skb != (struct sk_buff *)(queue); \
@@ -3172,6 +3178,18 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
for (; skb != (struct sk_buff *)(queue); \
skb = skb->next)

#define skb_rbtree_walk(skb, root) \
for (skb = skb_rb_first(root); skb != NULL; \
skb = skb_rb_next(skb))

#define skb_rbtree_walk_from(skb) \
for (; skb != NULL; \
skb = skb_rb_next(skb))

#define skb_rbtree_walk_from_safe(skb, tmp) \
for (; tmp = skb ? skb_rb_next(skb) : NULL, (skb != NULL); \
skb = tmp)

#define skb_queue_walk_from_safe(queue, skb, tmp) \
for (tmp = skb->next; \
skb != (struct sk_buff *)(queue); \
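
The new walkers mirror the existing skb_queue_walk* helpers but iterate an rb_root of skbs in key order. A small usage sketch, assuming an rb-tree whose nodes are linked via skb->rbnode (the function below is hypothetical):

/* Hypothetical helper: count the skbs linked into an rb-tree via skb->rbnode,
 * visiting them in ascending key order with the new walker.
 */
static unsigned int skb_rbtree_count(const struct rb_root *root)
{
	struct sk_buff *skb;
	unsigned int n = 0;

	skb_rbtree_walk(skb, root)
		n++;
	return n;
}

skb_rbtree_walk_from_safe() additionally fetches the next node before the loop body runs, so the body may unlink the current skb from the tree.
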
7 changes: 5 additions & 2 deletions include/net/sock.h
@@ -60,7 +60,7 @@
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/cgroup-defs.h>

#include <linux/rbtree.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
#include <linux/poll.h>
@@ -397,7 +397,10 @@ struct sock {
int sk_wmem_queued;
refcount_t sk_wmem_alloc;
unsigned long sk_tsq_flags;
struct sk_buff *sk_send_head;
union {
struct sk_buff *sk_send_head;
struct rb_root tcp_rtx_queue;
};
struct sk_buff_head sk_write_queue;
__s32 sk_peek_off;
int sk_write_pending;
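
Overlaying tcp_rtx_queue on sk_send_head works because both fields are a single pointer wide (struct rb_root holds one struct rb_node *), and because TCP no longer needs an explicit send-head pointer: unsent skbs stay on sk_write_queue, while skbs that have been transmitted at least once live only in the rb-tree. A sketch of the intended flow; this is not a literal hunk from the series, and the helper placement may differ:

/* Sketch: once an skb is sent for the first time it leaves the linked-list
 * write queue and is inserted into the rb-tree retransmit queue, keyed by
 * its starting sequence number.  It is never on both queues at once.
 */
static void move_skb_to_rtx_queue(struct sock *sk, struct sk_buff *skb)
{
	__skb_unlink(skb, &sk->sk_write_queue);		/* no longer "unsent" */
	tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);	/* now "sent, not acked" */
}
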
100 changes: 46 additions & 54 deletions include/net/tcp.h
@@ -551,7 +551,13 @@ void tcp_xmit_retransmit_queue(struct sock *);
void tcp_simple_retransmit(struct sock *);
void tcp_enter_recovery(struct sock *sk, bool ece_ack);
int tcp_trim_head(struct sock *, struct sk_buff *, u32);
int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t);
enum tcp_queue {
TCP_FRAG_IN_WRITE_QUEUE,
TCP_FRAG_IN_RTX_QUEUE,
};
int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
struct sk_buff *skb, u32 len,
unsigned int mss_now, gfp_t gfp);

void tcp_send_probe0(struct sock *);
void tcp_send_partial(struct sock *);
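
Callers of tcp_fragment() now have to say which container the skb lives on, since splitting an skb on the write queue manipulates a linked list while splitting one on the retransmit queue manipulates the rb-tree. An illustration under that assumption (the wrapper below is hypothetical):

/* Hypothetical caller: split an skb that already sits on the rtx queue. */
static int split_rtx_skb(struct sock *sk, struct sk_buff *skb,
			 u32 len, unsigned int mss_now)
{
	return tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
			    mss_now, GFP_ATOMIC);
}
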
@@ -1606,19 +1612,11 @@ static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb)
skb->_skb_refdst = _save; \
}

/* write queue abstraction */
static inline void tcp_write_queue_purge(struct sock *sk)
{
struct sk_buff *skb;
void tcp_write_queue_purge(struct sock *sk);

tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
tcp_skb_tsorted_anchor_cleanup(skb);
sk_wmem_free_skb(sk, skb);
}
INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
sk_mem_reclaim(sk);
tcp_clear_all_retrans_hints(tcp_sk(sk));
static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk)
{
return skb_rb_first(&sk->tcp_rtx_queue);
}

static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk)
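
tcp_write_queue_purge() stops being an inline helper because it now has to empty two containers. A sketch of the out-of-line version, presumably landing in net/ipv4/tcp.c (not among the files shown here); the exact split between helpers may differ:

void tcp_write_queue_purge(struct sock *sk)
{
	struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
	struct sk_buff *skb;

	tcp_chrono_stop(sk, TCP_CHRONO_BUSY);

	/* Drop unsent skbs from the linked-list write queue. */
	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		tcp_skb_tsorted_anchor_cleanup(skb);
		sk_wmem_free_skb(sk, skb);
	}

	/* Drop sent-but-unacked skbs from the rb-tree retransmit queue. */
	while (p) {
		skb = rb_to_skb(p);
		p = rb_next(p);		/* advance before erasing the node */
		tcp_rtx_queue_unlink_and_free(skb, sk);
	}

	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
	sk_mem_reclaim(sk);
	tcp_clear_all_retrans_hints(tcp_sk(sk));
}
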
@@ -1643,18 +1641,12 @@ static inline struct sk_buff *tcp_write_queue_prev(const struct sock *sk,
return skb_queue_prev(&sk->sk_write_queue, skb);
}

#define tcp_for_write_queue(skb, sk) \
skb_queue_walk(&(sk)->sk_write_queue, skb)

#define tcp_for_write_queue_from(skb, sk) \
skb_queue_walk_from(&(sk)->sk_write_queue, skb)

#define tcp_for_write_queue_from_safe(skb, tmp, sk) \
skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp)

static inline struct sk_buff *tcp_send_head(const struct sock *sk)
{
return sk->sk_send_head;
return skb_peek(&sk->sk_write_queue);
}

static inline bool tcp_skb_is_last(const struct sock *sk,
@@ -1663,29 +1655,30 @@ static inline bool tcp_skb_is_last(const struct sock *sk,
return skb_queue_is_last(&sk->sk_write_queue, skb);
}

static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff *skb)
static inline bool tcp_write_queue_empty(const struct sock *sk)
{
if (tcp_skb_is_last(sk, skb))
sk->sk_send_head = NULL;
else
sk->sk_send_head = tcp_write_queue_next(sk, skb);
return skb_queue_empty(&sk->sk_write_queue);
}

static inline bool tcp_rtx_queue_empty(const struct sock *sk)
{
return RB_EMPTY_ROOT(&sk->tcp_rtx_queue);
}

static inline bool tcp_rtx_and_write_queues_empty(const struct sock *sk)
{
return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk);
}

static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked)
{
if (sk->sk_send_head == skb_unlinked) {
sk->sk_send_head = NULL;
if (tcp_write_queue_empty(sk))
tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
}

if (tcp_sk(sk)->highest_sack == skb_unlinked)
tcp_sk(sk)->highest_sack = NULL;
}

static inline void tcp_init_send_head(struct sock *sk)
{
sk->sk_send_head = NULL;
}

static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
{
__skb_queue_tail(&sk->sk_write_queue, skb);
@@ -1696,8 +1689,7 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb
__tcp_add_write_queue_tail(sk, skb);

/* Queue it, remembering where we must start sending. */
if (sk->sk_send_head == NULL) {
sk->sk_send_head = skb;
if (sk->sk_write_queue.next == skb) {
tcp_chrono_start(sk, TCP_CHRONO_BUSY);

if (tcp_sk(sk)->highest_sack == NULL)
@@ -1710,35 +1702,32 @@ static inline void __tcp_add_write_queue_head(struct sock *sk, struct sk_buff *s
__skb_queue_head(&sk->sk_write_queue, skb);
}

/* Insert buff after skb on the write queue of sk. */
static inline void tcp_insert_write_queue_after(struct sk_buff *skb,
struct sk_buff *buff,
struct sock *sk)
{
__skb_queue_after(&sk->sk_write_queue, skb, buff);
}

/* Insert new before skb on the write queue of sk. */
static inline void tcp_insert_write_queue_before(struct sk_buff *new,
struct sk_buff *skb,
struct sock *sk)
{
__skb_queue_before(&sk->sk_write_queue, skb, new);

if (sk->sk_send_head == skb)
sk->sk_send_head = new;
}

static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
{
list_del(&skb->tcp_tsorted_anchor);
tcp_skb_tsorted_anchor_cleanup(skb);
__skb_unlink(skb, &sk->sk_write_queue);
}

static inline bool tcp_write_queue_empty(struct sock *sk)
void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb);

static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk)
{
return skb_queue_empty(&sk->sk_write_queue);
tcp_skb_tsorted_anchor_cleanup(skb);
rb_erase(&skb->rbnode, &sk->tcp_rtx_queue);
}

static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk)
{
list_del(&skb->tcp_tsorted_anchor);
tcp_rtx_queue_unlink(skb, sk);
sk_wmem_free_skb(sk, skb);
}
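
tcp_rbtree_insert() is only declared above; its body lives in one of the files not shown in this excerpt (presumably net/ipv4/tcp_output.c). A sketch of the expected shape, assuming skbs are ordered by their starting sequence number:

/* Sketch of the insert helper: classic rb-tree insertion keyed by the skb's
 * starting sequence number.
 */
void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct sk_buff *skb1;

	while (*p) {
		parent = *p;
		skb1 = rb_to_skb(parent);
		if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
			p = &parent->rb_left;
		else
			p = &parent->rb_right;
	}
	rb_link_node(&skb->rbnode, parent, p);
	rb_insert_color(&skb->rbnode, root);
}
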

static inline void tcp_push_pending_frames(struct sock *sk)
@@ -1767,8 +1756,9 @@ static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp)

static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb)
{
tcp_sk(sk)->highest_sack = tcp_skb_is_last(sk, skb) ? NULL :
tcp_write_queue_next(sk, skb);
struct sk_buff *next = skb_rb_next(skb);

tcp_sk(sk)->highest_sack = next ?: tcp_send_head(sk);
}

static inline struct sk_buff *tcp_highest_sack(struct sock *sk)
@@ -1778,7 +1768,9 @@ static inline struct sk_buff *tcp_highest_sack(struct sock *sk)

static inline void tcp_highest_sack_reset(struct sock *sk)
{
tcp_sk(sk)->highest_sack = tcp_write_queue_head(sk);
struct sk_buff *skb = tcp_rtx_queue_head(sk);

tcp_sk(sk)->highest_sack = skb ?: tcp_send_head(sk);
}

/* Called when old skb is about to be deleted (to be combined with new skb) */
@@ -1948,7 +1940,7 @@ extern void tcp_rack_reo_timeout(struct sock *sk);
/* At how many usecs into the future should the RTO fire? */
static inline s64 tcp_rto_delta_us(const struct sock *sk)
{
const struct sk_buff *skb = tcp_write_queue_head(sk);
const struct sk_buff *skb = tcp_rtx_queue_head(sk);
u32 rto = inet_csk(sk)->icsk_rto;
u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto);

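
Elsewhere in the series, in files not shown above, code that used to resume a walk of the write queue from a remembered position, such as tcp_xmit_retransmit_queue(), switches to the new rb-tree walkers. A hedged sketch of the pattern (the helper and its use of una are hypothetical):

/* Hypothetical helper: starting from a remembered skb (or the head of the
 * rtx queue), free every skb whose data is entirely acknowledged.  The
 * _safe walker caches the next node before the body runs, so unlinking the
 * current skb inside the loop is fine.
 */
static void rtx_queue_drop_acked(struct sock *sk, struct sk_buff *skb, u32 una)
{
	struct sk_buff *tmp;

	if (!skb)
		skb = skb_rb_first(&sk->tcp_rtx_queue);

	skb_rbtree_walk_from_safe(skb, tmp) {
		if (after(TCP_SKB_CB(skb)->end_seq, una))
			break;		/* first not-fully-acked skb: stop */
		tcp_rtx_queue_unlink_and_free(skb, sk);
	}
}
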
(Diffs for the remaining changed files are not shown here.)