Skip to content

Commit

Permalink
tcp/dccp: get rid of central timewait timer
Browse files Browse the repository at this point in the history
Using a timer wheel for timewait sockets was nice ~15 years ago when
memory was expensive and machines had a single processor.

This does not scale, code is ugly and source of huge latencies
(Typically 30 ms have been seen, cpus spinning on death_lock spinlock.)

We can afford to use an extra 64 bytes per timewait sock and spread
timewait load to all cpus to have better behavior.

Tested:

On following test, /proc/sys/net/ipv4/tcp_tw_recycle is set to 1
on the target (lpaa24)

Before patch :

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
419594

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
437171

While test is running, we can observe 25 or even 33 ms latencies.

lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20601ms
rtt min/avg/max/mdev = 0.020/0.217/25.771/1.535 ms, pipe 2

lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20702ms
rtt min/avg/max/mdev = 0.019/0.183/33.761/1.441 ms, pipe 2

After patch :

About 90% increase of throughput :

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
810442

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
800992

And latencies are kept to minimal values during this load, even
if network utilization is 90% higher :

lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 19991ms
rtt min/avg/max/mdev = 0.023/0.064/0.360/0.042 ms

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
Eric Dumazet authored and David S. Miller committed Apr 13, 2015
1 parent 20a1d16 commit 789f558
Show file tree
Hide file tree
Showing 11 changed files with 69 additions and 386 deletions.
107 changes: 9 additions & 98 deletions include/net/inet_timewait_sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,67 +31,14 @@

struct inet_hashinfo;

#define INET_TWDR_RECYCLE_SLOTS_LOG 5
#define INET_TWDR_RECYCLE_SLOTS (1 << INET_TWDR_RECYCLE_SLOTS_LOG)

/*
* If time > 4sec, it is "slow" path, no recycling is required,
* so that we select tick to get range about 4 seconds.
*/
#if HZ <= 16 || HZ > 4096
# error Unsupported: HZ <= 16 or HZ > 4096
#elif HZ <= 32
# define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#elif HZ <= 64
# define INET_TWDR_RECYCLE_TICK (6 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#elif HZ <= 128
# define INET_TWDR_RECYCLE_TICK (7 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#elif HZ <= 256
# define INET_TWDR_RECYCLE_TICK (8 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#elif HZ <= 512
# define INET_TWDR_RECYCLE_TICK (9 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#elif HZ <= 1024
# define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#elif HZ <= 2048
# define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#else
# define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#endif

static inline u32 inet_tw_time_stamp(void)
{
return jiffies;
}

/* TIME_WAIT reaping mechanism. */
#define INET_TWDR_TWKILL_SLOTS 8 /* Please keep this a power of 2. */

#define INET_TWDR_TWKILL_QUOTA 100

struct inet_timewait_death_row {
/* Short-time timewait calendar */
int twcal_hand;
unsigned long twcal_jiffie;
struct timer_list twcal_timer;
struct hlist_head twcal_row[INET_TWDR_RECYCLE_SLOTS];

spinlock_t death_lock;
int tw_count;
int period;
u32 thread_slots;
struct work_struct twkill_work;
struct timer_list tw_timer;
int slot;
struct hlist_head cells[INET_TWDR_TWKILL_SLOTS];
struct inet_hashinfo *hashinfo;
atomic_t tw_count;

struct inet_hashinfo *hashinfo ____cacheline_aligned_in_smp;
int sysctl_tw_recycle;
int sysctl_max_tw_buckets;
};

void inet_twdr_hangman(unsigned long data);
void inet_twdr_twkill_work(struct work_struct *work);
void inet_twdr_twcal_tick(unsigned long data);

struct inet_bind_bucket;

/*
Expand Down Expand Up @@ -133,52 +80,18 @@ struct inet_timewait_sock {
__be16 tw_sport;
kmemcheck_bitfield_begin(flags);
/* And these are ours. */
unsigned int tw_pad0 : 1, /* 1 bit hole */
unsigned int tw_kill : 1,
tw_transparent : 1,
tw_flowlabel : 20,
tw_pad : 2, /* 2 bits hole */
tw_tos : 8;
kmemcheck_bitfield_end(flags);
u32 tw_ttd;
struct timer_list tw_timer;
struct inet_bind_bucket *tw_tb;
struct hlist_node tw_death_node;
struct inet_timewait_death_row *tw_dr;
};
#define tw_tclass tw_tos

static inline int inet_twsk_dead_hashed(const struct inet_timewait_sock *tw)
{
return !hlist_unhashed(&tw->tw_death_node);
}

static inline void inet_twsk_dead_node_init(struct inet_timewait_sock *tw)
{
tw->tw_death_node.pprev = NULL;
}

static inline void __inet_twsk_del_dead_node(struct inet_timewait_sock *tw)
{
__hlist_del(&tw->tw_death_node);
inet_twsk_dead_node_init(tw);
}

static inline int inet_twsk_del_dead_node(struct inet_timewait_sock *tw)
{
if (inet_twsk_dead_hashed(tw)) {
__inet_twsk_del_dead_node(tw);
return 1;
}
return 0;
}

#define inet_twsk_for_each(tw, node, head) \
hlist_nulls_for_each_entry(tw, node, head, tw_node)

#define inet_twsk_for_each_inmate(tw, jail) \
hlist_for_each_entry(tw, jail, tw_death_node)

#define inet_twsk_for_each_inmate_safe(tw, safe, jail) \
hlist_for_each_entry_safe(tw, safe, jail, tw_death_node)

static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
{
return (struct inet_timewait_sock *)sk;
Expand All @@ -193,16 +106,14 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
struct inet_hashinfo *hashinfo);

struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
struct inet_timewait_death_row *dr,
const int state);

void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
struct inet_hashinfo *hashinfo);

void inet_twsk_schedule(struct inet_timewait_sock *tw,
struct inet_timewait_death_row *twdr,
const int timeo, const int timewait_len);
void inet_twsk_deschedule(struct inet_timewait_sock *tw,
struct inet_timewait_death_row *twdr);
void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo);
void inet_twsk_deschedule(struct inet_timewait_sock *tw);

void inet_twsk_purge(struct inet_hashinfo *hashinfo,
struct inet_timewait_death_row *twdr, int family);
Expand Down
19 changes: 3 additions & 16 deletions net/dccp/minisocks.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,28 +27,16 @@

struct inet_timewait_death_row dccp_death_row = {
.sysctl_max_tw_buckets = NR_FILE * 2,
.period = DCCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
.death_lock = __SPIN_LOCK_UNLOCKED(dccp_death_row.death_lock),
.hashinfo = &dccp_hashinfo,
.tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,
(unsigned long)&dccp_death_row),
.twkill_work = __WORK_INITIALIZER(dccp_death_row.twkill_work,
inet_twdr_twkill_work),
/* Short-time timewait calendar */

.twcal_hand = -1,
.twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
(unsigned long)&dccp_death_row),
};

EXPORT_SYMBOL_GPL(dccp_death_row);

void dccp_time_wait(struct sock *sk, int state, int timeo)
{
struct inet_timewait_sock *tw = NULL;
struct inet_timewait_sock *tw;

if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets)
tw = inet_twsk_alloc(sk, state);
tw = inet_twsk_alloc(sk, &dccp_death_row, state);

if (tw != NULL) {
const struct inet_connection_sock *icsk = inet_csk(sk);
Expand All @@ -71,8 +59,7 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
if (state == DCCP_TIME_WAIT)
timeo = DCCP_TIMEWAIT_LEN;

inet_twsk_schedule(tw, &dccp_death_row, timeo,
DCCP_TIMEWAIT_LEN);
inet_twsk_schedule(tw, timeo);
inet_twsk_put(tw);
} else {
/* Sorry, if we're out of memory, just CLOSE this
Expand Down
4 changes: 2 additions & 2 deletions net/ipv4/inet_diag.c
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ static int inet_twsk_diag_fill(struct sock *sk,
struct inet_timewait_sock *tw = inet_twsk(sk);
struct inet_diag_msg *r;
struct nlmsghdr *nlh;
s32 tmo;
long tmo;

nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
nlmsg_flags);
Expand All @@ -258,7 +258,7 @@ static int inet_twsk_diag_fill(struct sock *sk,
r = nlmsg_data(nlh);
BUG_ON(tw->tw_state != TCP_TIME_WAIT);

tmo = tw->tw_ttd - inet_tw_time_stamp();
tmo = tw->tw_timer.expires - jiffies;
if (tmo < 0)
tmo = 0;

Expand Down
4 changes: 2 additions & 2 deletions net/ipv4/inet_hashtables.c
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
*twp = tw;
} else if (tw) {
/* Silly. Should hash-dance instead... */
inet_twsk_deschedule(tw, death_row);
inet_twsk_deschedule(tw);

inet_twsk_put(tw);
}
Expand Down Expand Up @@ -565,7 +565,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
spin_unlock(&head->lock);

if (tw) {
inet_twsk_deschedule(tw, death_row);
inet_twsk_deschedule(tw);
while (twrefcnt) {
twrefcnt--;
inet_twsk_put(tw);
Expand Down
Loading

0 comments on commit 789f558

Please sign in to comment.