Skip to content

Commit

Permalink
tcp: Fix slowness in read /proc/net/tcp
Browse files Browse the repository at this point in the history
This patch address a serious performance issue in reading the
TCP sockets table (/proc/net/tcp).

Reading the full table is done by a number of sequential read
operations.  At each read operation, a seek is done to find the
last socket that was previously read.  This seek operation requires
that the sockets in the table need to be counted up to the current
file position, and to count each of these requires taking a lock for
each non-empty bucket.  The whole algorithm is O(n^2).

The fix is to cache the last bucket value, offset within the bucket,
and the file position returned by the last read operation.   On the
next sequential read, the bucket and offset are used to find the
last read socket immediately without needing ot scan the previous
buckets  the table.  This algorithm t read the whole table is O(n).

The improvement offered by this patch is easily show by performing
cat'ing /proc/net/tcp on a machine with a lot of connections.  With
about 182K connections in the table, I see the following:

- Without patch
time cat /proc/net/tcp > /dev/null

real	1m56.729s
user	0m0.214s
sys	1m56.344s

- With patch
time cat /proc/net/tcp > /dev/null

real	0m0.894s
user	0m0.290s
sys	0m0.594s

Signed-off-by: Tom Herbert <therbert@google.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
Tom Herbert authored and David S. Miller committed Jun 7, 2010
1 parent 83038a2 commit a8b690f
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 9 deletions.
3 changes: 2 additions & 1 deletion include/net/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -1413,7 +1413,8 @@ struct tcp_iter_state {
sa_family_t family;
enum tcp_seq_states state;
struct sock *syn_wait_sk;
int bucket, sbucket, num, uid;
int bucket, offset, sbucket, num, uid;
loff_t last_pos;
};

extern int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo);
Expand Down
92 changes: 84 additions & 8 deletions net/ipv4/tcp_ipv4.c
Original file line number Diff line number Diff line change
Expand Up @@ -1980,6 +1980,11 @@ static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}

/*
* Get next listener socket follow cur. If cur is NULL, get first socket
* starting from bucket given in st->bucket; when st->bucket is zero the
* very first socket in the hash table is returned.
*/
static void *listening_get_next(struct seq_file *seq, void *cur)
{
struct inet_connection_sock *icsk;
Expand All @@ -1990,14 +1995,15 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
struct net *net = seq_file_net(seq);

if (!sk) {
st->bucket = 0;
ilb = &tcp_hashinfo.listening_hash[0];
ilb = &tcp_hashinfo.listening_hash[st->bucket];
spin_lock_bh(&ilb->lock);
sk = sk_nulls_head(&ilb->head);
st->offset = 0;
goto get_sk;
}
ilb = &tcp_hashinfo.listening_hash[st->bucket];
++st->num;
++st->offset;

if (st->state == TCP_SEQ_STATE_OPENREQ) {
struct request_sock *req = cur;
Expand All @@ -2012,6 +2018,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
}
req = req->dl_next;
}
st->offset = 0;
if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
break;
get_req:
Expand Down Expand Up @@ -2047,6 +2054,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
}
spin_unlock_bh(&ilb->lock);
st->offset = 0;
if (++st->bucket < INET_LHTABLE_SIZE) {
ilb = &tcp_hashinfo.listening_hash[st->bucket];
spin_lock_bh(&ilb->lock);
Expand All @@ -2060,7 +2068,12 @@ static void *listening_get_next(struct seq_file *seq, void *cur)

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
void *rc = listening_get_next(seq, NULL);
struct tcp_iter_state *st = seq->private;
void *rc;

st->bucket = 0;
st->offset = 0;
rc = listening_get_next(seq, NULL);

while (rc && *pos) {
rc = listening_get_next(seq, rc);
Expand All @@ -2075,13 +2088,18 @@ static inline int empty_bucket(struct tcp_iter_state *st)
hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
}

/*
* Get first established socket starting from bucket given in st->bucket.
* If st->bucket is zero, the very first socket in the hash is returned.
*/
static void *established_get_first(struct seq_file *seq)
{
struct tcp_iter_state *st = seq->private;
struct net *net = seq_file_net(seq);
void *rc = NULL;

for (st->bucket = 0; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
st->offset = 0;
for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
struct sock *sk;
struct hlist_nulls_node *node;
struct inet_timewait_sock *tw;
Expand Down Expand Up @@ -2126,6 +2144,7 @@ static void *established_get_next(struct seq_file *seq, void *cur)
struct net *net = seq_file_net(seq);

++st->num;
++st->offset;

if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
tw = cur;
Expand All @@ -2142,6 +2161,7 @@ static void *established_get_next(struct seq_file *seq, void *cur)
st->state = TCP_SEQ_STATE_ESTABLISHED;

/* Look for next non empty bucket */
st->offset = 0;
while (++st->bucket <= tcp_hashinfo.ehash_mask &&
empty_bucket(st))
;
Expand Down Expand Up @@ -2169,7 +2189,11 @@ static void *established_get_next(struct seq_file *seq, void *cur)

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
void *rc = established_get_first(seq);
struct tcp_iter_state *st = seq->private;
void *rc;

st->bucket = 0;
rc = established_get_first(seq);

while (rc && pos) {
rc = established_get_next(seq, rc);
Expand All @@ -2194,31 +2218,81 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
struct tcp_iter_state *st = seq->private;
int offset = st->offset;
int orig_num = st->num;
void *rc = NULL;

switch (st->state) {
case TCP_SEQ_STATE_OPENREQ:
case TCP_SEQ_STATE_LISTENING:
if (st->bucket >= INET_LHTABLE_SIZE)
break;
st->state = TCP_SEQ_STATE_LISTENING;
rc = listening_get_next(seq, NULL);
while (offset-- && rc)
rc = listening_get_next(seq, rc);
if (rc)
break;
st->bucket = 0;
/* Fallthrough */
case TCP_SEQ_STATE_ESTABLISHED:
case TCP_SEQ_STATE_TIME_WAIT:
st->state = TCP_SEQ_STATE_ESTABLISHED;
if (st->bucket > tcp_hashinfo.ehash_mask)
break;
rc = established_get_first(seq);
while (offset-- && rc)
rc = established_get_next(seq, rc);
}

st->num = orig_num;

return rc;
}

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
struct tcp_iter_state *st = seq->private;
void *rc;

if (*pos && *pos == st->last_pos) {
rc = tcp_seek_last_pos(seq);
if (rc)
goto out;
}

st->state = TCP_SEQ_STATE_LISTENING;
st->num = 0;
return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
st->bucket = 0;
st->offset = 0;
rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
st->last_pos = *pos;
return rc;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct tcp_iter_state *st = seq->private;
void *rc = NULL;
struct tcp_iter_state *st;

if (v == SEQ_START_TOKEN) {
rc = tcp_get_idx(seq, 0);
goto out;
}
st = seq->private;

switch (st->state) {
case TCP_SEQ_STATE_OPENREQ:
case TCP_SEQ_STATE_LISTENING:
rc = listening_get_next(seq, v);
if (!rc) {
st->state = TCP_SEQ_STATE_ESTABLISHED;
st->bucket = 0;
st->offset = 0;
rc = established_get_first(seq);
}
break;
Expand All @@ -2229,6 +2303,7 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
out:
++*pos;
st->last_pos = *pos;
return rc;
}

Expand Down Expand Up @@ -2267,6 +2342,7 @@ static int tcp_seq_open(struct inode *inode, struct file *file)

s = ((struct seq_file *)file->private_data)->private;
s->family = afinfo->family;
s->last_pos = 0;
return 0;
}

Expand Down

0 comments on commit a8b690f

Please sign in to comment.