Skip to content

Commit

Permalink
net: Convert TCP/DCCP listening hash tables to use RCU
Browse files Browse the repository at this point in the history
This is the last step needed to perform full RCU lookups
in __inet_lookup(): after the established/timewait tables, we
add RCU lookups to the listening hash table.

The only trick here is that a socket of a given type (TCP ipv4,
TCP ipv6, ...) can now move between two different tables
(established and listening) during an RCU grace period, so we
must use different 'nulls' end-of-chain values for the two tables.

We define a large value :

#define LISTENING_NULLS_BASE (1U << 29)

This guarantees that slots in the listening table have different
end-of-chain values than slots in the established table, so a reader
can still detect whether it finished its lookup in the right chain.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
Eric Dumazet authored and David S. Miller committed Nov 24, 2008
1 parent 8c862c2 commit c25eb3b
Show file tree
Hide file tree
Showing 5 changed files with 147 additions and 116 deletions.
9 changes: 8 additions & 1 deletion include/net/inet_hashtables.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,16 @@ struct inet_bind_hashbucket {
struct hlist_head chain;
};

/*
* Sockets can be hashed in either the established or the listening table.
* We must use a different 'nulls' end-of-chain value for the listening
* hash table, or we might find a socket that was closed and then
* reallocated/inserted into the established hash table.
*/
/* Added to a listening slot index to form that chain's 'nulls' end marker. */
#define LISTENING_NULLS_BASE (1U << 29)
/*
 * One slot of the listening hash table: a spinlock plus the head of the
 * slot's chain.
 *
 * NOTE(review): this is a rendered diff, so two declarations of 'head'
 * appear below. The change summary for this header reports one deletion,
 * presumably the hlist_head line - confirm against the real file.
 */
struct inet_listen_hashbucket {
spinlock_t lock;
struct hlist_head head;
struct hlist_nulls_head head;
};

/* This is for listening sockets, thus all sockets which possess wildcards. */
Expand Down
4 changes: 2 additions & 2 deletions net/ipv4/inet_diag.c
Original file line number Diff line number Diff line change
Expand Up @@ -720,13 +720,13 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)

for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
struct sock *sk;
struct hlist_node *node;
struct hlist_nulls_node *node;
struct inet_listen_hashbucket *ilb;

num = 0;
ilb = &hashinfo->listening_hash[i];
spin_lock_bh(&ilb->lock);
sk_for_each(sk, node, &ilb->head) {
sk_nulls_for_each(sk, node, &ilb->head) {
struct inet_sock *inet = inet_sk(sk);

if (num < s_num) {
Expand Down
148 changes: 74 additions & 74 deletions net/ipv4/inet_hashtables.c
Original file line number Diff line number Diff line change
Expand Up @@ -110,78 +110,79 @@ void __inet_inherit_port(struct sock *sk, struct sock *child)

EXPORT_SYMBOL_GPL(__inet_inherit_port);

/*
 * Rate how well socket @sk matches a lookup in namespace @net for port
 * @hnum (compared with inet->num), address @daddr (compared with the
 * socket's rcv_saddr) and interface @dif (compared with sk_bound_dev_if).
 *
 * Returns -1 when the socket does not match. Otherwise the score starts at
 * 1 for a PF_INET socket (0 for any other family) and gains 2 for a
 * matching non-zero rcv_saddr and 2 for a matching bound device.
 */
static inline int compute_score(struct sock *sk, struct net *net,
				const unsigned short hnum, const __be32 daddr,
				const int dif)
{
	struct inet_sock *isk = inet_sk(sk);
	__be32 bound_addr;
	int points;

	/* Wrong namespace, wrong port, or an IPv6-only socket: no match. */
	if (!net_eq(sock_net(sk), net) || isk->num != hnum ||
	    ipv6_only_sock(sk))
		return -1;

	bound_addr = isk->rcv_saddr;
	points = (sk->sk_family == PF_INET) ? 1 : 0;

	/* A zero rcv_saddr neither rejects the socket nor adds to the score. */
	if (bound_addr) {
		if (bound_addr != daddr)
			return -1;
		points += 2;
	}

	/* Likewise for a socket that is not bound to a device. */
	if (sk->sk_bound_dev_if) {
		if (sk->sk_bound_dev_if != dif)
			return -1;
		points += 2;
	}

	return points;
}

/*
 * Deliberately kept out of line. The BSD API gives a listening socket no
 * way to specify the remote address or remote port of a connection, so both
 * are always treated as wildcards during this search.
 *
 * Walks the chain at @head and returns the best-scoring socket for
 * (@net, @daddr, @hnum, @dif), or NULL when nothing matches. A socket that
 * reaches a score of 5 is returned immediately without scanning the rest.
 */
static struct sock *inet_lookup_listener_slow(struct net *net,
					      const struct hlist_head *head,
					      const __be32 daddr,
					      const unsigned short hnum,
					      const int dif)
{
	struct sock *best = NULL, *sk;
	const struct hlist_node *node;
	int best_score = -1;

	sk_for_each(sk, node, head) {
		const struct inet_sock *isk = inet_sk(sk);
		__be32 bound_addr;
		int score;

		/* Skip sockets from another namespace/port and IPv6-only ones. */
		if (!net_eq(sock_net(sk), net) || isk->num != hnum ||
		    ipv6_only_sock(sk))
			continue;

		bound_addr = isk->rcv_saddr;
		score = (sk->sk_family == PF_INET) ? 1 : 0;

		if (bound_addr) {
			if (bound_addr != daddr)
				continue;
			score += 2;
		}
		if (sk->sk_bound_dev_if) {
			if (sk->sk_bound_dev_if != dif)
				continue;
			score += 2;
		}

		/* 1 + 2 + 2: nothing further down the chain can score higher. */
		if (score == 5)
			return sk;
		if (score > best_score) {
			best_score = score;
			best = sk;
		}
	}
	return best;
}

/* Optimize the common listener case. */

/*
 * Find the listening socket that best matches (@net, @daddr, @hnum, @dif)
 * in the slot of hashinfo->listening_hash selected by
 * inet_lhashfn(net, hnum).
 *
 * NOTE(review): this span is a rendered diff, so two variants of the body
 * are interleaved and it does not compile as shown:
 *  - one takes ilb->lock, special-cases the first socket of the chain,
 *    otherwise calls inet_lookup_listener_slow(), and returns sk after
 *    sock_hold();
 *  - one runs under rcu_read_lock(), scores every socket on the chain with
 *    compute_score(), and returns result.
 * Going by the commit title, the RCU variant is presumably the resulting
 * code - confirm against the real file.
 */
struct sock *__inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
const __be32 daddr, const unsigned short hnum,
const int dif)
{
struct sock *sk = NULL;
struct inet_listen_hashbucket *ilb;
struct sock *sk, *result;
struct hlist_nulls_node *node;
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
int score, hiscore;

ilb = &hashinfo->listening_hash[inet_lhashfn(net, hnum)];
spin_lock(&ilb->lock);
if (!hlist_empty(&ilb->head)) {
const struct inet_sock *inet = inet_sk((sk = __sk_head(&ilb->head)));

/*
 * Shortcut: the head socket has no next node, is not bound to a device,
 * and matches port, address, family and namespace.
 */
if (inet->num == hnum && !sk->sk_node.next &&
(!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
(sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
!sk->sk_bound_dev_if && net_eq(sock_net(sk), net))
goto sherry_cache;
sk = inet_lookup_listener_slow(net, &ilb->head, daddr, hnum, dif);
rcu_read_lock();
begin:
result = NULL;
hiscore = -1;
/* Keep the highest-scoring socket seen on this chain. */
sk_nulls_for_each_rcu(sk, node, &ilb->head) {
score = compute_score(sk, net, hnum, daddr, dif);
if (score > hiscore) {
result = sk;
hiscore = score;
}
}
if (sk) {
sherry_cache:
sock_hold(sk);
/*
* if the nulls value we got at the end of this lookup is
* not the expected one, we must restart lookup.
* We probably met an item that was moved to another chain.
*/
if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
goto begin;
/*
 * Take a reference only if the refcount is still non-zero. Then score the
 * socket again: if it now scores below hiscore, drop the reference and
 * restart the lookup.
 */
if (result) {
if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
result = NULL;
else if (unlikely(compute_score(result, net, hnum, daddr,
dif) < hiscore)) {
sock_put(result);
goto begin;
}
}
spin_unlock(&ilb->lock);
return sk;
rcu_read_unlock();
return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);

Expand Down Expand Up @@ -370,7 +371,7 @@ static void __inet_hash(struct sock *sk)
ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];

spin_lock(&ilb->lock);
__sk_add_node(sk, &ilb->head);
__sk_nulls_add_node_rcu(sk, &ilb->head);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
spin_unlock(&ilb->lock);
}
Expand All @@ -388,26 +389,22 @@ EXPORT_SYMBOL_GPL(inet_hash);
/*
 * Remove @sk from the hash chain it sits on and, if it was actually
 * removed, decrement the protocol's in-use counter. Does nothing when
 * sk_unhashed(sk) is true.
 *
 * NOTE(review): this span is a rendered diff, so two variants of the body
 * are interleaved and it does not compile as shown:
 *  - an if/else with one locked section per table, using
 *    __sk_del_node_init() for TCP_LISTEN sockets;
 *  - a variant that first picks the lock (listening bucket lock for
 *    TCP_LISTEN, inet_ehash_lockp() otherwise) and then runs a single
 *    locked section using __sk_nulls_del_node_init_rcu().
 * Going by the commit title, the second variant is presumably the resulting
 * code - confirm against the real file.
 */
void inet_unhash(struct sock *sk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
spinlock_t *lock;
int done;

if (sk_unhashed(sk))
return;

if (sk->sk_state == TCP_LISTEN) {
struct inet_listen_hashbucket *ilb;
if (sk->sk_state == TCP_LISTEN)
lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
else
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
spin_lock_bh(&ilb->lock);
if (__sk_del_node_init(sk))
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
spin_unlock_bh(&ilb->lock);
} else {
spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

spin_lock_bh(lock);
if (__sk_nulls_del_node_init_rcu(sk))
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
spin_unlock_bh(lock);
}
spin_lock_bh(lock);
done =__sk_nulls_del_node_init_rcu(sk);
spin_unlock_bh(lock);
/* In this variant the in-use counter is updated after the lock is dropped. */
if (done)
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}
EXPORT_SYMBOL_GPL(inet_unhash);

Expand Down Expand Up @@ -526,8 +523,11 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
{
/*
 * Initialise every listening slot: its spinlock and its chain head. Each
 * chain's 'nulls' end marker is the slot index plus LISTENING_NULLS_BASE.
 *
 * NOTE(review): this is a rendered diff, so the loop header appears twice
 * below (without and with an opening brace).
 */
int i;

for (i = 0; i < INET_LHTABLE_SIZE; i++)
for (i = 0; i < INET_LHTABLE_SIZE; i++) {
spin_lock_init(&h->listening_hash[i].lock);
INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
i + LISTENING_NULLS_BASE);
}
}

EXPORT_SYMBOL_GPL(inet_hashinfo_init);
8 changes: 4 additions & 4 deletions net/ipv4/tcp_ipv4.c
Original file line number Diff line number Diff line change
Expand Up @@ -1868,7 +1868,7 @@ static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
static void *listening_get_next(struct seq_file *seq, void *cur)
{
struct inet_connection_sock *icsk;
struct hlist_node *node;
struct hlist_nulls_node *node;
struct sock *sk = cur;
struct inet_listen_hashbucket *ilb;
struct tcp_iter_state *st = seq->private;
Expand All @@ -1878,7 +1878,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
st->bucket = 0;
ilb = &tcp_hashinfo.listening_hash[0];
spin_lock_bh(&ilb->lock);
sk = sk_head(&ilb->head);
sk = sk_nulls_head(&ilb->head);
goto get_sk;
}
ilb = &tcp_hashinfo.listening_hash[st->bucket];
Expand Down Expand Up @@ -1914,7 +1914,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
sk = sk_next(sk);
}
get_sk:
sk_for_each_from(sk, node) {
sk_nulls_for_each_from(sk, node) {
if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
cur = sk;
goto out;
Expand All @@ -1935,7 +1935,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
if (++st->bucket < INET_LHTABLE_SIZE) {
ilb = &tcp_hashinfo.listening_hash[st->bucket];
spin_lock_bh(&ilb->lock);
sk = sk_head(&ilb->head);
sk = sk_nulls_head(&ilb->head);
goto get_sk;
}
cur = NULL;
Expand Down
94 changes: 59 additions & 35 deletions net/ipv6/inet6_hashtables.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ void __inet6_hash(struct sock *sk)

ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
spin_lock(&ilb->lock);
__sk_add_node(sk, &ilb->head);
__sk_nulls_add_node_rcu(sk, &ilb->head);
spin_unlock(&ilb->lock);
} else {
unsigned int hash;
Expand Down Expand Up @@ -118,47 +118,71 @@ struct sock *__inet6_lookup_established(struct net *net,
}
EXPORT_SYMBOL(__inet6_lookup_established);

/*
 * Rate how well socket @sk matches a lookup in namespace @net for port
 * @hnum (compared with inet_sk(sk)->num), address @daddr (compared with the
 * socket's IPv6 rcv_saddr) and interface @dif (compared with
 * sk_bound_dev_if).
 *
 * Returns -1 when the socket does not match. Otherwise the score is 1, plus
 * 1 for a matching non-wildcard rcv_saddr, plus 1 for a matching bound
 * device.
 */
static inline int compute_score(struct sock *sk, struct net *net,
				const unsigned short hnum,
				const struct in6_addr *daddr,
				const int dif)
{
	const struct ipv6_pinfo *np;
	int points = 1;

	/* Only PF_INET6 sockets in this namespace on this port can match. */
	if (!net_eq(sock_net(sk), net) || inet_sk(sk)->num != hnum ||
	    sk->sk_family != PF_INET6)
		return -1;

	np = inet6_sk(sk);

	/* A wildcard rcv_saddr neither rejects the socket nor adds a point. */
	if (!ipv6_addr_any(&np->rcv_saddr)) {
		if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
			return -1;
		points++;
	}

	/* Likewise for a socket that is not bound to a device. */
	if (sk->sk_bound_dev_if) {
		if (sk->sk_bound_dev_if != dif)
			return -1;
		points++;
	}

	return points;
}

/*
 * Find the listening socket that best matches (@net, @daddr, @hnum, @dif)
 * in the slot of hashinfo->listening_hash selected by
 * inet_lhashfn(net, hnum).
 *
 * NOTE(review): this span is a rendered diff, so two variants of the body
 * are interleaved and it does not compile as shown:
 *  - one takes ilb->lock, scores sockets inline (stopping at a score of 3),
 *    and takes its reference with sock_hold();
 *  - one runs under rcu_read_lock(), scores sockets with compute_score(),
 *    and takes its reference with atomic_inc_not_zero().
 * Going by the commit title, the RCU variant is presumably the resulting
 * code - confirm against the real file.
 */
struct sock *inet6_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo, const struct in6_addr *daddr,
const unsigned short hnum, const int dif)
{
struct sock *sk;
const struct hlist_node *node;
struct sock *result = NULL;
int score, hiscore = 0;
struct inet_listen_hashbucket *ilb;

ilb = &hashinfo->listening_hash[inet_lhashfn(net, hnum)];
spin_lock(&ilb->lock);
sk_for_each(sk, node, &ilb->head) {
if (net_eq(sock_net(sk), net) && inet_sk(sk)->num == hnum &&
sk->sk_family == PF_INET6) {
const struct ipv6_pinfo *np = inet6_sk(sk);

score = 1;
if (!ipv6_addr_any(&np->rcv_saddr)) {
if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
continue;
score++;
}
if (sk->sk_bound_dev_if) {
if (sk->sk_bound_dev_if != dif)
continue;
score++;
}
if (score == 3) {
result = sk;
break;
}
if (score > hiscore) {
hiscore = score;
result = sk;
}
const struct hlist_nulls_node *node;
struct sock *result;
int score, hiscore;
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];

rcu_read_lock();
begin:
result = NULL;
hiscore = -1;
/*
 * Keep the highest-scoring socket seen on this chain.
 * NOTE(review): the IPv4 __inet_lookup_listener() walks its chain with
 * sk_nulls_for_each_rcu(); this walk uses sk_nulls_for_each() although it
 * runs under rcu_read_lock() - confirm that is intended.
 */
sk_nulls_for_each(sk, node, &ilb->head) {
score = compute_score(sk, net, hnum, daddr, dif);
if (score > hiscore) {
hiscore = score;
result = sk;
}
}
if (result)
sock_hold(result);
spin_unlock(&ilb->lock);
/*
* if the nulls value we got at the end of this lookup is
* not the expected one, we must restart lookup.
* We probably met an item that was moved to another chain.
*/
if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
goto begin;
/*
 * Take a reference only if the refcount is still non-zero. Then score the
 * socket again: if it now scores below hiscore, drop the reference and
 * restart the lookup.
 */
if (result) {
if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
result = NULL;
else if (unlikely(compute_score(result, net, hnum, daddr,
dif) < hiscore)) {
sock_put(result);
goto begin;
}
}
rcu_read_unlock();
return result;
}

Expand Down

0 comments on commit c25eb3b

Please sign in to comment.