netfilter: conntrack: use a single hashtable for all namespaces
We already include the netns address in the hash and compare the netns pointers
during lookup, so even if namespaces have overlapping addresses, entries
will be spread across the table.
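
As a rough illustration of that scheme, here is a standalone toy model — not the kernel code; the netns/tuple/entry types and the hash function are invented for the sketch, while net_hash_mix() and net_eq() are the kernel helpers the real code uses to mix in and compare the namespace:

#include <stdint.h>
#include <stddef.h>

/* Toy model: one shared bucket array for all namespaces.  The
 * namespace address is mixed into the hash, and the namespace
 * pointer is compared again during lookup. */

struct netns { int id; };                /* stand-in for struct net */
struct tuple { uint32_t src, dst; };     /* stand-in for the conntrack tuple */

struct entry {
	struct tuple t;
	const struct netns *net;         /* owning namespace */
	struct entry *next;
};

#define HASHSIZE 64
static struct entry *table[HASHSIZE];

static uint32_t hash(const struct tuple *t, const struct netns *net)
{
	/* mix the netns address in, as net_hash_mix() does */
	uint32_t mix = (uint32_t)(uintptr_t)net;

	return (t->src * 2654435761u ^ t->dst ^ mix) % HASHSIZE;
}

static struct entry *lookup(const struct tuple *t, const struct netns *net)
{
	struct entry *e;

	for (e = table[hash(t, net)]; e; e = e->next)
		/* netns pointer check, as net_eq() does */
		if (e->net == net && e->t.src == t->src && e->t.dst == t->dst)
			return e;
	return NULL;
}

Even when two namespaces hold connections with identical tuples, the mixed-in namespace value spreads them across different buckets, and the pointer comparison keeps a lookup from ever returning another namespace's entry.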

Assuming a 64k bucket size, this change saves 0.5 MByte per namespace on a
64-bit system.
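
(The arithmetic: each bucket head is a single pointer-sized struct
hlist_nulls_head, so 65536 buckets x 8 bytes = 512 KiB — the roughly
0.5 MByte that every additional namespace previously allocated for its
own copy of the table.)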

The NAT bysrc and expectation hashes are still per namespace; those will
be changed soon, too.

A future patch will also make the conntrack object slab cache global again.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Florian Westphal authored and Pablo Neira Ayuso committed May 5, 2016
1 parent 1b8c8a9 commit 56d52d4
Showing 10 changed files with 62 additions and 68 deletions.
1 change: 1 addition & 0 deletions include/net/netfilter/nf_conntrack_core.h
@@ -81,6 +81,7 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
 
 #define CONNTRACK_LOCKS 1024
 
+extern struct hlist_nulls_head *nf_conntrack_hash;
 extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
 void nf_conntrack_lock(spinlock_t *lock);
 
2 changes: 0 additions & 2 deletions include/net/netns/conntrack.h
@@ -93,9 +93,7 @@ struct netns_ct {
 	int			sysctl_tstamp;
 	int			sysctl_checksum;
 
-	unsigned int		htable_size;
 	struct kmem_cache	*nf_conntrack_cachep;
-	struct hlist_nulls_head	*hash;
 	struct hlist_head	*expect_hash;
 	struct ct_pcpu __percpu *pcpu_lists;
 	struct ip_conntrack_stat __percpu *stat;
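
With htable_size and hash gone, struct netns_ct loses two words per
namespace; the expectation hash, per-CPU lists and statistics stay
per-namespace for now, matching the commit message above.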
2 changes: 1 addition & 1 deletion net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -360,7 +360,7 @@ static int ipv4_init_net(struct net *net)
 
 	in->ctl_table[0].data = &nf_conntrack_max;
 	in->ctl_table[1].data = &net->ct.count;
-	in->ctl_table[2].data = &net->ct.htable_size;
+	in->ctl_table[2].data = &nf_conntrack_htable_size;
 	in->ctl_table[3].data = &net->ct.sysctl_checksum;
 	in->ctl_table[4].data = &net->ct.sysctl_log_invalid;
 #endif
10 changes: 4 additions & 6 deletions net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -31,15 +31,14 @@ struct ct_iter_state {
 
 static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
 {
-	struct net *net = seq_file_net(seq);
 	struct ct_iter_state *st = seq->private;
 	struct hlist_nulls_node *n;
 
 	for (st->bucket = 0;
-	     st->bucket < net->ct.htable_size;
+	     st->bucket < nf_conntrack_htable_size;
 	     st->bucket++) {
 		n = rcu_dereference(
-			hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
+			hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket]));
 		if (!is_a_nulls(n))
 			return n;
 	}
@@ -49,17 +48,16 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
 static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
 					    struct hlist_nulls_node *head)
 {
-	struct net *net = seq_file_net(seq);
 	struct ct_iter_state *st = seq->private;
 
 	head = rcu_dereference(hlist_nulls_next_rcu(head));
 	while (is_a_nulls(head)) {
 		if (likely(get_nulls_value(head) == st->bucket)) {
-			if (++st->bucket >= net->ct.htable_size)
+			if (++st->bucket >= nf_conntrack_htable_size)
 				return NULL;
 		}
 		head = rcu_dereference(
-			hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
+			hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket]));
 	}
 	return head;
 }
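
A note on the iterator above: hlist_nulls chains end in an encoded "nulls"
value rather than a plain NULL, and conntrack stores the bucket number
there. When ct_get_next() reaches the end of a chain,
get_nulls_value(head) == st->bucket confirms the walk finished in the
bucket it started in; on a mismatch (an entry was moved to another bucket
during the RCU walk) the same bucket's chain head is simply re-read. That
is how the lockless /proc walk stays correct against concurrent inserts
and, with this patch, against table-wide rehashing.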
80 changes: 40 additions & 40 deletions net/netfilter/nf_conntrack_core.c
@@ -69,6 +69,9 @@ EXPORT_SYMBOL_GPL(nf_conntrack_locks);
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
 EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
 
+struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
+EXPORT_SYMBOL_GPL(nf_conntrack_hash);
+
 static __read_mostly spinlock_t nf_conntrack_locks_all_lock;
 static __read_mostly seqcount_t nf_conntrack_generation;
 static __read_mostly bool nf_conntrack_locks_all;
@@ -164,9 +167,9 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
 		       tuple->dst.protonum));
 }
 
-static u32 hash_bucket(u32 hash, const struct net *net)
+static u32 scale_hash(u32 hash)
 {
-	return reciprocal_scale(hash, net->ct.htable_size);
+	return reciprocal_scale(hash, nf_conntrack_htable_size);
 }
 
 static u32 __hash_conntrack(const struct net *net,
@@ -179,7 +182,7 @@ static u32 hash_conntrack(const struct net *net,
 static u32 hash_conntrack(const struct net *net,
 			  const struct nf_conntrack_tuple *tuple)
 {
-	return __hash_conntrack(net, tuple, net->ct.htable_size);
+	return scale_hash(hash_conntrack_raw(tuple, net));
 }
 
 bool
@@ -478,8 +481,8 @@ ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
 begin:
 	do {
 		sequence = read_seqcount_begin(&nf_conntrack_generation);
-		bucket = hash_bucket(hash, net);
-		ct_hash = net->ct.hash;
+		bucket = scale_hash(hash);
+		ct_hash = nf_conntrack_hash;
 	} while (read_seqcount_retry(&nf_conntrack_generation, sequence));
 
 	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
@@ -543,12 +546,10 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct,
 				       unsigned int hash,
 				       unsigned int reply_hash)
 {
-	struct net *net = nf_ct_net(ct);
-
 	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
-			   &net->ct.hash[hash]);
+			   &nf_conntrack_hash[hash]);
 	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
-			   &net->ct.hash[reply_hash]);
+			   &nf_conntrack_hash[reply_hash]);
 }
 
 int
@@ -573,12 +574,12 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
 	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 
 	/* See if there's one in the list already, including reverse */
-	hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
+	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
 		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
 				    zone, net))
 			goto out;
 
-	hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
+	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
 		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
 				    zone, net))
 			goto out;
@@ -633,7 +634,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 		sequence = read_seqcount_begin(&nf_conntrack_generation);
 		/* reuse the hash saved before */
 		hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
-		hash = hash_bucket(hash, net);
+		hash = scale_hash(hash);
 		reply_hash = hash_conntrack(net,
 					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 
@@ -663,12 +664,12 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	/* See if there's one in the list already, including reverse:
 	   NAT could have grabbed it without realizing, since we're
 	   not in the hash.  If there is, we lost race. */
-	hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
+	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
 		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
 				    zone, net))
 			goto out;
 
-	hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
+	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
 		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
 				    zone, net))
 			goto out;
@@ -736,7 +737,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
 	do {
 		sequence = read_seqcount_begin(&nf_conntrack_generation);
 		hash = hash_conntrack(net, tuple);
-		ct_hash = net->ct.hash;
+		ct_hash = nf_conntrack_hash;
 	} while (read_seqcount_retry(&nf_conntrack_generation, sequence));
 
 	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
@@ -773,16 +774,16 @@ static noinline int early_drop(struct net *net, unsigned int _hash)
 	local_bh_disable();
 restart:
 	sequence = read_seqcount_begin(&nf_conntrack_generation);
-	hash = hash_bucket(_hash, net);
-	for (; i < net->ct.htable_size; i++) {
+	hash = scale_hash(_hash);
+	for (; i < nf_conntrack_htable_size; i++) {
 		lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS];
 		nf_conntrack_lock(lockp);
 		if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
 			spin_unlock(lockp);
 			goto restart;
 		}
-		hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
-					       hnnode) {
+		hlist_nulls_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash],
+					       hnnode) {
 			tmp = nf_ct_tuplehash_to_ctrack(h);
 			if (!test_bit(IPS_ASSURED_BIT, &tmp->status) &&
 			    !nf_ct_is_dying(tmp) &&
@@ -793,7 +794,7 @@ static noinline int early_drop(struct net *net, unsigned int _hash)
 			cnt++;
 		}
 
-		hash = (hash + 1) % net->ct.htable_size;
+		hash = (hash + 1) % nf_conntrack_htable_size;
 		spin_unlock(lockp);
 
 		if (ct || cnt >= NF_CT_EVICTION_RANGE)
@@ -1376,12 +1377,12 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
 	int cpu;
 	spinlock_t *lockp;
 
-	for (; *bucket < net->ct.htable_size; (*bucket)++) {
+	for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
 		lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
 		local_bh_disable();
 		nf_conntrack_lock(lockp);
-		if (*bucket < net->ct.htable_size) {
-			hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
+		if (*bucket < nf_conntrack_htable_size) {
+			hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) {
 				if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
 					continue;
 				ct = nf_ct_tuplehash_to_ctrack(h);
@@ -1478,6 +1479,8 @@ void nf_conntrack_cleanup_end(void)
 	while (untrack_refs() > 0)
 		schedule();
 
+	nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
+
 #ifdef CONFIG_NF_CONNTRACK_ZONES
 	nf_ct_extend_unregister(&nf_ct_zone_extend);
 #endif
@@ -1528,7 +1531,6 @@ void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
 	}
 
 	list_for_each_entry(net, net_exit_list, exit_list) {
-		nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
 		nf_conntrack_proto_pernet_fini(net);
 		nf_conntrack_helper_pernet_fini(net);
 		nf_conntrack_ecache_pernet_fini(net);
@@ -1599,22 +1601,22 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
 	 * though since that required taking the locks.
 	 */
 
-	for (i = 0; i < init_net.ct.htable_size; i++) {
-		while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
-			h = hlist_nulls_entry(init_net.ct.hash[i].first,
-					struct nf_conntrack_tuple_hash, hnnode);
+	for (i = 0; i < nf_conntrack_htable_size; i++) {
+		while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
+			h = hlist_nulls_entry(nf_conntrack_hash[i].first,
+					      struct nf_conntrack_tuple_hash, hnnode);
 			ct = nf_ct_tuplehash_to_ctrack(h);
 			hlist_nulls_del_rcu(&h->hnnode);
 			bucket = __hash_conntrack(nf_ct_net(ct),
 						  &h->tuple, hashsize);
 			hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
 		}
 	}
-	old_size = init_net.ct.htable_size;
-	old_hash = init_net.ct.hash;
+	old_size = nf_conntrack_htable_size;
+	old_hash = nf_conntrack_hash;
 
-	init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
-	init_net.ct.hash = hash;
+	nf_conntrack_hash = hash;
+	nf_conntrack_htable_size = hashsize;
 
 	write_seqcount_end(&nf_conntrack_generation);
 	nf_conntrack_all_unlock();
@@ -1670,6 +1672,11 @@ int nf_conntrack_init_start(void)
 		 * entries. */
 		max_factor = 4;
 	}
+
+	nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
+	if (!nf_conntrack_hash)
+		return -ENOMEM;
+
 	nf_conntrack_max = max_factor * nf_conntrack_htable_size;
 
 	printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n",
@@ -1748,6 +1755,7 @@ int nf_conntrack_init_start(void)
 err_acct:
 	nf_conntrack_expect_fini();
 err_expect:
+	nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
 	return ret;
 }
 
@@ -1800,12 +1808,6 @@ int nf_conntrack_init_net(struct net *net)
 		goto err_cache;
 	}
 
-	net->ct.htable_size = nf_conntrack_htable_size;
-	net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1);
-	if (!net->ct.hash) {
-		printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
-		goto err_hash;
-	}
 	ret = nf_conntrack_expect_pernet_init(net);
 	if (ret < 0)
 		goto err_expect;
@@ -1837,8 +1839,6 @@ int nf_conntrack_init_net(struct net *net)
 err_acct:
 	nf_conntrack_expect_pernet_fini(net);
 err_expect:
-	nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
-err_hash:
 	kmem_cache_destroy(net->ct.nf_conntrack_cachep);
 err_cache:
 	kfree(net->ct.slabname);
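
Two implementation details in this file are worth spelling out. First,
scale_hash() relies on reciprocal_scale(), which maps a 32-bit hash into
[0, nf_conntrack_htable_size) with a multiply and a shift instead of a
modulo. A standalone C equivalent of the kernel helper:

#include <stdint.h>

/* Map a 32-bit value into [0, ep_ro) without a division:
 * (val * ep_ro) / 2^32 lands uniformly in the target range. */
static inline uint32_t reciprocal_scale(uint32_t val, uint32_t ep_ro)
{
	return (uint32_t)(((uint64_t)val * ep_ro) >> 32);
}

Second, every reader samples the bucket count and table pointer inside a
read_seqcount_begin()/read_seqcount_retry() loop on
nf_conntrack_generation; nf_conntrack_set_hashsize() bumps that sequence
while swapping nf_conntrack_hash and nf_conntrack_htable_size, so a
lockless lookup that races with a resize simply retries with a consistent
pair.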
6 changes: 3 additions & 3 deletions net/netfilter/nf_conntrack_helper.c
@@ -424,10 +424,10 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
 		spin_unlock_bh(&pcpu->lock);
 	}
 	local_bh_disable();
-	for (i = 0; i < net->ct.htable_size; i++) {
+	for (i = 0; i < nf_conntrack_htable_size; i++) {
 		nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
-		if (i < net->ct.htable_size) {
-			hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
+		if (i < nf_conntrack_htable_size) {
+			hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode)
 				unhelp(h, me);
 		}
 		spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
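
The re-check of i against the table size after taking the bucket lock
(the same pattern appears in get_next_corpse() above and the ctnetlink
dump below) guards against a concurrent resize shrinking the table
between the loop condition and the lock acquisition: the locks are keyed
by i % CONNTRACK_LOCKS, which stays valid across a resize, but the bucket
index itself must be re-validated.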
8 changes: 4 additions & 4 deletions net/netfilter/nf_conntrack_netlink.c
@@ -824,16 +824,16 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
 	last = (struct nf_conn *)cb->args[1];
 
 	local_bh_disable();
-	for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {
+	for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) {
 restart:
 		lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS];
 		nf_conntrack_lock(lockp);
-		if (cb->args[0] >= net->ct.htable_size) {
+		if (cb->args[0] >= nf_conntrack_htable_size) {
 			spin_unlock(lockp);
 			goto out;
 		}
-		hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]],
-					   hnnode) {
+		hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[cb->args[0]],
+					   hnnode) {
 			if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
 				continue;
 			ct = nf_ct_tuplehash_to_ctrack(h);
13 changes: 5 additions & 8 deletions net/netfilter/nf_conntrack_standalone.c
@@ -54,14 +54,13 @@ struct ct_iter_state {
 
 static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
 {
-	struct net *net = seq_file_net(seq);
 	struct ct_iter_state *st = seq->private;
 	struct hlist_nulls_node *n;
 
 	for (st->bucket = 0;
-	     st->bucket < net->ct.htable_size;
+	     st->bucket < nf_conntrack_htable_size;
 	     st->bucket++) {
-		n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
+		n = rcu_dereference(hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket]));
 		if (!is_a_nulls(n))
 			return n;
 	}
@@ -71,18 +70,17 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
 static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
 					    struct hlist_nulls_node *head)
 {
-	struct net *net = seq_file_net(seq);
 	struct ct_iter_state *st = seq->private;
 
 	head = rcu_dereference(hlist_nulls_next_rcu(head));
 	while (is_a_nulls(head)) {
 		if (likely(get_nulls_value(head) == st->bucket)) {
-			if (++st->bucket >= net->ct.htable_size)
+			if (++st->bucket >= nf_conntrack_htable_size)
 				return NULL;
 		}
 		head = rcu_dereference(
 			hlist_nulls_first_rcu(
-				&net->ct.hash[st->bucket]));
+				&nf_conntrack_hash[st->bucket]));
 	}
 	return head;
 }
@@ -458,7 +456,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 	},
 	{
 		.procname	= "nf_conntrack_buckets",
-		.data		= &init_net.ct.htable_size,
+		.data		= &nf_conntrack_htable_size,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0444,
 		.proc_handler	= proc_dointvec,
@@ -512,7 +510,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 		goto out_kmemdup;
 
 	table[1].data = &net->ct.count;
-	table[2].data = &net->ct.htable_size;
 	table[3].data = &net->ct.sysctl_checksum;
 	table[4].data = &net->ct.sysctl_log_invalid;
 