Skip to content

Commit

Permalink
netfilter: nf_conntrack: fix hash resizing with namespaces
Browse files Browse the repository at this point in the history
As noticed by Jon Masters <jonathan@jonmasters.org>, the conntrack hash
size is global and not per namespace, but modifiable at runtime through
/sys/module/nf_conntrack/hashsize. Changing the hash size will only
resize the hash in the current namespace however, so other namespaces
will use an invalid hash size. This can cause crashes when enlarging
the hashsize, or false negative lookups when shrinking it.

Move the hash size into the per-namespace data and only use the global
hash size to initialize the per-namespace value when instanciating a
new namespace. Additionally restrict hash resizing to init_net for
now as other namespaces are not handled currently.

Cc: stable@kernel.org
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
Patrick McHardy authored and David S. Miller committed Feb 8, 2010
1 parent 14c7dbe commit d696c7b
Show file tree
Hide file tree
Showing 10 changed files with 49 additions and 47 deletions.
1 change: 1 addition & 0 deletions include/net/netns/conntrack.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ struct nf_conntrack_ecache;
struct netns_ct {
atomic_t count;
unsigned int expect_count;
unsigned int htable_size;
struct kmem_cache *nf_conntrack_cachep;
struct hlist_nulls_head *hash;
struct hlist_head *expect_hash;
Expand Down
1 change: 1 addition & 0 deletions include/net/netns/ipv4.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ struct netns_ipv4 {
struct xt_table *iptable_security;
struct xt_table *nat_table;
struct hlist_head *nat_bysource;
unsigned int nat_htable_size;
int nat_vmalloced;
#endif

Expand Down
2 changes: 1 addition & 1 deletion net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ static ctl_table ip_ct_sysctl_table[] = {
},
{
.procname = "ip_conntrack_buckets",
.data = &nf_conntrack_htable_size,
.data = &init_net.ct.htable_size,
.maxlen = sizeof(unsigned int),
.mode = 0444,
.proc_handler = proc_dointvec,
Expand Down
4 changes: 2 additions & 2 deletions net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
struct hlist_nulls_node *n;

for (st->bucket = 0;
st->bucket < nf_conntrack_htable_size;
st->bucket < net->ct.htable_size;
st->bucket++) {
n = rcu_dereference(net->ct.hash[st->bucket].first);
if (!is_a_nulls(n))
Expand All @@ -50,7 +50,7 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
head = rcu_dereference(head->next);
while (is_a_nulls(head)) {
if (likely(get_nulls_value(head) == st->bucket)) {
if (++st->bucket >= nf_conntrack_htable_size)
if (++st->bucket >= net->ct.htable_size)
return NULL;
}
head = rcu_dereference(net->ct.hash[st->bucket].first);
Expand Down
22 changes: 9 additions & 13 deletions net/ipv4/netfilter/nf_nat_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,6 @@ static DEFINE_SPINLOCK(nf_nat_lock);

static struct nf_conntrack_l3proto *l3proto __read_mostly;

/* Calculated at init based on memory size */
static unsigned int nf_nat_htable_size __read_mostly;

#define MAX_IP_NAT_PROTO 256
static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO]
__read_mostly;
Expand Down Expand Up @@ -72,15 +69,15 @@ EXPORT_SYMBOL_GPL(nf_nat_proto_put);

/* We keep an extra hash for each conntrack, for fast searching. */
static inline unsigned int
hash_by_src(const struct nf_conntrack_tuple *tuple)
hash_by_src(const struct net *net, const struct nf_conntrack_tuple *tuple)
{
unsigned int hash;

/* Original src, to ensure we map it consistently if poss. */
hash = jhash_3words((__force u32)tuple->src.u3.ip,
(__force u32)tuple->src.u.all,
tuple->dst.protonum, 0);
return ((u64)hash * nf_nat_htable_size) >> 32;
return ((u64)hash * net->ipv4.nat_htable_size) >> 32;
}

/* Is this tuple already taken? (not by us) */
Expand Down Expand Up @@ -147,7 +144,7 @@ find_appropriate_src(struct net *net,
struct nf_conntrack_tuple *result,
const struct nf_nat_range *range)
{
unsigned int h = hash_by_src(tuple);
unsigned int h = hash_by_src(net, tuple);
const struct nf_conn_nat *nat;
const struct nf_conn *ct;
const struct hlist_node *n;
Expand Down Expand Up @@ -330,7 +327,7 @@ nf_nat_setup_info(struct nf_conn *ct,
if (have_to_hash) {
unsigned int srchash;

srchash = hash_by_src(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
srchash = hash_by_src(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
spin_lock_bh(&nf_nat_lock);
/* nf_conntrack_alter_reply might re-allocate exntension aera */
nat = nfct_nat(ct);
Expand Down Expand Up @@ -679,8 +676,10 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,

static int __net_init nf_nat_net_init(struct net *net)
{
net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size,
&net->ipv4.nat_vmalloced, 0);
/* Leave them the same for the moment. */
net->ipv4.nat_htable_size = net->ct.htable_size;
net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size,
&net->ipv4.nat_vmalloced, 0);
if (!net->ipv4.nat_bysource)
return -ENOMEM;
return 0;
Expand All @@ -703,7 +702,7 @@ static void __net_exit nf_nat_net_exit(struct net *net)
nf_ct_iterate_cleanup(net, &clean_nat, NULL);
synchronize_rcu();
nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced,
nf_nat_htable_size);
net->ipv4.nat_htable_size);
}

static struct pernet_operations nf_nat_net_ops = {
Expand All @@ -724,9 +723,6 @@ static int __init nf_nat_init(void)
return ret;
}

/* Leave them the same for the moment. */
nf_nat_htable_size = nf_conntrack_htable_size;

ret = register_pernet_subsys(&nf_nat_net_ops);
if (ret < 0)
goto cleanup_extend;
Expand Down
53 changes: 28 additions & 25 deletions net/netfilter/nf_conntrack_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <linux/mm.h>
#include <linux/nsproxy.h>
#include <linux/rculist_nulls.h>

#include <net/netfilter/nf_conntrack.h>
Expand Down Expand Up @@ -84,9 +85,10 @@ static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
return ((u64)h * size) >> 32;
}

static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
static inline u_int32_t hash_conntrack(const struct net *net,
const struct nf_conntrack_tuple *tuple)
{
return __hash_conntrack(tuple, nf_conntrack_htable_size,
return __hash_conntrack(tuple, net->ct.htable_size,
nf_conntrack_hash_rnd);
}

Expand Down Expand Up @@ -294,7 +296,7 @@ __nf_conntrack_find(struct net *net, const struct nf_conntrack_tuple *tuple)
{
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
unsigned int hash = hash_conntrack(tuple);
unsigned int hash = hash_conntrack(net, tuple);

/* Disable BHs the entire time since we normally need to disable them
* at least once for the stats anyway.
Expand Down Expand Up @@ -364,10 +366,11 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct,

void nf_conntrack_hash_insert(struct nf_conn *ct)
{
struct net *net = nf_ct_net(ct);
unsigned int hash, repl_hash;

hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
repl_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

__nf_conntrack_hash_insert(ct, hash, repl_hash);
}
Expand Down Expand Up @@ -395,8 +398,8 @@ __nf_conntrack_confirm(struct sk_buff *skb)
if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
return NF_ACCEPT;

hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
repl_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

/* We're not in hash table, and we refuse to set up related
connections for unconfirmed conns. But packet copies and
Expand Down Expand Up @@ -466,7 +469,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
struct net *net = nf_ct_net(ignored_conntrack);
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
unsigned int hash = hash_conntrack(tuple);
unsigned int hash = hash_conntrack(net, tuple);

/* Disable BHs the entire time since we need to disable them at
* least once for the stats anyway.
Expand Down Expand Up @@ -501,7 +504,7 @@ static noinline int early_drop(struct net *net, unsigned int hash)
int dropped = 0;

rcu_read_lock();
for (i = 0; i < nf_conntrack_htable_size; i++) {
for (i = 0; i < net->ct.htable_size; i++) {
hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
hnnode) {
tmp = nf_ct_tuplehash_to_ctrack(h);
Expand All @@ -521,7 +524,7 @@ static noinline int early_drop(struct net *net, unsigned int hash)
if (cnt >= NF_CT_EVICTION_RANGE)
break;

hash = (hash + 1) % nf_conntrack_htable_size;
hash = (hash + 1) % net->ct.htable_size;
}
rcu_read_unlock();

Expand Down Expand Up @@ -555,7 +558,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net,

if (nf_conntrack_max &&
unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
unsigned int hash = hash_conntrack(orig);
unsigned int hash = hash_conntrack(net, orig);
if (!early_drop(net, hash)) {
atomic_dec(&net->ct.count);
if (net_ratelimit())
Expand Down Expand Up @@ -1012,7 +1015,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
struct hlist_nulls_node *n;

spin_lock_bh(&nf_conntrack_lock);
for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
for (; *bucket < net->ct.htable_size; (*bucket)++) {
hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
ct = nf_ct_tuplehash_to_ctrack(h);
if (iter(ct, data))
Expand Down Expand Up @@ -1130,7 +1133,7 @@ static void nf_conntrack_cleanup_net(struct net *net)
}

nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
nf_conntrack_htable_size);
net->ct.htable_size);
nf_conntrack_ecache_fini(net);
nf_conntrack_acct_fini(net);
nf_conntrack_expect_fini(net);
Expand Down Expand Up @@ -1190,10 +1193,12 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
{
int i, bucket, vmalloced, old_vmalloced;
unsigned int hashsize, old_size;
int rnd;
struct hlist_nulls_head *hash, *old_hash;
struct nf_conntrack_tuple_hash *h;

if (current->nsproxy->net_ns != &init_net)
return -EOPNOTSUPP;

/* On boot, we can set this without any fancy locking. */
if (!nf_conntrack_htable_size)
return param_set_uint(val, kp);
Expand All @@ -1206,33 +1211,29 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
if (!hash)
return -ENOMEM;

/* We have to rehahs for the new table anyway, so we also can
* use a newrandom seed */
get_random_bytes(&rnd, sizeof(rnd));

/* Lookups in the old hash might happen in parallel, which means we
* might get false negatives during connection lookup. New connections
* created because of a false negative won't make it into the hash
* though since that required taking the lock.
*/
spin_lock_bh(&nf_conntrack_lock);
for (i = 0; i < nf_conntrack_htable_size; i++) {
for (i = 0; i < init_net.ct.htable_size; i++) {
while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
h = hlist_nulls_entry(init_net.ct.hash[i].first,
struct nf_conntrack_tuple_hash, hnnode);
hlist_nulls_del_rcu(&h->hnnode);
bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
bucket = __hash_conntrack(&h->tuple, hashsize,
nf_conntrack_hash_rnd);
hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
}
}
old_size = nf_conntrack_htable_size;
old_size = init_net.ct.htable_size;
old_vmalloced = init_net.ct.hash_vmalloc;
old_hash = init_net.ct.hash;

nf_conntrack_htable_size = hashsize;
init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
init_net.ct.hash_vmalloc = vmalloced;
init_net.ct.hash = hash;
nf_conntrack_hash_rnd = rnd;
spin_unlock_bh(&nf_conntrack_lock);

nf_ct_free_hashtable(old_hash, old_vmalloced, old_size);
Expand Down Expand Up @@ -1328,7 +1329,9 @@ static int nf_conntrack_init_net(struct net *net)
ret = -ENOMEM;
goto err_cache;
}
net->ct.hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size,

net->ct.htable_size = nf_conntrack_htable_size;
net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size,
&net->ct.hash_vmalloc, 1);
if (!net->ct.hash) {
ret = -ENOMEM;
Expand All @@ -1353,7 +1356,7 @@ static int nf_conntrack_init_net(struct net *net)
nf_conntrack_expect_fini(net);
err_expect:
nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
nf_conntrack_htable_size);
net->ct.htable_size);
err_hash:
kmem_cache_destroy(net->ct.nf_conntrack_cachep);
err_cache:
Expand Down
2 changes: 1 addition & 1 deletion net/netfilter/nf_conntrack_expect.c
Original file line number Diff line number Diff line change
Expand Up @@ -577,7 +577,7 @@ int nf_conntrack_expect_init(struct net *net)

if (net_eq(net, &init_net)) {
if (!nf_ct_expect_hsize) {
nf_ct_expect_hsize = nf_conntrack_htable_size / 256;
nf_ct_expect_hsize = net->ct.htable_size / 256;
if (!nf_ct_expect_hsize)
nf_ct_expect_hsize = 1;
}
Expand Down
2 changes: 1 addition & 1 deletion net/netfilter/nf_conntrack_helper.c
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
/* Get rid of expecteds, set helpers to NULL. */
hlist_nulls_for_each_entry(h, nn, &net->ct.unconfirmed, hnnode)
unhelp(h, me);
for (i = 0; i < nf_conntrack_htable_size; i++) {
for (i = 0; i < net->ct.htable_size; i++) {
hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
unhelp(h, me);
}
Expand Down
2 changes: 1 addition & 1 deletion net/netfilter/nf_conntrack_netlink.c
Original file line number Diff line number Diff line change
Expand Up @@ -594,7 +594,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)

rcu_read_lock();
last = (struct nf_conn *)cb->args[1];
for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) {
for (; cb->args[0] < init_net.ct.htable_size; cb->args[0]++) {
restart:
hlist_nulls_for_each_entry_rcu(h, n, &init_net.ct.hash[cb->args[0]],
hnnode) {
Expand Down
7 changes: 4 additions & 3 deletions net/netfilter/nf_conntrack_standalone.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
struct hlist_nulls_node *n;

for (st->bucket = 0;
st->bucket < nf_conntrack_htable_size;
st->bucket < net->ct.htable_size;
st->bucket++) {
n = rcu_dereference(net->ct.hash[st->bucket].first);
if (!is_a_nulls(n))
Expand All @@ -69,7 +69,7 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
head = rcu_dereference(head->next);
while (is_a_nulls(head)) {
if (likely(get_nulls_value(head) == st->bucket)) {
if (++st->bucket >= nf_conntrack_htable_size)
if (++st->bucket >= net->ct.htable_size)
return NULL;
}
head = rcu_dereference(net->ct.hash[st->bucket].first);
Expand Down Expand Up @@ -355,7 +355,7 @@ static ctl_table nf_ct_sysctl_table[] = {
},
{
.procname = "nf_conntrack_buckets",
.data = &nf_conntrack_htable_size,
.data = &init_net.ct.htable_size,
.maxlen = sizeof(unsigned int),
.mode = 0444,
.proc_handler = proc_dointvec,
Expand Down Expand Up @@ -421,6 +421,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
goto out_kmemdup;

table[1].data = &net->ct.count;
table[2].data = &net->ct.htable_size;
table[3].data = &net->ct.sysctl_checksum;
table[4].data = &net->ct.sysctl_log_invalid;

Expand Down

0 comments on commit d696c7b

Please sign in to comment.