Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf
Pablo Neira Ayuso says:

====================
Netfilter/IPVS fixes for net

1) ipset limits the max allocatable memory via kvmalloc() to MAX_INT,
   from Jozsef Kadlecsik.

2) Check ip_vs_conn_tab_bits value to be in the range specified
   in Kconfig, from Andrea Claudi.

3) Initialize fragment offset in ip6tables, from Jeremy Sowden.

4) Make conntrack hash chain length random, from Florian Westphal.

5) Add zone ID to conntrack and NAT hashtuple again, also from Florian.

6) Add selftests for bidirectional zone support and colliding tuples,
   from Florian Westphal.

7) Unlink table before synchronize_rcu when cleaning tables with
   owner, from Florian.

8) nf_tables limits the max allocatable memory via kvmalloc() to MAX_INT.

9) Release conntrack entries via workqueue in masquerade, from Florian.

10) Fix bogus net_init in iptables raw table definition, also from Florian.

11) Work around missing softdep in log extensions, from Florian Westphal.

12) Serialize hash resizes and cleanups with mutex, from Eric Dumazet.

* git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf:
  netfilter: conntrack: serialize hash resizes and cleanups
  netfilter: log: work around missing softdep backend module
  netfilter: iptable_raw: drop bogus net_init annotation
  netfilter: nf_nat_masquerade: defer conntrack walk to work queue
  netfilter: nf_nat_masquerade: make async masq_inet6_event handling generic
  netfilter: nf_tables: Fix oversized kvmalloc() calls
  netfilter: nf_tables: unlink table before deleting it
  selftests: netfilter: add zone stress test with colliding tuples
  selftests: netfilter: add selftest for directional zone support
  netfilter: nat: include zone id in nat table hash again
  netfilter: conntrack: include zone id in tuple hash again
  netfilter: conntrack: make max chain length random
  netfilter: ip6_tables: zero-initialize fragment offset
  ipvs: check that ip_vs_conn_tab_bits is between 8 and 20
  netfilter: ipset: Fix oversized kvmalloc() calls
====================

Link: https://lore.kernel.org/r/20210924221113.348767-1-pablo@netfilter.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Jakub Kicinski committed Sep 25, 2021
2 parents 4526fe7 + e9edc18 commit 7fe7f31
Showing 13 changed files with 735 additions and 147 deletions.
2 changes: 1 addition & 1 deletion net/ipv4/netfilter/iptable_raw.c
@@ -42,7 +42,7 @@ iptable_raw_hook(void *priv, struct sk_buff *skb,

static struct nf_hook_ops *rawtable_ops __read_mostly;

-static int __net_init iptable_raw_table_init(struct net *net)
+static int iptable_raw_table_init(struct net *net)
{
struct ipt_replace *repl;
const struct xt_table *table = &packet_raw;
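
For illustration: iptable_raw_table_init() can now run on demand, long after boot, when the raw table is first instantiated in a namespace. With CONFIG_NET_NS=n the __net_init marker collapses to __init, so the function's text would already have been freed by then. A rough model of that convention (an illustration only, not the exact kernel headers):

    /* Rough model of the __net_init convention; the real definitions live in
     * include/net/net_namespace.h. With CONFIG_NET_NS=n, __net_init code is
     * treated as boot-time-only code and discarded once boot completes, so it
     * must never be reachable from a later, on-demand path such as lazy table
     * instantiation -- hence the annotation is dropped here. */
    #ifdef CONFIG_NET_NS
    # define __net_init              /* kept for the lifetime of the kernel */
    #else
    # define __net_init __init       /* discarded after boot */
    #endif
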
1 change: 1 addition & 0 deletions net/ipv6/netfilter/ip6_tables.c
@@ -273,6 +273,7 @@ ip6t_do_table(struct sk_buff *skb,
* things we don't know, ie. tcp syn flag or ports). If the
* rule is also a fragment-specific rule, non-fragments won't
* match it. */
+acpar.fragoff = 0;
acpar.hotdrop = false;
acpar.state = state;

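
For illustration: acpar lives on the stack, so before this change acpar.fragoff kept whatever a previous caller left behind, and a fragment-specific rule could fire on a non-fragmented IPv6 packet. A small userspace model of the hazard and the fix (the struct and names are illustrative, not the kernel's struct xt_action_param):

    #include <stdbool.h>
    #include <stdio.h>

    struct action_param {              /* stand-in for struct xt_action_param */
        unsigned int fragoff;          /* 0 means "not a fragment" */
        bool hotdrop;
    };

    /* a fragment-only rule must never match a non-fragment */
    static bool frag_rule_matches(const struct action_param *par)
    {
        return par->fragoff != 0;
    }

    int main(void)
    {
        struct action_param par = { .fragoff = 0x88, .hotdrop = false }; /* simulated stale stack data */

        /* the fix: reset the offset for every packet before running matches */
        par.fragoff = 0;
        printf("matches non-fragment: %s\n",
               frag_rule_matches(&par) ? "yes (bug)" : "no (correct)");
        return 0;
    }
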
4 changes: 2 additions & 2 deletions net/netfilter/ipset/ip_set_hash_gen.h
@@ -130,11 +130,11 @@ htable_size(u8 hbits)
{
size_t hsize;

-/* We must fit both into u32 in jhash and size_t */
+/* We must fit both into u32 in jhash and INT_MAX in kvmalloc_node() */
if (hbits > 31)
return 0;
hsize = jhash_size(hbits);
-if ((((size_t)-1) - sizeof(struct htable)) / sizeof(struct hbucket *)
+if ((INT_MAX - sizeof(struct htable)) / sizeof(struct hbucket *)
< hsize)
return 0;

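
For illustration: kvmalloc() refuses allocations larger than INT_MAX, so the size computation now rejects bucket counts that would pass the old SIZE_MAX check but still exceed the allocator limit. A userspace sketch of the same guard (struct sizes are placeholders, not the kernel layout):

    #include <limits.h>
    #include <stddef.h>
    #include <stdio.h>

    struct htable  { size_t opaque[4]; };   /* placeholder header */
    struct hbucket { size_t opaque[2]; };   /* placeholder bucket */

    /* Byte size of a table with 2^hbits buckets, or 0 if it cannot be
     * allocated: hbits must fit u32 for jhash, and the total must stay
     * under INT_MAX for kvmalloc(). */
    static size_t htable_size(unsigned char hbits)
    {
        size_t hsize;

        if (hbits > 31)
            return 0;
        hsize = (size_t)1 << hbits;           /* jhash_size(hbits) */
        if ((INT_MAX - sizeof(struct htable)) / sizeof(struct hbucket *) < hsize)
            return 0;

        return hsize * sizeof(struct hbucket *) + sizeof(struct htable);
    }

    int main(void)
    {
        printf("hbits=20 -> %zu bytes\n", htable_size(20));  /* fits easily */
        printf("hbits=31 -> %zu bytes\n", htable_size(31));  /* 0: over INT_MAX */
        return 0;
    }
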
4 changes: 4 additions & 0 deletions net/netfilter/ipvs/ip_vs_conn.c
@@ -1468,6 +1468,10 @@ int __init ip_vs_conn_init(void)
int idx;

/* Compute size and mask */
+if (ip_vs_conn_tab_bits < 8 || ip_vs_conn_tab_bits > 20) {
+pr_info("conn_tab_bits not in [8, 20]. Using default value\n");
+ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
+}
ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;

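
For illustration: ip_vs_conn_tab_bits is a module parameter; out-of-range values are now pushed back to the Kconfig default before the table size and mask are derived from it. A minimal model (the default of 12 is an assumption based on the usual Kconfig value):

    #include <stdio.h>

    #define CONFIG_IP_VS_TAB_BITS 12          /* assumed Kconfig default */

    static int ip_vs_conn_tab_bits = 27;      /* bogus value passed at load time */

    int main(void)
    {
        if (ip_vs_conn_tab_bits < 8 || ip_vs_conn_tab_bits > 20) {
            fprintf(stderr, "conn_tab_bits not in [8, 20]. Using default value\n");
            ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
        }

        unsigned int tab_size = 1u << ip_vs_conn_tab_bits;  /* number of buckets */
        unsigned int tab_mask = tab_size - 1;               /* bucket index mask */
        printf("size=%u mask=%#x\n", tab_size, tab_mask);
        return 0;
    }
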
154 changes: 100 additions & 54 deletions net/netfilter/nf_conntrack_core.c
@@ -74,10 +74,14 @@ static __read_mostly struct kmem_cache *nf_conntrack_cachep;
static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;

+/* serialize hash resizes and nf_ct_iterate_cleanup */
+static DEFINE_MUTEX(nf_conntrack_mutex);

#define GC_SCAN_INTERVAL (120u * HZ)
#define GC_SCAN_MAX_DURATION msecs_to_jiffies(10)

-#define MAX_CHAINLEN 64u
+#define MIN_CHAINLEN 8u
+#define MAX_CHAINLEN (32u - MIN_CHAINLEN)

static struct conntrack_gc_work conntrack_gc_work;

@@ -188,11 +192,13 @@ seqcount_spinlock_t nf_conntrack_generation __read_mostly;
static siphash_key_t nf_conntrack_hash_rnd __read_mostly;

static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
+unsigned int zoneid,
const struct net *net)
{
struct {
struct nf_conntrack_man src;
union nf_inet_addr dst_addr;
+unsigned int zone;
u32 net_mix;
u16 dport;
u16 proto;
@@ -205,6 +211,7 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
/* The direction must be ignored, so handle usable members manually. */
combined.src = tuple->src;
combined.dst_addr = tuple->dst.u3;
+combined.zone = zoneid;
combined.net_mix = net_hash_mix(net);
combined.dport = (__force __u16)tuple->dst.u.all;
combined.proto = tuple->dst.protonum;
@@ -219,15 +226,17 @@ static u32 scale_hash(u32 hash)

static u32 __hash_conntrack(const struct net *net,
const struct nf_conntrack_tuple *tuple,
+unsigned int zoneid,
unsigned int size)
{
-return reciprocal_scale(hash_conntrack_raw(tuple, net), size);
+return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size);
}

static u32 hash_conntrack(const struct net *net,
-const struct nf_conntrack_tuple *tuple)
+const struct nf_conntrack_tuple *tuple,
+unsigned int zoneid)
{
-return scale_hash(hash_conntrack_raw(tuple, net));
+return scale_hash(hash_conntrack_raw(tuple, zoneid, net));
}

static bool nf_ct_get_tuple_ports(const struct sk_buff *skb,
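
For illustration: folding the zone id back into the keyed tuple hash means two connections that differ only in their conntrack zone spread across different buckets instead of colliding systematically. A toy sketch of the idea (the kernel uses siphash over the real tuple; the struct and hash below are illustrative only):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct tuple_key {                 /* illustrative, not nf_conntrack_tuple */
        uint32_t src_ip, dst_ip;
        uint16_t sport, dport;
        uint32_t zone;                 /* the re-added ingredient */
        uint32_t net_mix;              /* per-netns salt */
    };

    /* keyed toy hash; the kernel uses siphash with a random key */
    static uint32_t toy_hash(const void *data, size_t len, uint32_t key)
    {
        const unsigned char *p = data;
        uint32_t h = key;

        while (len--)
            h = h * 31 + *p++;
        return h;
    }

    int main(void)
    {
        struct tuple_key a = { 0x0a000001, 0x0a000002, 1234, 80, 1, 42 };
        struct tuple_key b = a;

        b.zone = 2;    /* identical 5-tuple, different zone: different bucket */
        printf("zone 1 -> %#x\n", toy_hash(&a, sizeof(a), 0xfeed));
        printf("zone 2 -> %#x\n", toy_hash(&b, sizeof(b), 0xfeed));
        return 0;
    }
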
@@ -650,9 +659,11 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct)
do {
sequence = read_seqcount_begin(&nf_conntrack_generation);
hash = hash_conntrack(net,
-&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
reply_hash = hash_conntrack(net,
-&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

clean_from_lists(ct);
@@ -819,8 +830,20 @@ struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple)
{
-return __nf_conntrack_find_get(net, zone, tuple,
-hash_conntrack_raw(tuple, net));
+unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
+struct nf_conntrack_tuple_hash *thash;
+
+thash = __nf_conntrack_find_get(net, zone, tuple,
+hash_conntrack_raw(tuple, zone_id, net));
+
+if (thash)
+return thash;
+
+rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
+if (rid != zone_id)
+return __nf_conntrack_find_get(net, zone, tuple,
+hash_conntrack_raw(tuple, rid, net));
+return thash;
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);

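
For illustration: because the original and reply directions of a zone can carry different ids, a lookup first probes with the ORIGINAL-direction zone id and only retries with the REPLY-direction id when the two differ. A generic userspace model of that fallback (names and keys are illustrative):

    #include <stddef.h>
    #include <stdio.h>

    struct entry { unsigned int key; const char *name; };

    static const struct entry table[] = {
        { 0x11, "entry hashed with the original-direction zone" },
        { 0x22, "entry hashed with the reply-direction zone" },
    };

    static const struct entry *find(unsigned int key)
    {
        for (size_t i = 0; i < sizeof(table) / sizeof(table[0]); i++)
            if (table[i].key == key)
                return &table[i];
        return NULL;
    }

    /* probe with the original-direction key, fall back to the reply key */
    static const struct entry *find_get(unsigned int orig_key, unsigned int reply_key)
    {
        const struct entry *e = find(orig_key);

        if (e || reply_key == orig_key)
            return e;
        return find(reply_key);
    }

    int main(void)
    {
        const struct entry *e = find_get(0x99, 0x22);

        printf("%s\n", e ? e->name : "not found");
        return 0;
    }
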
@@ -842,6 +865,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
unsigned int hash, reply_hash;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
+unsigned int max_chainlen;
unsigned int chainlen = 0;
unsigned int sequence;
int err = -EEXIST;
@@ -852,18 +876,22 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
do {
sequence = read_seqcount_begin(&nf_conntrack_generation);
hash = hash_conntrack(net,
-&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
reply_hash = hash_conntrack(net,
-&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

+max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN);

/* See if there's one in the list already, including reverse */
hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
zone, net))
goto out;

-if (chainlen++ > MAX_CHAINLEN)
+if (chainlen++ > max_chainlen)
goto chaintoolong;
}

@@ -873,7 +901,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
zone, net))
goto out;
-if (chainlen++ > MAX_CHAINLEN)
+if (chainlen++ > max_chainlen)
goto chaintoolong;
}

@@ -1103,8 +1131,8 @@ nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
+unsigned int chainlen = 0, sequence, max_chainlen;
const struct nf_conntrack_zone *zone;
-unsigned int chainlen = 0, sequence;
unsigned int hash, reply_hash;
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
@@ -1133,8 +1161,8 @@ __nf_conntrack_confirm(struct sk_buff *skb)
hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
hash = scale_hash(hash);
reply_hash = hash_conntrack(net,
-&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
-
+&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

/* We're not in hash table, and we refuse to set up related
@@ -1168,14 +1196,15 @@ __nf_conntrack_confirm(struct sk_buff *skb)
goto dying;
}

+max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN);
/* See if there's one in the list already, including reverse:
NAT could have grabbed it without realizing, since we're
not in the hash. If there is, we lost race. */
hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
zone, net))
goto out;
-if (chainlen++ > MAX_CHAINLEN)
+if (chainlen++ > max_chainlen)
goto chaintoolong;
}

@@ -1184,7 +1213,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
zone, net))
goto out;
-if (chainlen++ > MAX_CHAINLEN) {
+if (chainlen++ > max_chainlen) {
chaintoolong:
nf_ct_add_to_dying_list(ct);
NF_CT_STAT_INC(net, chaintoolong);
@@ -1246,7 +1275,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
rcu_read_lock();
begin:
nf_conntrack_get_ht(&ct_hash, &hsize);
-hash = __hash_conntrack(net, tuple, hsize);
+hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize);

hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
ct = nf_ct_tuplehash_to_ctrack(h);
@@ -1687,8 +1716,8 @@ resolve_normal_ct(struct nf_conn *tmpl,
struct nf_conntrack_tuple_hash *h;
enum ip_conntrack_info ctinfo;
struct nf_conntrack_zone tmp;
+u32 hash, zone_id, rid;
struct nf_conn *ct;
-u32 hash;

if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
dataoff, state->pf, protonum, state->net,
@@ -1699,8 +1728,20 @@ resolve_normal_ct(struct nf_conn *tmpl,

/* look for tuple match */
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
-hash = hash_conntrack_raw(&tuple, state->net);
+
+zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
+hash = hash_conntrack_raw(&tuple, zone_id, state->net);
h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);
+
+if (!h) {
+rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
+if (zone_id != rid) {
+u32 tmp = hash_conntrack_raw(&tuple, rid, state->net);
+
+h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp);
+}
+}

if (!h) {
h = init_conntrack(state->net, tmpl, &tuple,
skb, dataoff, hash);
@@ -2225,28 +2266,31 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
spinlock_t *lockp;

for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
+struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket];
+
+if (hlist_nulls_empty(hslot))
+continue;
+
lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
local_bh_disable();
nf_conntrack_lock(lockp);
-if (*bucket < nf_conntrack_htable_size) {
-hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) {
-if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
-continue;
-/* All nf_conn objects are added to hash table twice, one
-* for original direction tuple, once for the reply tuple.
-*
-* Exception: In the IPS_NAT_CLASH case, only the reply
-* tuple is added (the original tuple already existed for
-* a different object).
-*
-* We only need to call the iterator once for each
-* conntrack, so we just use the 'reply' direction
-* tuple while iterating.
-*/
-ct = nf_ct_tuplehash_to_ctrack(h);
-if (iter(ct, data))
-goto found;
-}
+hlist_nulls_for_each_entry(h, n, hslot, hnnode) {
+if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
+continue;
+/* All nf_conn objects are added to hash table twice, one
+* for original direction tuple, once for the reply tuple.
+*
+* Exception: In the IPS_NAT_CLASH case, only the reply
+* tuple is added (the original tuple already existed for
+* a different object).
+*
+* We only need to call the iterator once for each
+* conntrack, so we just use the 'reply' direction
+* tuple while iterating.
+*/
+ct = nf_ct_tuplehash_to_ctrack(h);
+if (iter(ct, data))
+goto found;
+}
spin_unlock(lockp);
local_bh_enable();
@@ -2264,26 +2308,20 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
void *data, u32 portid, int report)
{
-unsigned int bucket = 0, sequence;
+unsigned int bucket = 0;
struct nf_conn *ct;

might_sleep();

-for (;;) {
-sequence = read_seqcount_begin(&nf_conntrack_generation);
-
-while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
-/* Time to push up daises... */
+mutex_lock(&nf_conntrack_mutex);
+while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
+/* Time to push up daises... */

-nf_ct_delete(ct, portid, report);
-nf_ct_put(ct);
-cond_resched();
-}
-
-if (!read_seqcount_retry(&nf_conntrack_generation, sequence))
-break;
-bucket = 0;
+nf_ct_delete(ct, portid, report);
+nf_ct_put(ct);
+cond_resched();
+}
+mutex_unlock(&nf_conntrack_mutex);
}

struct iter_data {
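
For illustration: the old code walked the table under a seqcount and restarted from bucket 0 whenever a resize raced with it; now cleanup and resize both take nf_conntrack_mutex, so a walk visits each bucket exactly once. A userspace model of the serialization (a pthread mutex stands in for the kernel mutex; names are illustrative):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t table_mutex = PTHREAD_MUTEX_INITIALIZER;
    static unsigned int table_size = 1024;

    static void *resize_thread(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&table_mutex);   /* resize excludes any cleanup walk */
        table_size *= 2;                    /* rehash happens here */
        pthread_mutex_unlock(&table_mutex);
        return NULL;
    }

    static void cleanup_walk(void)
    {
        pthread_mutex_lock(&table_mutex);   /* no resize can change table_size now */
        for (unsigned int bucket = 0; bucket < table_size; bucket++)
            ;                               /* visit each bucket exactly once */
        pthread_mutex_unlock(&table_mutex);
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, resize_thread, NULL);
        cleanup_walk();
        pthread_join(t, NULL);
        printf("final table_size=%u\n", table_size);
        return 0;
    }
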
@@ -2519,8 +2557,10 @@ int nf_conntrack_hash_resize(unsigned int hashsize)
if (!hash)
return -ENOMEM;

+mutex_lock(&nf_conntrack_mutex);
old_size = nf_conntrack_htable_size;
if (old_size == hashsize) {
+mutex_unlock(&nf_conntrack_mutex);
kvfree(hash);
return 0;
}
@@ -2537,12 +2577,16 @@ int nf_conntrack_hash_resize(unsigned int hashsize)

for (i = 0; i < nf_conntrack_htable_size; i++) {
while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
+unsigned int zone_id;

h = hlist_nulls_entry(nf_conntrack_hash[i].first,
struct nf_conntrack_tuple_hash, hnnode);
ct = nf_ct_tuplehash_to_ctrack(h);
hlist_nulls_del_rcu(&h->hnnode);

+zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h));
bucket = __hash_conntrack(nf_ct_net(ct),
-&h->tuple, hashsize);
+&h->tuple, zone_id, hashsize);
hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
}
}
@@ -2556,6 +2600,8 @@ int nf_conntrack_hash_resize(unsigned int hashsize)
nf_conntrack_all_unlock();
local_bh_enable();

+mutex_unlock(&nf_conntrack_mutex);

synchronize_net();
kvfree(old_hash);
return 0;
