Skip to content

Commit

Permalink
netfilter: conntrack: get rid of conntrack timer
Browse files Browse the repository at this point in the history
With stats enabled this eats 80 bytes on x86_64 per nf_conn entry, as
Eric Dumazet pointed out during netfilter workshop 2016.

Eric also says: "Another reason was the fact that Thomas was about to
change max timer range [..]" (500462a, 'timers: Switch to
a non-cascading wheel').

Remove the timer and use a 32bit jiffies value containing timestamp until
entry is valid.

During conntrack lookup, even before doing tuple comparision, check
the timeout value and evict the entry in case it is too old.

The dying bit is used as a synchronization point to avoid races where
multiple cpus try to evict the same entry.

Because lookup is always lockless, we need to bump the refcnt once
when we evict, else we could try to evict already-dead entry that
is being recycled.

This is the standard/expected way when conntrack entries are destroyed.

Followup patches will introduce garbage colliction via work queue
and further places where we can reap obsoleted entries (e.g. during
netlink dumps), this is needed to avoid expired conntracks from hanging
around for too long when lookup rate is low after a busy period.

Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
  • Loading branch information
Florian Westphal authored and Pablo Neira Ayuso committed Aug 30, 2016
1 parent 616b14b commit f330a7f
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 63 deletions.
23 changes: 18 additions & 5 deletions include/net/netfilter/nf_conntrack.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ union nf_conntrack_expect_proto {

#include <linux/types.h>
#include <linux/skbuff.h>
#include <linux/timer.h>

#ifdef CONFIG_NETFILTER_DEBUG
#define NF_CT_ASSERT(x) WARN_ON(!(x))
Expand Down Expand Up @@ -73,7 +72,7 @@ struct nf_conn_help {
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>

struct nf_conn {
/* Usage count in here is 1 for hash table/destruct timer, 1 per skb,
/* Usage count in here is 1 for hash table, 1 per skb,
* plus 1 for any connection(s) we are `master' for
*
* Hint, SKB address this struct and refcnt via skb->nfct and
Expand All @@ -96,8 +95,8 @@ struct nf_conn {
/* Have we seen traffic both ways yet? (bitset) */
unsigned long status;

/* Timer function; drops refcnt when it goes off. */
struct timer_list timeout;
/* jiffies32 when this ct is considered dead */
u32 timeout;

possible_net_t ct_net;

Expand Down Expand Up @@ -291,14 +290,28 @@ static inline bool nf_is_loopback_packet(const struct sk_buff *skb)
return skb->dev && skb->skb_iif && skb->dev->flags & IFF_LOOPBACK;
}

#define nfct_time_stamp ((u32)(jiffies))

/* jiffies until ct expires, 0 if already expired */
static inline unsigned long nf_ct_expires(const struct nf_conn *ct)
{
long timeout = (long)ct->timeout.expires - (long)jiffies;
s32 timeout = ct->timeout - nfct_time_stamp;

return timeout > 0 ? timeout : 0;
}

static inline bool nf_ct_is_expired(const struct nf_conn *ct)
{
return (__s32)(ct->timeout - nfct_time_stamp) <= 0;
}

/* use after obtaining a reference count */
static inline bool nf_ct_should_gc(const struct nf_conn *ct)
{
return nf_ct_is_expired(ct) && nf_ct_is_confirmed(ct) &&
!nf_ct_is_dying(ct);
}

struct kernel_param;

int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp);
Expand Down
91 changes: 50 additions & 41 deletions net/netfilter/nf_conntrack_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,6 @@ destroy_conntrack(struct nf_conntrack *nfct)

pr_debug("destroy_conntrack(%p)\n", ct);
NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
NF_CT_ASSERT(!timer_pending(&ct->timeout));

if (unlikely(nf_ct_is_template(ct))) {
nf_ct_tmpl_free(ct);
Expand Down Expand Up @@ -434,35 +433,30 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
{
struct nf_conn_tstamp *tstamp;

if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
return false;

tstamp = nf_conn_tstamp_find(ct);
if (tstamp && tstamp->stop == 0)
tstamp->stop = ktime_get_real_ns();

if (nf_ct_is_dying(ct))
goto delete;

if (nf_conntrack_event_report(IPCT_DESTROY, ct,
portid, report) < 0) {
/* destroy event was not delivered */
/* destroy event was not delivered. nf_ct_put will
* be done by event cache worker on redelivery.
*/
nf_ct_delete_from_lists(ct);
nf_conntrack_ecache_delayed_work(nf_ct_net(ct));
return false;
}

nf_conntrack_ecache_work(nf_ct_net(ct));
set_bit(IPS_DYING_BIT, &ct->status);
delete:
nf_ct_delete_from_lists(ct);
nf_ct_put(ct);
return true;
}
EXPORT_SYMBOL_GPL(nf_ct_delete);

static void death_by_timeout(unsigned long ul_conntrack)
{
nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0);
}

static inline bool
nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
const struct nf_conntrack_tuple *tuple,
Expand All @@ -480,6 +474,18 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
net_eq(net, nf_ct_net(ct));
}

/* caller must hold rcu readlock and none of the nf_conntrack_locks */
static void nf_ct_gc_expired(struct nf_conn *ct)
{
if (!atomic_inc_not_zero(&ct->ct_general.use))
return;

if (nf_ct_should_gc(ct))
nf_ct_kill(ct);

nf_ct_put(ct);
}

/*
* Warning :
* - Caller must take a reference on returned object
Expand All @@ -499,6 +505,17 @@ ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
bucket = reciprocal_scale(hash, hsize);

hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
struct nf_conn *ct;

ct = nf_ct_tuplehash_to_ctrack(h);
if (nf_ct_is_expired(ct)) {
nf_ct_gc_expired(ct);
continue;
}

if (nf_ct_is_dying(ct))
continue;

if (nf_ct_key_equal(h, tuple, zone, net)) {
NF_CT_STAT_INC_ATOMIC(net, found);
return h;
Expand Down Expand Up @@ -597,7 +614,6 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
zone, net))
goto out;

add_timer(&ct->timeout);
smp_wmb();
/* The caller holds a reference to this object */
atomic_set(&ct->ct_general.use, 2);
Expand Down Expand Up @@ -750,8 +766,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
/* Timer relative to confirmation time, not original
setting time, otherwise we'd get timer wrap in
weird delay cases. */
ct->timeout.expires += jiffies;
add_timer(&ct->timeout);
ct->timeout += nfct_time_stamp;
atomic_inc(&ct->ct_general.use);
ct->status |= IPS_CONFIRMED;

Expand Down Expand Up @@ -815,8 +830,16 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,

hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
ct = nf_ct_tuplehash_to_ctrack(h);
if (ct != ignored_conntrack &&
nf_ct_key_equal(h, tuple, zone, net)) {

if (ct == ignored_conntrack)
continue;

if (nf_ct_is_expired(ct)) {
nf_ct_gc_expired(ct);
continue;
}

if (nf_ct_key_equal(h, tuple, zone, net)) {
NF_CT_STAT_INC_ATOMIC(net, found);
rcu_read_unlock();
return 1;
Expand Down Expand Up @@ -850,6 +873,11 @@ static unsigned int early_drop_list(struct net *net,
hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
tmp = nf_ct_tuplehash_to_ctrack(h);

if (nf_ct_is_expired(tmp)) {
nf_ct_gc_expired(tmp);
continue;
}

if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
!net_eq(nf_ct_net(tmp), net) ||
nf_ct_is_dying(tmp))
Expand All @@ -867,7 +895,6 @@ static unsigned int early_drop_list(struct net *net,
*/
if (net_eq(nf_ct_net(tmp), net) &&
nf_ct_is_confirmed(tmp) &&
del_timer(&tmp->timeout) &&
nf_ct_delete(tmp, 0, 0))
drops++;

Expand Down Expand Up @@ -937,8 +964,6 @@ __nf_conntrack_alloc(struct net *net,
/* save hash for reusing when confirming */
*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
ct->status = 0;
/* Don't set timer yet: wait for confirmation */
setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
write_pnet(&ct->ct_net, net);
memset(&ct->__nfct_init_offset[0], 0,
offsetof(struct nf_conn, proto) -
Expand Down Expand Up @@ -1312,26 +1337,17 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
unsigned long extra_jiffies,
int do_acct)
{
NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
NF_CT_ASSERT(skb);

/* Only update if this is not a fixed timeout */
if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
goto acct;

/* If not in hash table, timer will not be active yet */
if (!nf_ct_is_confirmed(ct)) {
ct->timeout.expires = extra_jiffies;
} else {
unsigned long newtime = jiffies + extra_jiffies;

/* Only update the timeout if the new timeout is at least
HZ jiffies from the old timeout. Need del_timer for race
avoidance (may already be dying). */
if (newtime - ct->timeout.expires >= HZ)
mod_timer_pending(&ct->timeout, newtime);
}
if (nf_ct_is_confirmed(ct))
extra_jiffies += nfct_time_stamp;

ct->timeout = extra_jiffies;
acct:
if (do_acct)
nf_ct_acct_update(ct, ctinfo, skb->len);
Expand All @@ -1346,11 +1362,7 @@ bool __nf_ct_kill_acct(struct nf_conn *ct,
if (do_acct)
nf_ct_acct_update(ct, ctinfo, skb->len);

if (del_timer(&ct->timeout)) {
ct->timeout.function((unsigned long)ct);
return true;
}
return false;
return nf_ct_delete(ct, 0, 0);
}
EXPORT_SYMBOL_GPL(__nf_ct_kill_acct);

Expand Down Expand Up @@ -1485,11 +1497,8 @@ void nf_ct_iterate_cleanup(struct net *net,

while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
/* Time to push up daises... */
if (del_timer(&ct->timeout))
nf_ct_delete(ct, portid, report);

/* ... else the timer will get him soon. */

nf_ct_delete(ct, portid, report);
nf_ct_put(ct);
cond_resched();
}
Expand Down
14 changes: 5 additions & 9 deletions net/netfilter/nf_conntrack_netlink.c
Original file line number Diff line number Diff line change
Expand Up @@ -1144,9 +1144,7 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl,
}
}

if (del_timer(&ct->timeout))
nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh));

nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh));
nf_ct_put(ct);

return 0;
Expand Down Expand Up @@ -1514,11 +1512,10 @@ static int ctnetlink_change_timeout(struct nf_conn *ct,
{
u_int32_t timeout = ntohl(nla_get_be32(cda[CTA_TIMEOUT]));

if (!del_timer(&ct->timeout))
return -ETIME;
ct->timeout = nfct_time_stamp + timeout * HZ;

ct->timeout.expires = jiffies + timeout * HZ;
add_timer(&ct->timeout);
if (test_bit(IPS_DYING_BIT, &ct->status))
return -ETIME;

return 0;
}
Expand Down Expand Up @@ -1716,9 +1713,8 @@ ctnetlink_create_conntrack(struct net *net,

if (!cda[CTA_TIMEOUT])
goto err1;
ct->timeout.expires = ntohl(nla_get_be32(cda[CTA_TIMEOUT]));

ct->timeout.expires = jiffies + ct->timeout.expires * HZ;
ct->timeout = nfct_time_stamp + ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;

rcu_read_lock();
if (cda[CTA_HELP]) {
Expand Down
3 changes: 1 addition & 2 deletions net/netfilter/nf_conntrack_pptp.c
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,7 @@ static int destroy_sibling_or_exp(struct net *net, struct nf_conn *ct,
pr_debug("setting timeout of conntrack %p to 0\n", sibling);
sibling->proto.gre.timeout = 0;
sibling->proto.gre.stream_timeout = 0;
if (del_timer(&sibling->timeout))
sibling->timeout.function((unsigned long)sibling);
nf_ct_kill(sibling);
nf_ct_put(sibling);
return 1;
} else {
Expand Down
6 changes: 0 additions & 6 deletions net/netfilter/nf_nat_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -565,16 +565,10 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
* Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack()
* will delete entry from already-freed table.
*/
if (!del_timer(&ct->timeout))
return 1;

ct->status &= ~IPS_NAT_DONE_MASK;

rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource,
nf_nat_bysource_params);

add_timer(&ct->timeout);

/* don't delete conntrack. Although that would make things a lot
* simpler, we'd end up flushing all conntracks on nat rmmod.
*/
Expand Down

0 comments on commit f330a7f

Please sign in to comment.