netfilter: ecache: use dedicated list for event redelivery
This disentangles event redelivery from the percpu dying list.

Because entries are now stored on a dedicated list, all
entries are in NFCT_ECACHE_DESTROY_FAIL state and still
have the confirmed bit set -- the reference count is at least 1.

The 'struct net' back-pointer can be removed as well.

The pcpu dying list will be removed eventually; it no longer serves any purpose.
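
In short, the failed-delivery path now parks the entry on the new
per-netns list. A condensed sketch (the patch implements this as
nf_ct_add_to_ecache_list(); the function name here is illustrative
and the CONFIG_NF_CONNTRACK_EVENTS=n fallback is omitted):

	/* Entry keeps its ct reference and its confirmed bit; being
	 * linked on the dedicated list is what marks its destroy
	 * event as pending redelivery.
	 */
	static void park_for_redelivery(struct nf_conn *ct)
	{
		struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct));

		spin_lock(&cnet->ecache.dying_lock);
		hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
					 &cnet->ecache.dying_list);
		spin_unlock(&cnet->ecache.dying_lock);
	}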

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Florian Westphal authored and Pablo Neira Ayuso committed May 13, 2022
1 parent a997157 commit 2ed3bf1
Showing 4 changed files with 82 additions and 73 deletions.
3 changes: 2 additions & 1 deletion include/net/netfilter/nf_conntrack.h
@@ -45,7 +45,8 @@ union nf_conntrack_expect_proto {

struct nf_conntrack_net_ecache {
struct delayed_work dwork;
struct netns_ct *ct_net;
spinlock_t dying_lock;
struct hlist_nulls_head dying_list;
};

struct nf_conntrack_net {
2 changes: 0 additions & 2 deletions include/net/netfilter/nf_conntrack_ecache.h
@@ -14,7 +14,6 @@
#include <net/netfilter/nf_conntrack_extend.h>

enum nf_ct_ecache_state {
NFCT_ECACHE_UNKNOWN, /* destroy event not sent */
NFCT_ECACHE_DESTROY_FAIL, /* tried but failed to send destroy event */
NFCT_ECACHE_DESTROY_SENT, /* sent destroy event after failure */
};
@@ -23,7 +22,6 @@ struct nf_conntrack_ecache {
unsigned long cache; /* bitops want long */
u16 ctmask; /* bitmask of ct events to be delivered */
u16 expmask; /* bitmask of expect events to be delivered */
enum nf_ct_ecache_state state:8;/* ecache state */
u32 missed; /* missed events */
u32 portid; /* netlink portid of destroyer */
};
33 changes: 28 additions & 5 deletions net/netfilter/nf_conntrack_core.c
@@ -660,15 +660,12 @@ void nf_ct_destroy(struct nf_conntrack *nfct)
}
EXPORT_SYMBOL(nf_ct_destroy);

static void nf_ct_delete_from_lists(struct nf_conn *ct)
static void __nf_ct_delete_from_lists(struct nf_conn *ct)
{
struct net *net = nf_ct_net(ct);
unsigned int hash, reply_hash;
unsigned int sequence;

nf_ct_helper_destroy(ct);

local_bh_disable();
do {
sequence = read_seqcount_begin(&nf_conntrack_generation);
hash = hash_conntrack(net,
@@ -681,12 +678,33 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct)

clean_from_lists(ct);
nf_conntrack_double_unlock(hash, reply_hash);
}

static void nf_ct_delete_from_lists(struct nf_conn *ct)
{
nf_ct_helper_destroy(ct);
local_bh_disable();

__nf_ct_delete_from_lists(ct);
nf_ct_add_to_dying_list(ct);

local_bh_enable();
}

static void nf_ct_add_to_ecache_list(struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct));

spin_lock(&cnet->ecache.dying_lock);
hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
&cnet->ecache.dying_list);
spin_unlock(&cnet->ecache.dying_lock);
#else
nf_ct_add_to_dying_list(ct);
#endif
}

bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
{
struct nf_conn_tstamp *tstamp;
@@ -709,7 +727,12 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
/* destroy event was not delivered. nf_ct_put will
* be done by event cache worker on redelivery.
*/
nf_ct_delete_from_lists(ct);
nf_ct_helper_destroy(ct);
local_bh_disable();
__nf_ct_delete_from_lists(ct);
nf_ct_add_to_ecache_list(ct);
local_bh_enable();

nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL);
return false;
}
117 changes: 52 additions & 65 deletions net/netfilter/nf_conntrack_ecache.c
@@ -16,7 +16,6 @@
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
@@ -29,103 +28,89 @@

static DEFINE_MUTEX(nf_ct_ecache_mutex);

#define ECACHE_RETRY_WAIT (HZ/10)
#define ECACHE_STACK_ALLOC (256 / sizeof(void *))
#define DYING_NULLS_VAL ((1 << 30) + 1)
#define ECACHE_MAX_JIFFIES msecs_to_jiffies(10)
#define ECACHE_RETRY_JIFFIES msecs_to_jiffies(10)

enum retry_state {
STATE_CONGESTED,
STATE_RESTART,
STATE_DONE,
};

static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu)
static enum retry_state ecache_work_evict_list(struct nf_conntrack_net *cnet)
{
struct nf_conn *refs[ECACHE_STACK_ALLOC];
unsigned long stop = jiffies + ECACHE_MAX_JIFFIES;
struct hlist_nulls_head evicted_list;
enum retry_state ret = STATE_DONE;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
unsigned int evicted = 0;
unsigned int sent;

spin_lock(&pcpu->lock);
INIT_HLIST_NULLS_HEAD(&evicted_list, DYING_NULLS_VAL);

hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) {
next:
sent = 0;
spin_lock_bh(&cnet->ecache.dying_lock);

hlist_nulls_for_each_entry_safe(h, n, &cnet->ecache.dying_list, hnnode) {
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
struct nf_conntrack_ecache *e;

if (!nf_ct_is_confirmed(ct))
continue;

/* This ecache access is safe because the ct is on the
* pcpu dying list and we hold the spinlock -- the entry
* cannot be free'd until after the lock is released.
*
* This is true even if ct has a refcount of 0: the
* cpu that is about to free the entry must remove it
* from the dying list and needs the lock to do so.
*/
e = nf_ct_ecache_find(ct);
if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL)
continue;

/* ct is in NFCT_ECACHE_DESTROY_FAIL state, this means
* the worker owns this entry: the ct will remain valid
* until the worker puts its ct reference.
/* The worker owns all entries, ct remains valid until nf_ct_put
* in the loop below.
*/
if (nf_conntrack_event(IPCT_DESTROY, ct)) {
ret = STATE_CONGESTED;
break;
}

e->state = NFCT_ECACHE_DESTROY_SENT;
refs[evicted] = ct;
hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &evicted_list);

if (++evicted >= ARRAY_SIZE(refs)) {
if (time_after(stop, jiffies)) {
ret = STATE_RESTART;
break;
}

if (sent++ > 16) {
spin_unlock_bh(&cnet->ecache.dying_lock);
cond_resched();
goto next;
}
}

spin_unlock(&pcpu->lock);
spin_unlock_bh(&cnet->ecache.dying_lock);

/* can't _put while holding lock */
while (evicted)
nf_ct_put(refs[--evicted]);
hlist_nulls_for_each_entry_safe(h, n, &evicted_list, hnnode) {
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

hlist_nulls_add_fake(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
nf_ct_put(ct);

cond_resched();
}

return ret;
}

static void ecache_work(struct work_struct *work)
{
struct nf_conntrack_net *cnet = container_of(work, struct nf_conntrack_net, ecache.dwork.work);
struct netns_ct *ctnet = cnet->ecache.ct_net;
int cpu, delay = -1;
struct ct_pcpu *pcpu;

local_bh_disable();

for_each_possible_cpu(cpu) {
enum retry_state ret;

pcpu = per_cpu_ptr(ctnet->pcpu_lists, cpu);

ret = ecache_work_evict_list(pcpu);

switch (ret) {
case STATE_CONGESTED:
delay = ECACHE_RETRY_WAIT;
goto out;
case STATE_RESTART:
delay = 0;
break;
case STATE_DONE:
break;
}
int ret, delay = -1;

ret = ecache_work_evict_list(cnet);
switch (ret) {
case STATE_CONGESTED:
delay = ECACHE_RETRY_JIFFIES;
break;
case STATE_RESTART:
delay = 0;
break;
case STATE_DONE:
break;
}

out:
local_bh_enable();

ctnet->ecache_dwork_pending = delay > 0;
if (delay >= 0)
schedule_delayed_work(&cnet->ecache.dwork, delay);
}
@@ -199,7 +184,6 @@ int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct,
*/
if (e->portid == 0 && portid != 0)
e->portid = portid;
e->state = NFCT_ECACHE_DESTROY_FAIL;
}

return ret;
@@ -297,8 +281,10 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state)
schedule_delayed_work(&cnet->ecache.dwork, HZ);
net->ct.ecache_dwork_pending = true;
} else if (state == NFCT_ECACHE_DESTROY_SENT) {
net->ct.ecache_dwork_pending = false;
mod_delayed_work(system_wq, &cnet->ecache.dwork, 0);
if (!hlist_nulls_empty(&cnet->ecache.dying_list))
mod_delayed_work(system_wq, &cnet->ecache.dwork, 0);
else
net->ct.ecache_dwork_pending = false;
}
}

@@ -311,8 +297,9 @@ void nf_conntrack_ecache_pernet_init(struct net *net)

net->ct.sysctl_events = nf_ct_events;

cnet->ecache.ct_net = &net->ct;
INIT_DELAYED_WORK(&cnet->ecache.dwork, ecache_work);
INIT_HLIST_NULLS_HEAD(&cnet->ecache.dying_list, DYING_NULLS_VAL);
spin_lock_init(&cnet->ecache.dying_lock);

BUILD_BUG_ON(__IPCT_MAX >= 16); /* e->ctmask is u16 */
}
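
For reference, the worker's drain in ecache_work_evict_list() boils
down to a two-phase pattern: deliver and collect under the lock, then
put the parked references outside it. A simplified sketch (the name
drain_dying() is illustrative; the congestion retry, time budget and
cond_resched() handling of the real function are omitted):

	static void drain_dying(struct nf_conntrack_net *cnet)
	{
		struct nf_conntrack_tuple_hash *h;
		struct hlist_nulls_head evicted_list;
		struct hlist_nulls_node *n;

		INIT_HLIST_NULLS_HEAD(&evicted_list, DYING_NULLS_VAL);

		/* Phase 1: under dying_lock, try to deliver each destroy
		 * event; on success move the entry to a private list.
		 * nf_ct_put() must not run while the lock is held.
		 */
		spin_lock_bh(&cnet->ecache.dying_lock);
		hlist_nulls_for_each_entry_safe(h, n, &cnet->ecache.dying_list, hnnode) {
			struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

			if (nf_conntrack_event(IPCT_DESTROY, ct))
				break;	/* congested, leave the rest for a retry */

			hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
			hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
					     &evicted_list);
		}
		spin_unlock_bh(&cnet->ecache.dying_lock);

		/* Phase 2: outside the lock, unlink each evicted entry and
		 * drop the reference it held while parked for redelivery.
		 */
		hlist_nulls_for_each_entry_safe(h, n, &evicted_list, hnnode) {
			struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

			hlist_nulls_add_fake(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
			hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
			nf_ct_put(ct);
		}
	}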
