Skip to content

Commit

Permalink
netfilter: iptables: lock free counters
Browse files Browse the repository at this point in the history
The reader/writer lock in ip_tables is acquired in the critical path of
processing packets and is one of the reasons just loading iptables can cause
a 20% performance loss. The rwlock serves two functions:

1) it prevents changes to table state (xt_replace) while table is in use.
   This is now handled by doing rcu on the xt_table. When table is
   replaced, the new table(s) are put in and the old table(s) are freed
   after RCU period.

2) it provides synchronization when accessing the counter values.
   This is now handled by swapping in new table_info entries for each cpu
   then summing the old values, and putting the result back onto one
   cpu.  On a busy system it may cause sampling to occur at different
   times on each cpu, but no packet/byte counts are lost in the process.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

Successfully tested on my dual quad core machine too, but iptables only (no ipv6 here)
BTW, my new "tbench 8" result is 2450 MB/s, (it was 2150 MB/s not so long ago)

Acked-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
  • Loading branch information
Stephen Hemminger authored and Patrick McHardy committed Feb 20, 2009
1 parent 323dbf9 commit 7845447
Show file tree
Hide file tree
Showing 5 changed files with 284 additions and 102 deletions.
6 changes: 4 additions & 2 deletions include/linux/netfilter/x_tables.h
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ struct xt_table
unsigned int valid_hooks;

/* Lock for the curtain */
rwlock_t lock;
struct mutex lock;

/* Man behind the curtain... */
struct xt_table_info *private;
Expand Down Expand Up @@ -385,7 +385,7 @@ struct xt_table_info

/* ipt_entry tables: one per CPU */
/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
char *entries[1];
void *entries[1];
};

#define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
Expand Down Expand Up @@ -432,6 +432,8 @@ extern void xt_proto_fini(struct net *net, u_int8_t af);

extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
extern void xt_free_table_info(struct xt_table_info *info);
extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
struct xt_table_info *new);

#ifdef CONFIG_COMPAT
#include <net/compat.h>
Expand Down
115 changes: 88 additions & 27 deletions net/ipv4/netfilter/arp_tables.c
Original file line number Diff line number Diff line change
Expand Up @@ -261,9 +261,10 @@ unsigned int arpt_do_table(struct sk_buff *skb,
indev = in ? in->name : nulldevname;
outdev = out ? out->name : nulldevname;

read_lock_bh(&table->lock);
private = table->private;
table_base = (void *)private->entries[smp_processor_id()];
rcu_read_lock();
private = rcu_dereference(table->private);
table_base = rcu_dereference(private->entries[smp_processor_id()]);

e = get_entry(table_base, private->hook_entry[hook]);
back = get_entry(table_base, private->underflow[hook]);

Expand Down Expand Up @@ -335,7 +336,8 @@ unsigned int arpt_do_table(struct sk_buff *skb,
e = (void *)e + e->next_offset;
}
} while (!hotdrop);
read_unlock_bh(&table->lock);

rcu_read_unlock();

if (hotdrop)
return NF_DROP;
Expand Down Expand Up @@ -738,11 +740,65 @@ static void get_counters(const struct xt_table_info *t,
}
}

static inline struct xt_counters *alloc_counters(struct xt_table *table)

/*
 * Fold one saved counter pair into a rule entry.
 *
 * Adds the byte and packet counts stored in addme[*i] onto e->counters
 * and then advances *i, so that successive calls consume successive
 * slots of addme[].  Always returns 0.
 *
 * We're lazy, and add to the first CPU; overflow works its fey magic
 * and everything is OK.
 */
static int
add_counter_to_entry(struct arpt_entry *e,
		     const struct xt_counters addme[],
		     unsigned int *i)
{
	/* Claim the current slot and step the caller's cursor in one go. */
	const unsigned int slot = (*i)++;

	ADD_COUNTER(e->counters, addme[slot].bcnt, addme[slot].pcnt);
	return 0;
}

/*
 * Take values from counters[] and add them back onto the current cpu's
 * copy of the entries held in t.
 *
 * Bottom halves are disabled for the duration of the walk; the CPU id is
 * sampled only after they are off and that same id is used for the whole
 * pass.  counters[] is consumed in order, one slot per visited entry.
 */
static void put_counters(struct xt_table_info *t,
			 const struct xt_counters counters[])
{
	unsigned int idx = 0;
	unsigned int this_cpu;

	local_bh_disable();
	this_cpu = smp_processor_id();
	ARPT_ENTRY_ITERATE(t->entries[this_cpu], t->size,
			   add_counter_to_entry, counters, &idx);
	local_bh_enable();
}

/*
 * Reset both the byte and the packet counter of a single entry.
 * The arg parameter is not used.  Always returns 0.
 */
static inline int
zero_entry_counter(struct arpt_entry *e, void *arg)
{
	e->counters.pcnt = e->counters.bcnt = 0;
	return 0;
}

/*
 * Fill newinfo with a copy of info whose counters are all zero.
 *
 * The fixed header (everything before the entries[] array) is copied
 * verbatim, leaving newinfo's own entries[] pointers untouched.  Then,
 * for every possible CPU, the entry blob of the CPU we are running on is
 * copied into newinfo's blob for that CPU and each entry's counters are
 * cleared.
 *
 * newinfo->entries[cpu] must already point at buffers large enough to
 * hold info->size bytes; nothing is allocated here.
 */
static void
clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
{
	const void *src = info->entries[raw_smp_processor_id()];
	unsigned int cpu;

	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));

	for_each_possible_cpu(cpu) {
		void *dst = newinfo->entries[cpu];

		memcpy(dst, src, info->size);
		ARPT_ENTRY_ITERATE(dst, newinfo->size,
				   zero_entry_counter, NULL);
	}
}

static struct xt_counters *alloc_counters(struct xt_table *table)
{
unsigned int countersize;
struct xt_counters *counters;
const struct xt_table_info *private = table->private;
struct xt_table_info *private = table->private;
struct xt_table_info *info;

/* We need atomic snapshot of counters: rest doesn't change
* (other than comefrom, which userspace doesn't care
Expand All @@ -752,14 +808,30 @@ static inline struct xt_counters *alloc_counters(struct xt_table *table)
counters = vmalloc_node(countersize, numa_node_id());

if (counters == NULL)
return ERR_PTR(-ENOMEM);
goto nomem;

info = xt_alloc_table_info(private->size);
if (!info)
goto free_counters;

/* First, sum counters... */
write_lock_bh(&table->lock);
get_counters(private, counters);
write_unlock_bh(&table->lock);
clone_counters(info, private);

mutex_lock(&table->lock);
xt_table_entry_swap_rcu(private, info);
synchronize_net(); /* Wait until smoke has cleared */

get_counters(info, counters);
put_counters(private, counters);
mutex_unlock(&table->lock);

xt_free_table_info(info);

return counters;

free_counters:
vfree(counters);
nomem:
return ERR_PTR(-ENOMEM);
}

static int copy_entries_to_user(unsigned int total_size,
Expand Down Expand Up @@ -1099,20 +1171,6 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
return ret;
}

/* We're lazy, and add to the first CPU; overflow works its fey magic
* and everything is OK.
*
* Adds the byte/packet counts in addme[*i] onto e->counters, then
* advances *i to the next slot.  Always returns 0.
*/
static inline int add_counter_to_entry(struct arpt_entry *e,
const struct xt_counters addme[],
unsigned int *i)
{

ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);

/* Step the caller's cursor so the next call uses the next addme[] slot. */
(*i)++;
return 0;
}

static int do_add_counters(struct net *net, void __user *user, unsigned int len,
int compat)
{
Expand Down Expand Up @@ -1172,13 +1230,14 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
goto free;
}

write_lock_bh(&t->lock);
mutex_lock(&t->lock);
private = t->private;
if (private->number != num_counters) {
ret = -EINVAL;
goto unlock_up_free;
}

preempt_disable();
i = 0;
/* Choose the copy that is on our node */
loc_cpu_entry = private->entries[smp_processor_id()];
Expand All @@ -1187,8 +1246,10 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
add_counter_to_entry,
paddc,
&i);
preempt_enable();
unlock_up_free:
write_unlock_bh(&t->lock);
mutex_unlock(&t->lock);

xt_table_unlock(t);
module_put(t->me);
free:
Expand Down
120 changes: 87 additions & 33 deletions net/ipv4/netfilter/ip_tables.c
Original file line number Diff line number Diff line change
Expand Up @@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
mtpar.family = tgpar.family = NFPROTO_IPV4;
tgpar.hooknum = hook;

read_lock_bh(&table->lock);
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
private = table->private;
table_base = (void *)private->entries[smp_processor_id()];

rcu_read_lock();
private = rcu_dereference(table->private);
table_base = rcu_dereference(private->entries[smp_processor_id()]);

e = get_entry(table_base, private->hook_entry[hook]);

/* For return from builtin chain */
Expand Down Expand Up @@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
}
} while (!hotdrop);

read_unlock_bh(&table->lock);
rcu_read_unlock();

#ifdef DEBUG_ALLOW_ALL
return NF_ACCEPT;
Expand Down Expand Up @@ -924,13 +926,68 @@ get_counters(const struct xt_table_info *t,
counters,
&i);
}

}

/*
 * Fold one saved counter pair into a rule entry.
 *
 * Adds the byte and packet counts stored in addme[*i] onto e->counters
 * and then advances *i, so that successive calls consume successive
 * slots of addme[].  Always returns 0.
 *
 * We're lazy, and add to the first CPU; overflow works its fey magic
 * and everything is OK.
 */
static int
add_counter_to_entry(struct ipt_entry *e,
		     const struct xt_counters addme[],
		     unsigned int *i)
{
	/* Claim the current slot and step the caller's cursor in one go. */
	const unsigned int slot = (*i)++;

	ADD_COUNTER(e->counters, addme[slot].bcnt, addme[slot].pcnt);
	return 0;
}

/*
 * Take values from counters[] and add them back onto the current cpu's
 * copy of the entries held in t.
 *
 * Bottom halves are disabled for the duration of the walk; the CPU id is
 * sampled only after they are off and that same id is used for the whole
 * pass.  counters[] is consumed in order, one slot per visited entry.
 */
static void put_counters(struct xt_table_info *t,
			 const struct xt_counters counters[])
{
	unsigned int idx = 0;
	unsigned int this_cpu;

	local_bh_disable();
	this_cpu = smp_processor_id();
	IPT_ENTRY_ITERATE(t->entries[this_cpu], t->size,
			  add_counter_to_entry, counters, &idx);
	local_bh_enable();
}


/*
 * Reset both the byte and the packet counter of a single entry.
 * The arg parameter is not used.  Always returns 0.
 */
static inline int
zero_entry_counter(struct ipt_entry *e, void *arg)
{
	e->counters.pcnt = e->counters.bcnt = 0;
	return 0;
}

/*
 * Fill newinfo with a copy of info whose counters are all zero.
 *
 * The fixed header (everything before the entries[] array) is copied
 * verbatim, leaving newinfo's own entries[] pointers untouched.  Then,
 * for every possible CPU, the entry blob of the CPU we are running on is
 * copied into newinfo's blob for that CPU and each entry's counters are
 * cleared.
 *
 * newinfo->entries[cpu] must already point at buffers large enough to
 * hold info->size bytes; nothing is allocated here.
 */
static void
clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
{
	const void *src = info->entries[raw_smp_processor_id()];
	unsigned int cpu;

	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));

	for_each_possible_cpu(cpu) {
		void *dst = newinfo->entries[cpu];

		memcpy(dst, src, info->size);
		IPT_ENTRY_ITERATE(dst, newinfo->size,
				  zero_entry_counter, NULL);
	}
}

static struct xt_counters * alloc_counters(struct xt_table *table)
{
unsigned int countersize;
struct xt_counters *counters;
const struct xt_table_info *private = table->private;
struct xt_table_info *private = table->private;
struct xt_table_info *info;

/* We need atomic snapshot of counters: rest doesn't change
(other than comefrom, which userspace doesn't care
Expand All @@ -939,14 +996,30 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
counters = vmalloc_node(countersize, numa_node_id());

if (counters == NULL)
return ERR_PTR(-ENOMEM);
goto nomem;

/* First, sum counters... */
write_lock_bh(&table->lock);
get_counters(private, counters);
write_unlock_bh(&table->lock);
info = xt_alloc_table_info(private->size);
if (!info)
goto free_counters;

clone_counters(info, private);

mutex_lock(&table->lock);
xt_table_entry_swap_rcu(private, info);
synchronize_net(); /* Wait until smoke has cleared */

get_counters(info, counters);
put_counters(private, counters);
mutex_unlock(&table->lock);

xt_free_table_info(info);

return counters;

free_counters:
vfree(counters);
nomem:
return ERR_PTR(-ENOMEM);
}

static int
Expand Down Expand Up @@ -1312,27 +1385,6 @@ do_replace(struct net *net, void __user *user, unsigned int len)
return ret;
}

/* We're lazy, and add to the first CPU; overflow works its fey magic
* and everything is OK.
*
* Adds the byte/packet counts in addme[*i] onto e->counters, then
* advances *i to the next slot.  Always returns 0. */
static int
add_counter_to_entry(struct ipt_entry *e,
const struct xt_counters addme[],
unsigned int *i)
{
/* Compiled-out debug trace of the entry's counters and the values being added. */
#if 0
duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
*i,
(long unsigned int)e->counters.pcnt,
(long unsigned int)e->counters.bcnt,
(long unsigned int)addme[*i].pcnt,
(long unsigned int)addme[*i].bcnt);
#endif

ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);

/* Step the caller's cursor so the next call uses the next addme[] slot. */
(*i)++;
return 0;
}

static int
do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
Expand Down Expand Up @@ -1393,13 +1445,14 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
goto free;
}

write_lock_bh(&t->lock);
mutex_lock(&t->lock);
private = t->private;
if (private->number != num_counters) {
ret = -EINVAL;
goto unlock_up_free;
}

preempt_disable();
i = 0;
/* Choose the copy that is on our node */
loc_cpu_entry = private->entries[raw_smp_processor_id()];
Expand All @@ -1408,8 +1461,9 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
add_counter_to_entry,
paddc,
&i);
preempt_enable();
unlock_up_free:
write_unlock_bh(&t->lock);
mutex_unlock(&t->lock);
xt_table_unlock(t);
module_put(t->me);
free:
Expand Down
Loading

0 comments on commit 7845447

Please sign in to comment.