Skip to content

Commit

Permalink
netfilter: nf_tables: implement proper set selection
Browse files Browse the repository at this point in the history
The current set selection simply chooses the first set type that provides
the requested features, which always results in the rbtree being chosen
by virtue of being the first set in the list.

What we actually want to do is choose the implementation that can provide
the requested features and is optimal from either a performance or memory
perspective depending on the characteristics of the elements and the
preferences specified by the user.

The elements are not known when creating a set. Even if we would provide
them for anonymous (literal) sets, we'd still have standalone sets where
the elements are not known in advance. We therefore need an abstract
description of the data characteristics.

The kernel already knows the size of the key, this patch starts by
introducing a nested set description which so far contains only the maximum
amount of elements. Based on this the set implementations are changed to
provide an estimate of the required amount of memory and the lookup
complexity class.

The set ops have a new callback ->estimate() that is invoked during set
selection. It receives a structure containing the attributes known to the
kernel and is supposed to populate a struct nft_set_estimate with the
complexity class and, in case the size is known, the complete amount of
memory required, or the amount of memory required per element otherwise.

Based on the policy specified by the user (performance/memory, defaulting
to performance) the kernel will then select the best suited implementation.

Even if a set implementation would allow adding more than the specified
maximum number of elements, the limit is still enforced, since new
implementations might not be able to hold more than the maximum based on
which they were selected.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
  • Loading branch information
Patrick McHardy authored and Pablo Neira Ayuso committed Apr 2, 2014
1 parent fe92ca4 commit c50b960
Show file tree
Hide file tree
Showing 5 changed files with 242 additions and 18 deletions.
46 changes: 46 additions & 0 deletions include/net/netfilter/nf_tables.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,44 @@ struct nft_set_iter {
const struct nft_set_elem *elem);
};

/**
* struct nft_set_desc - description of set elements
*
* @klen: key length
* @dlen: data length
* @size: maximum number of set elements, 0 if no maximum was specified
*
* Abstract description of the data a set is going to hold. It is handed to
* the set implementations' ->estimate() and ->init() callbacks.
*/
struct nft_set_desc {
unsigned int klen;
unsigned int dlen;
unsigned int size;
};

/**
* enum nft_set_class - performance class
*
* @NFT_SET_CLASS_O_1: constant, O(1)
* @NFT_SET_CLASS_O_LOG_N: logarithmic, O(log N)
* @NFT_SET_CLASS_O_N: linear, O(N)
*
* The enumerators are ordered from best to worst lookup complexity; set
* selection compares them numerically and prefers the smaller value.
*/
enum nft_set_class {
NFT_SET_CLASS_O_1,
NFT_SET_CLASS_O_LOG_N,
NFT_SET_CLASS_O_N,
};

/**
* struct nft_set_estimate - estimation of memory and performance
* characteristics
*
* @size: required memory; the total amount if the number of elements is
*        known, the amount required per element otherwise
* @class: lookup performance class
*
* Populated by a set implementation's ->estimate() callback during set
* selection.
*/
struct nft_set_estimate {
unsigned int size;
enum nft_set_class class;
};

/**
* struct nft_set_ops - nf_tables set operations
*
Expand Down Expand Up @@ -174,7 +212,11 @@ struct nft_set_ops {
struct nft_set_iter *iter);

unsigned int (*privsize)(const struct nlattr * const nla[]);
bool (*estimate)(const struct nft_set_desc *desc,
u32 features,
struct nft_set_estimate *est);
int (*init)(const struct nft_set *set,
const struct nft_set_desc *desc,
const struct nlattr * const nla[]);
void (*destroy)(const struct nft_set *set);

Expand All @@ -194,6 +236,8 @@ void nft_unregister_set(struct nft_set_ops *ops);
* @name: name of the set
* @ktype: key type (numeric type defined by userspace, not used in the kernel)
* @dtype: data type (verdict or numeric type defined by userspace)
* @size: maximum set size
* @nelems: number of elements
* @ops: set ops
* @flags: set flags
* @klen: key length
Expand All @@ -206,6 +250,8 @@ struct nft_set {
char name[IFNAMSIZ];
u32 ktype;
u32 dtype;
u32 size;
u32 nelems;
/* runtime data below here */
const struct nft_set_ops *ops ____cacheline_aligned;
u16 flags;
Expand Down
27 changes: 27 additions & 0 deletions include/uapi/linux/netfilter/nf_tables.h
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,29 @@ enum nft_set_flags {
NFT_SET_MAP = 0x8,
};

/**
* enum nft_set_policies - set selection policy
*
* @NFT_SET_POL_PERFORMANCE: prefer high performance over low memory use
* @NFT_SET_POL_MEMORY: prefer low memory use over high performance
*
* Carried in the NFTA_SET_POLICY attribute. The kernel falls back to
* NFT_SET_POL_PERFORMANCE when the attribute is not present.
*/
enum nft_set_policies {
NFT_SET_POL_PERFORMANCE,
NFT_SET_POL_MEMORY,
};

/**
* enum nft_set_desc_attributes - set element description
*
* @NFTA_SET_DESC_SIZE: number of elements in set (NLA_U32)
*
* These attributes are nested inside the NFTA_SET_DESC attribute.
*/
enum nft_set_desc_attributes {
NFTA_SET_DESC_UNSPEC,
NFTA_SET_DESC_SIZE,
__NFTA_SET_DESC_MAX
};
/* Highest valid set description attribute type. */
#define NFTA_SET_DESC_MAX (__NFTA_SET_DESC_MAX - 1)

/**
* enum nft_set_attributes - nf_tables set netlink attributes
*
Expand All @@ -221,6 +244,8 @@ enum nft_set_flags {
* @NFTA_SET_KEY_LEN: key data length (NLA_U32)
* @NFTA_SET_DATA_TYPE: mapping data type (NLA_U32)
* @NFTA_SET_DATA_LEN: mapping data length (NLA_U32)
* @NFTA_SET_POLICY: selection policy (NLA_U32)
* @NFTA_SET_DESC: set description (NLA_NESTED)
*/
enum nft_set_attributes {
NFTA_SET_UNSPEC,
Expand All @@ -231,6 +256,8 @@ enum nft_set_attributes {
NFTA_SET_KEY_LEN,
NFTA_SET_DATA_TYPE,
NFTA_SET_DATA_LEN,
NFTA_SET_POLICY,
NFTA_SET_DESC,
__NFTA_SET_MAX
};
#define NFTA_SET_MAX (__NFTA_SET_MAX - 1)
Expand Down
121 changes: 105 additions & 16 deletions net/netfilter/nf_tables_api.c
Original file line number Diff line number Diff line change
Expand Up @@ -1912,9 +1912,18 @@ void nft_unregister_set(struct nft_set_ops *ops)
}
EXPORT_SYMBOL_GPL(nft_unregister_set);

static const struct nft_set_ops *nft_select_set_ops(const struct nlattr * const nla[])
/*
* Select a set implementation based on the data characteristics and the
* given policy. The total memory use might not be known if no size is
* given, in that case the amount of memory per element is used.
*/
static const struct nft_set_ops *
nft_select_set_ops(const struct nlattr * const nla[],
const struct nft_set_desc *desc,
enum nft_set_policies policy)
{
const struct nft_set_ops *ops;
const struct nft_set_ops *ops, *bops;
struct nft_set_estimate est, best;
u32 features;

#ifdef CONFIG_MODULES
Expand All @@ -1932,15 +1941,45 @@ static const struct nft_set_ops *nft_select_set_ops(const struct nlattr * const
features &= NFT_SET_INTERVAL | NFT_SET_MAP;
}

// FIXME: implement selection properly
bops = NULL;
best.size = ~0;
best.class = ~0;

list_for_each_entry(ops, &nf_tables_set_ops, list) {
if ((ops->features & features) != features)
continue;
if (!ops->estimate(desc, features, &est))
continue;

switch (policy) {
case NFT_SET_POL_PERFORMANCE:
if (est.class < best.class)
break;
if (est.class == best.class && est.size < best.size)
break;
continue;
case NFT_SET_POL_MEMORY:
if (est.size < best.size)
break;
if (est.size == best.size && est.class < best.class)
break;
continue;
default:
break;
}

if (!try_module_get(ops->owner))
continue;
return ops;
if (bops != NULL)
module_put(bops->owner);

bops = ops;
best = est;
}

if (bops != NULL)
return bops;

return ERR_PTR(-EOPNOTSUPP);
}

Expand All @@ -1952,6 +1991,12 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
[NFTA_SET_KEY_LEN] = { .type = NLA_U32 },
[NFTA_SET_DATA_TYPE] = { .type = NLA_U32 },
[NFTA_SET_DATA_LEN] = { .type = NLA_U32 },
[NFTA_SET_POLICY] = { .type = NLA_U32 },
[NFTA_SET_DESC] = { .type = NLA_NESTED },
};

/* Netlink policy used to validate the attributes nested in NFTA_SET_DESC. */
static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
[NFTA_SET_DESC_SIZE] = { .type = NLA_U32 },
};

static int nft_ctx_init_from_setattr(struct nft_ctx *ctx,
Expand Down Expand Up @@ -2043,6 +2088,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
{
struct nfgenmsg *nfmsg;
struct nlmsghdr *nlh;
struct nlattr *desc;
u32 portid = NETLINK_CB(ctx->skb).portid;
u32 seq = ctx->nlh->nlmsg_seq;

Expand Down Expand Up @@ -2076,6 +2122,14 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
goto nla_put_failure;
}

desc = nla_nest_start(skb, NFTA_SET_DESC);
if (desc == NULL)
goto nla_put_failure;
if (set->size &&
nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size)))
goto nla_put_failure;
nla_nest_end(skb, desc);

return nlmsg_end(skb, nlh);

nla_put_failure:
Expand Down Expand Up @@ -2304,6 +2358,23 @@ static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb,
return err;
}

/*
 * Parse a nested set description attribute into @desc.
 *
 * @ctx:  nf_tables context (not used by this function)
 * @desc: set description to update; only the size field is touched, and
 *        only when NFTA_SET_DESC_SIZE is present in the nested attribute
 * @nla:  the nested attribute to parse
 *
 * Returns 0 on success or the negative error reported by
 * nla_parse_nested().
 */
static int nf_tables_set_desc_parse(const struct nft_ctx *ctx,
				    struct nft_set_desc *desc,
				    const struct nlattr *nla)
{
	struct nlattr *tb[NFTA_SET_DESC_MAX + 1];
	const struct nlattr *size_attr;
	int ret;

	ret = nla_parse_nested(tb, NFTA_SET_DESC_MAX, nla, nft_set_desc_policy);
	if (ret < 0)
		return ret;

	/* The element count is optional; leave desc->size alone if absent. */
	size_attr = tb[NFTA_SET_DESC_SIZE];
	if (size_attr != NULL)
		desc->size = ntohl(nla_get_be32(size_attr));

	return 0;
}

static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
Expand All @@ -2318,23 +2389,26 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
char name[IFNAMSIZ];
unsigned int size;
bool create;
u32 ktype, klen, dlen, dtype, flags;
u32 ktype, dtype, flags, policy;
struct nft_set_desc desc;
int err;

if (nla[NFTA_SET_TABLE] == NULL ||
nla[NFTA_SET_NAME] == NULL ||
nla[NFTA_SET_KEY_LEN] == NULL)
return -EINVAL;

memset(&desc, 0, sizeof(desc));

ktype = NFT_DATA_VALUE;
if (nla[NFTA_SET_KEY_TYPE] != NULL) {
ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE]));
if ((ktype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK)
return -EINVAL;
}

klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN]));
if (klen == 0 || klen > FIELD_SIZEOF(struct nft_data, data))
desc.klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN]));
if (desc.klen == 0 || desc.klen > FIELD_SIZEOF(struct nft_data, data))
return -EINVAL;

flags = 0;
Expand All @@ -2346,7 +2420,6 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
}

dtype = 0;
dlen = 0;
if (nla[NFTA_SET_DATA_TYPE] != NULL) {
if (!(flags & NFT_SET_MAP))
return -EINVAL;
Expand All @@ -2359,15 +2432,25 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
if (dtype != NFT_DATA_VERDICT) {
if (nla[NFTA_SET_DATA_LEN] == NULL)
return -EINVAL;
dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN]));
if (dlen == 0 ||
dlen > FIELD_SIZEOF(struct nft_data, data))
desc.dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN]));
if (desc.dlen == 0 ||
desc.dlen > FIELD_SIZEOF(struct nft_data, data))
return -EINVAL;
} else
dlen = sizeof(struct nft_data);
desc.dlen = sizeof(struct nft_data);
} else if (flags & NFT_SET_MAP)
return -EINVAL;

policy = NFT_SET_POL_PERFORMANCE;
if (nla[NFTA_SET_POLICY] != NULL)
policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY]));

if (nla[NFTA_SET_DESC] != NULL) {
err = nf_tables_set_desc_parse(&ctx, &desc, nla[NFTA_SET_DESC]);
if (err < 0)
return err;
}

create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;

afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create);
Expand Down Expand Up @@ -2398,7 +2481,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
if (!(nlh->nlmsg_flags & NLM_F_CREATE))
return -ENOENT;

ops = nft_select_set_ops(nla);
ops = nft_select_set_ops(nla, &desc, policy);
if (IS_ERR(ops))
return PTR_ERR(ops);

Expand All @@ -2419,12 +2502,13 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
INIT_LIST_HEAD(&set->bindings);
set->ops = ops;
set->ktype = ktype;
set->klen = klen;
set->klen = desc.klen;
set->dtype = dtype;
set->dlen = dlen;
set->dlen = desc.dlen;
set->flags = flags;
set->size = desc.size;

err = ops->init(set, nla);
err = ops->init(set, &desc, nla);
if (err < 0)
goto err2;

Expand Down Expand Up @@ -2733,6 +2817,9 @@ static int nft_add_set_elem(const struct nft_ctx *ctx, struct nft_set *set,
enum nft_registers dreg;
int err;

if (set->size && set->nelems == set->size)
return -ENFILE;

err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr,
nft_set_elem_policy);
if (err < 0)
Expand Down Expand Up @@ -2798,6 +2885,7 @@ static int nft_add_set_elem(const struct nft_ctx *ctx, struct nft_set *set,
err = set->ops->insert(set, &elem);
if (err < 0)
goto err3;
set->nelems++;

return 0;

Expand Down Expand Up @@ -2867,6 +2955,7 @@ static int nft_del_setelem(const struct nft_ctx *ctx, struct nft_set *set,
goto err2;

set->ops->remove(set, &elem);
set->nelems--;

nft_data_uninit(&elem.key, NFT_DATA_VALUE);
if (set->flags & NFT_SET_MAP)
Expand Down
Loading

0 comments on commit c50b960

Please sign in to comment.