Skip to content

Commit

Permalink
Merge branch 'ipv6-Separate-data-structures-for-FIB-and-data-path'
Browse files Browse the repository at this point in the history
David Ahern says:

====================
net/ipv6: Separate data structures for FIB and data path

IPv6 uses the same data struct for both control plane (FIB entries) and
data path (dst entries). This struct has elements needed for both paths
adding memory overhead and complexity (taking a dst hold in most places
but an additional reference on rt6i_ref in a few). Furthermore, because
of the dst_alloc tie, all FIB entries are allocated with GFP_ATOMIC.

This patch set separates FIB entries from dst entries, better aligning
IPv6 code with IPv4, simplifying the reference counting and allowing
FIB entries added by userspace (not autoconf) to use GFP_KERNEL. It is
first step to a number of performance and scalability changes.

The end result of this patch set:
  - FIB entries (fib6_info):
        /* size: 208, cachelines: 4, members: 25 */
        /* sum members: 207, holes: 1, sum holes: 1 */

  - dst entries (rt6_info)
       /* size: 240, cachelines: 4, members: 11 */

Versus the the single rt6_info struct today for both paths:
      /* size: 320, cachelines: 5, members: 28 */

This amounts to a 35% reduction in memory use for FIB entries and a
25% reduction for dst entries.

With respect to locking FIB entries use RCU and a single atomic
counter with fib6_info_hold and fib6_info_release helpers to manage
the reference counting. dst entries use only the traditional dst
refcounts with dst_hold and dst_release.

FIB entries for host routes are referenced by inet6_ifaddr and
ifacaddr6. In both cases, additional holds are taken -- similar to
what is done for devices.

This set is the first of many changes to improve the scalability of the
IPv6 code. Follow on changes include:
- consolidating duplicate fib6_info references like IPv4 does with
  duplicate fib_info

- moving fib6_info into a slab cache to avoid allocation roundups to
  power of 2 (the 208 size becomes a 256 actual allocation)

- Allow FIB lookups without generating a dst (e.g., most rt6_lookup
  users just want to verify the egress device). Means moving dst
  allocation to the other side of fib6_rule_lookup which again aligns
  with IPv4 behavior

- using separate standalone nexthop objects which have performance
  benefits beyond fib_info consolidation

At this point I am not seeing any refcount leaks or underflows, no
oops or bug_ons, or warnings from kasan, so I think it is ready for
others to beat up on it finding errors in code paths I have missed.

v2 changes
- rebased to top of tree
- improved commit message on patch 7

v1 changes
- rebased to top of tree
- fix memory leak of metrics as noted by Ido
- MTU fixes based on pmtu tests (thanks Stefano Brivio for writing)

RFC v2 changes
- improved commit messages
- move common metrics code from dst.c to net/ipv4/metrics.c (comment
  from DaveM)
- address comments from Wei Wang and Martin KaFai Lau (let me know if
  I missed something)
- fixes detected by kernel test robots
  + added fib6_metric_set to change metric on a FIB entry which could
    be pointing to read-only dst_default_metrics
  + 0day testing found a problem with an intermediate patch; added
    dst_hold_safe on rt->from. Code is removed 3 patches later
- allow cacheinfo to handle NULL dst; means only expires is pushed to
  userspace
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Apr 18, 2018
2 parents a2d481b + 77634cc commit 0565de2
Show file tree
Hide file tree
Showing 19 changed files with 1,218 additions and 1,136 deletions.
96 changes: 48 additions & 48 deletions drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ struct mlxsw_sp_fib6_entry {

struct mlxsw_sp_rt6 {
struct list_head list;
struct rt6_info *rt;
struct fib6_info *rt;
};

struct mlxsw_sp_lpm_tree {
Expand Down Expand Up @@ -2770,9 +2770,9 @@ mlxsw_sp_nexthop6_group_cmp(const struct mlxsw_sp_nexthop_group *nh_grp,
struct in6_addr *gw;
int ifindex, weight;

ifindex = mlxsw_sp_rt6->rt->dst.dev->ifindex;
weight = mlxsw_sp_rt6->rt->rt6i_nh_weight;
gw = &mlxsw_sp_rt6->rt->rt6i_gateway;
ifindex = mlxsw_sp_rt6->rt->fib6_nh.nh_dev->ifindex;
weight = mlxsw_sp_rt6->rt->fib6_nh.nh_weight;
gw = &mlxsw_sp_rt6->rt->fib6_nh.nh_gw;
if (!mlxsw_sp_nexthop6_group_has_nexthop(nh_grp, gw, ifindex,
weight))
return false;
Expand Down Expand Up @@ -2838,7 +2838,7 @@ mlxsw_sp_nexthop6_group_hash(struct mlxsw_sp_fib6_entry *fib6_entry, u32 seed)
struct net_device *dev;

list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) {
dev = mlxsw_sp_rt6->rt->dst.dev;
dev = mlxsw_sp_rt6->rt->fib6_nh.nh_dev;
val ^= dev->ifindex;
}

Expand Down Expand Up @@ -3834,11 +3834,11 @@ mlxsw_sp_rt6_nexthop(struct mlxsw_sp_nexthop_group *nh_grp,

for (i = 0; i < nh_grp->count; i++) {
struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i];
struct rt6_info *rt = mlxsw_sp_rt6->rt;
struct fib6_info *rt = mlxsw_sp_rt6->rt;

if (nh->rif && nh->rif->dev == rt->dst.dev &&
if (nh->rif && nh->rif->dev == rt->fib6_nh.nh_dev &&
ipv6_addr_equal((const struct in6_addr *) &nh->gw_addr,
&rt->rt6i_gateway))
&rt->fib6_nh.nh_gw))
return nh;
continue;
}
Expand Down Expand Up @@ -3895,7 +3895,7 @@ mlxsw_sp_fib6_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry)

if (fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_LOCAL) {
list_first_entry(&fib6_entry->rt6_list, struct mlxsw_sp_rt6,
list)->rt->rt6i_nh_flags |= RTNH_F_OFFLOAD;
list)->rt->fib6_nh.nh_flags |= RTNH_F_OFFLOAD;
return;
}

Expand All @@ -3905,9 +3905,9 @@ mlxsw_sp_fib6_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry)

nh = mlxsw_sp_rt6_nexthop(nh_grp, mlxsw_sp_rt6);
if (nh && nh->offloaded)
mlxsw_sp_rt6->rt->rt6i_nh_flags |= RTNH_F_OFFLOAD;
mlxsw_sp_rt6->rt->fib6_nh.nh_flags |= RTNH_F_OFFLOAD;
else
mlxsw_sp_rt6->rt->rt6i_nh_flags &= ~RTNH_F_OFFLOAD;
mlxsw_sp_rt6->rt->fib6_nh.nh_flags &= ~RTNH_F_OFFLOAD;
}
}

Expand All @@ -3920,9 +3920,9 @@ mlxsw_sp_fib6_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry)
fib6_entry = container_of(fib_entry, struct mlxsw_sp_fib6_entry,
common);
list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) {
struct rt6_info *rt = mlxsw_sp_rt6->rt;
struct fib6_info *rt = mlxsw_sp_rt6->rt;

rt->rt6i_nh_flags &= ~RTNH_F_OFFLOAD;
rt->fib6_nh.nh_flags &= ~RTNH_F_OFFLOAD;
}
}

Expand Down Expand Up @@ -4699,7 +4699,7 @@ static void mlxsw_sp_router_fib4_del(struct mlxsw_sp *mlxsw_sp,
mlxsw_sp_fib_node_put(mlxsw_sp, fib_node);
}

static bool mlxsw_sp_fib6_rt_should_ignore(const struct rt6_info *rt)
static bool mlxsw_sp_fib6_rt_should_ignore(const struct fib6_info *rt)
{
/* Packets with link-local destination IP arriving to the router
* are trapped to the CPU, so no need to program specific routes
Expand All @@ -4721,7 +4721,7 @@ static bool mlxsw_sp_fib6_rt_should_ignore(const struct rt6_info *rt)
return false;
}

static struct mlxsw_sp_rt6 *mlxsw_sp_rt6_create(struct rt6_info *rt)
static struct mlxsw_sp_rt6 *mlxsw_sp_rt6_create(struct fib6_info *rt)
{
struct mlxsw_sp_rt6 *mlxsw_sp_rt6;

Expand All @@ -4734,18 +4734,18 @@ static struct mlxsw_sp_rt6 *mlxsw_sp_rt6_create(struct rt6_info *rt)
* memory.
*/
mlxsw_sp_rt6->rt = rt;
rt6_hold(rt);
fib6_info_hold(rt);

return mlxsw_sp_rt6;
}

#if IS_ENABLED(CONFIG_IPV6)
static void mlxsw_sp_rt6_release(struct rt6_info *rt)
static void mlxsw_sp_rt6_release(struct fib6_info *rt)
{
rt6_release(rt);
fib6_info_release(rt);
}
#else
static void mlxsw_sp_rt6_release(struct rt6_info *rt)
static void mlxsw_sp_rt6_release(struct fib6_info *rt)
{
}
#endif
Expand All @@ -4756,13 +4756,13 @@ static void mlxsw_sp_rt6_destroy(struct mlxsw_sp_rt6 *mlxsw_sp_rt6)
kfree(mlxsw_sp_rt6);
}

static bool mlxsw_sp_fib6_rt_can_mp(const struct rt6_info *rt)
static bool mlxsw_sp_fib6_rt_can_mp(const struct fib6_info *rt)
{
/* RTF_CACHE routes are ignored */
return (rt->rt6i_flags & (RTF_GATEWAY | RTF_ADDRCONF)) == RTF_GATEWAY;
}

static struct rt6_info *
static struct fib6_info *
mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry)
{
return list_first_entry(&fib6_entry->rt6_list, struct mlxsw_sp_rt6,
Expand All @@ -4771,15 +4771,15 @@ mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry)

static struct mlxsw_sp_fib6_entry *
mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node,
const struct rt6_info *nrt, bool replace)
const struct fib6_info *nrt, bool replace)
{
struct mlxsw_sp_fib6_entry *fib6_entry;

if (!mlxsw_sp_fib6_rt_can_mp(nrt) || replace)
return NULL;

list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) {
struct rt6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry);
struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry);

/* RT6_TABLE_LOCAL and RT6_TABLE_MAIN share the same
* virtual router.
Expand All @@ -4802,7 +4802,7 @@ mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node,

static struct mlxsw_sp_rt6 *
mlxsw_sp_fib6_entry_rt_find(const struct mlxsw_sp_fib6_entry *fib6_entry,
const struct rt6_info *rt)
const struct fib6_info *rt)
{
struct mlxsw_sp_rt6 *mlxsw_sp_rt6;

Expand All @@ -4815,21 +4815,21 @@ mlxsw_sp_fib6_entry_rt_find(const struct mlxsw_sp_fib6_entry *fib6_entry,
}

static bool mlxsw_sp_nexthop6_ipip_type(const struct mlxsw_sp *mlxsw_sp,
const struct rt6_info *rt,
const struct fib6_info *rt,
enum mlxsw_sp_ipip_type *ret)
{
return rt->dst.dev &&
mlxsw_sp_netdev_ipip_type(mlxsw_sp, rt->dst.dev, ret);
return rt->fib6_nh.nh_dev &&
mlxsw_sp_netdev_ipip_type(mlxsw_sp, rt->fib6_nh.nh_dev, ret);
}

static int mlxsw_sp_nexthop6_type_init(struct mlxsw_sp *mlxsw_sp,
struct mlxsw_sp_nexthop_group *nh_grp,
struct mlxsw_sp_nexthop *nh,
const struct rt6_info *rt)
const struct fib6_info *rt)
{
const struct mlxsw_sp_ipip_ops *ipip_ops;
struct mlxsw_sp_ipip_entry *ipip_entry;
struct net_device *dev = rt->dst.dev;
struct net_device *dev = rt->fib6_nh.nh_dev;
struct mlxsw_sp_rif *rif;
int err;

Expand Down Expand Up @@ -4870,13 +4870,13 @@ static void mlxsw_sp_nexthop6_type_fini(struct mlxsw_sp *mlxsw_sp,
static int mlxsw_sp_nexthop6_init(struct mlxsw_sp *mlxsw_sp,
struct mlxsw_sp_nexthop_group *nh_grp,
struct mlxsw_sp_nexthop *nh,
const struct rt6_info *rt)
const struct fib6_info *rt)
{
struct net_device *dev = rt->dst.dev;
struct net_device *dev = rt->fib6_nh.nh_dev;

nh->nh_grp = nh_grp;
nh->nh_weight = rt->rt6i_nh_weight;
memcpy(&nh->gw_addr, &rt->rt6i_gateway, sizeof(nh->gw_addr));
nh->nh_weight = rt->fib6_nh.nh_weight;
memcpy(&nh->gw_addr, &rt->fib6_nh.nh_gw, sizeof(nh->gw_addr));
mlxsw_sp_nexthop_counter_alloc(mlxsw_sp, nh);

list_add_tail(&nh->router_list_node, &mlxsw_sp->router->nexthop_list);
Expand All @@ -4897,7 +4897,7 @@ static void mlxsw_sp_nexthop6_fini(struct mlxsw_sp *mlxsw_sp,
}

static bool mlxsw_sp_rt6_is_gateway(const struct mlxsw_sp *mlxsw_sp,
const struct rt6_info *rt)
const struct fib6_info *rt)
{
return rt->rt6i_flags & RTF_GATEWAY ||
mlxsw_sp_nexthop6_ipip_type(mlxsw_sp, rt, NULL);
Expand Down Expand Up @@ -4928,7 +4928,7 @@ mlxsw_sp_nexthop6_group_create(struct mlxsw_sp *mlxsw_sp,
nh_grp->gateway = mlxsw_sp_rt6_is_gateway(mlxsw_sp, mlxsw_sp_rt6->rt);
nh_grp->count = fib6_entry->nrt6;
for (i = 0; i < nh_grp->count; i++) {
struct rt6_info *rt = mlxsw_sp_rt6->rt;
struct fib6_info *rt = mlxsw_sp_rt6->rt;

nh = &nh_grp->nexthops[i];
err = mlxsw_sp_nexthop6_init(mlxsw_sp, nh_grp, nh, rt);
Expand Down Expand Up @@ -5040,7 +5040,7 @@ mlxsw_sp_nexthop6_group_update(struct mlxsw_sp *mlxsw_sp,
static int
mlxsw_sp_fib6_entry_nexthop_add(struct mlxsw_sp *mlxsw_sp,
struct mlxsw_sp_fib6_entry *fib6_entry,
struct rt6_info *rt)
struct fib6_info *rt)
{
struct mlxsw_sp_rt6 *mlxsw_sp_rt6;
int err;
Expand Down Expand Up @@ -5068,7 +5068,7 @@ mlxsw_sp_fib6_entry_nexthop_add(struct mlxsw_sp *mlxsw_sp,
static void
mlxsw_sp_fib6_entry_nexthop_del(struct mlxsw_sp *mlxsw_sp,
struct mlxsw_sp_fib6_entry *fib6_entry,
struct rt6_info *rt)
struct fib6_info *rt)
{
struct mlxsw_sp_rt6 *mlxsw_sp_rt6;

Expand All @@ -5084,7 +5084,7 @@ mlxsw_sp_fib6_entry_nexthop_del(struct mlxsw_sp *mlxsw_sp,

static void mlxsw_sp_fib6_entry_type_set(struct mlxsw_sp *mlxsw_sp,
struct mlxsw_sp_fib_entry *fib_entry,
const struct rt6_info *rt)
const struct fib6_info *rt)
{
/* Packets hitting RTF_REJECT routes need to be discarded by the
* stack. We can rely on their destination device not having a
Expand Down Expand Up @@ -5118,7 +5118,7 @@ mlxsw_sp_fib6_entry_rt_destroy_all(struct mlxsw_sp_fib6_entry *fib6_entry)
static struct mlxsw_sp_fib6_entry *
mlxsw_sp_fib6_entry_create(struct mlxsw_sp *mlxsw_sp,
struct mlxsw_sp_fib_node *fib_node,
struct rt6_info *rt)
struct fib6_info *rt)
{
struct mlxsw_sp_fib6_entry *fib6_entry;
struct mlxsw_sp_fib_entry *fib_entry;
Expand Down Expand Up @@ -5168,12 +5168,12 @@ static void mlxsw_sp_fib6_entry_destroy(struct mlxsw_sp *mlxsw_sp,

static struct mlxsw_sp_fib6_entry *
mlxsw_sp_fib6_node_entry_find(const struct mlxsw_sp_fib_node *fib_node,
const struct rt6_info *nrt, bool replace)
const struct fib6_info *nrt, bool replace)
{
struct mlxsw_sp_fib6_entry *fib6_entry, *fallback = NULL;

list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) {
struct rt6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry);
struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry);

if (rt->rt6i_table->tb6_id > nrt->rt6i_table->tb6_id)
continue;
Expand All @@ -5198,7 +5198,7 @@ mlxsw_sp_fib6_node_list_insert(struct mlxsw_sp_fib6_entry *new6_entry,
bool replace)
{
struct mlxsw_sp_fib_node *fib_node = new6_entry->common.fib_node;
struct rt6_info *nrt = mlxsw_sp_fib6_entry_rt(new6_entry);
struct fib6_info *nrt = mlxsw_sp_fib6_entry_rt(new6_entry);
struct mlxsw_sp_fib6_entry *fib6_entry;

fib6_entry = mlxsw_sp_fib6_node_entry_find(fib_node, nrt, replace);
Expand All @@ -5213,7 +5213,7 @@ mlxsw_sp_fib6_node_list_insert(struct mlxsw_sp_fib6_entry *new6_entry,
struct mlxsw_sp_fib6_entry *last;

list_for_each_entry(last, &fib_node->entry_list, common.list) {
struct rt6_info *rt = mlxsw_sp_fib6_entry_rt(last);
struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(last);

if (nrt->rt6i_table->tb6_id > rt->rt6i_table->tb6_id)
break;
Expand Down Expand Up @@ -5268,7 +5268,7 @@ mlxsw_sp_fib6_node_entry_unlink(struct mlxsw_sp *mlxsw_sp,

static struct mlxsw_sp_fib6_entry *
mlxsw_sp_fib6_entry_lookup(struct mlxsw_sp *mlxsw_sp,
const struct rt6_info *rt)
const struct fib6_info *rt)
{
struct mlxsw_sp_fib6_entry *fib6_entry;
struct mlxsw_sp_fib_node *fib_node;
Expand All @@ -5287,7 +5287,7 @@ mlxsw_sp_fib6_entry_lookup(struct mlxsw_sp *mlxsw_sp,
return NULL;

list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) {
struct rt6_info *iter_rt = mlxsw_sp_fib6_entry_rt(fib6_entry);
struct fib6_info *iter_rt = mlxsw_sp_fib6_entry_rt(fib6_entry);

if (rt->rt6i_table->tb6_id == iter_rt->rt6i_table->tb6_id &&
rt->rt6i_metric == iter_rt->rt6i_metric &&
Expand Down Expand Up @@ -5316,7 +5316,7 @@ static void mlxsw_sp_fib6_entry_replace(struct mlxsw_sp *mlxsw_sp,
}

static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp,
struct rt6_info *rt, bool replace)
struct fib6_info *rt, bool replace)
{
struct mlxsw_sp_fib6_entry *fib6_entry;
struct mlxsw_sp_fib_node *fib_node;
Expand Down Expand Up @@ -5373,7 +5373,7 @@ static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp,
}

static void mlxsw_sp_router_fib6_del(struct mlxsw_sp *mlxsw_sp,
struct rt6_info *rt)
struct fib6_info *rt)
{
struct mlxsw_sp_fib6_entry *fib6_entry;
struct mlxsw_sp_fib_node *fib_node;
Expand Down Expand Up @@ -5836,7 +5836,7 @@ static void mlxsw_sp_router_fib6_event(struct mlxsw_sp_fib_event_work *fib_work,
fen6_info = container_of(info, struct fib6_entry_notifier_info,
info);
fib_work->fen6_info = *fen6_info;
rt6_hold(fib_work->fen6_info.rt);
fib6_info_hold(fib_work->fen6_info.rt);
break;
}
}
Expand Down
Loading

0 comments on commit 0565de2

Please sign in to comment.