Skip to content

Commit

Permalink
net/ipv6: Simplify route replace and appending into multipath route
Browse files Browse the repository at this point in the history
Bring consistency to ipv6 route replace and append semantics.

Remove rt6_qualify_for_ecmp which is just guess work. It fails in 2 cases:
1. can not replace a route with a reject route. Existing code appends
   a new route instead of replacing the existing one.

2. can not have a multipath route where a leg uses a dev only nexthop

Existing use cases affected by this change:
1. adding a route with existing prefix and metric using NLM_F_CREATE
   without NLM_F_APPEND or NLM_F_EXCL (ie., what iproute2 calls
   'prepend'). Existing code auto-determines that the new nexthop can
   be appended to an existing route to create a multipath route. This
   change breaks that by requiring the APPEND flag for the new route
   to be added to an existing one. Instead the prepend just adds another
   route entry.

2. route replace. Existing code replaces first matching multipath route
   if new route is multipath capable and fallback to first matching
   non-ECMP route (reject or dev only route) in case one isn't available.
   New behavior replaces first matching route. (Thanks to Ido for spotting
   this one)

Note: Newer iproute2 is needed to display multipath routes with a dev-only
      nexthop. This is due to a bug in iproute2 and parsing nexthops.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David Ahern authored and David S. Miller committed May 22, 2018
1 parent 5a15a1b commit f34436a
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 93 deletions.
6 changes: 0 additions & 6 deletions include/net/ip6_route.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,6 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
}

static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
{
return (f6i->fib6_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
RTF_GATEWAY;
}

void ip6_route_input(struct sk_buff *skb);
struct dst_entry *ip6_route_input_lookup(struct net *net,
struct net_device *dev,
Expand Down
157 changes: 71 additions & 86 deletions net/ipv6/ip6_fib.c
Original file line number Diff line number Diff line change
Expand Up @@ -934,19 +934,19 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
{
struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
lockdep_is_held(&rt->fib6_table->tb6_lock));
struct fib6_info *iter = NULL;
struct fib6_info *iter = NULL, *match = NULL;
struct fib6_info __rcu **ins;
struct fib6_info __rcu **fallback_ins = NULL;
int replace = (info->nlh &&
(info->nlh->nlmsg_flags & NLM_F_REPLACE));
int append = (info->nlh &&
(info->nlh->nlmsg_flags & NLM_F_APPEND));
int add = (!info->nlh ||
(info->nlh->nlmsg_flags & NLM_F_CREATE));
int found = 0;
bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
u16 nlflags = NLM_F_EXCL;
int err;

if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
if (append)
nlflags |= NLM_F_APPEND;

ins = &fn->leaf;
Expand All @@ -968,13 +968,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,

nlflags &= ~NLM_F_EXCL;
if (replace) {
if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
found++;
break;
}
if (rt_can_ecmp)
fallback_ins = fallback_ins ?: ins;
goto next_iter;
found++;
break;
}

if (rt6_duplicate_nexthop(iter, rt)) {
Expand All @@ -989,86 +984,67 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu);
return -EEXIST;
}
/* If we have the same destination and the same metric,
* but not the same gateway, then the route we try to
* add is sibling to this route, increment our counter
* of siblings, and later we will add our route to the
* list.
* Only static routes (which don't have flag
* RTF_EXPIRES) are used for ECMPv6.
*
* To avoid long list, we only had siblings if the
* route have a gateway.
*/
if (rt_can_ecmp &&
rt6_qualify_for_ecmp(iter))
rt->fib6_nsiblings++;

/* first route that matches */
if (!match)
match = iter;
}

if (iter->fib6_metric > rt->fib6_metric)
break;

next_iter:
ins = &iter->fib6_next;
}

if (fallback_ins && !found) {
/* No ECMP-able route found, replace first non-ECMP one */
ins = fallback_ins;
iter = rcu_dereference_protected(*ins,
lockdep_is_held(&rt->fib6_table->tb6_lock));
found++;
}

/* Reset round-robin state, if necessary */
if (ins == &fn->leaf)
fn->rr_ptr = NULL;

/* Link this route to others same route. */
if (rt->fib6_nsiblings) {
unsigned int fib6_nsiblings;
if (append && match) {
struct fib6_info *sibling, *temp_sibling;

/* Find the first route that have the same metric */
sibling = leaf;
while (sibling) {
if (sibling->fib6_metric == rt->fib6_metric &&
rt6_qualify_for_ecmp(sibling)) {
list_add_tail(&rt->fib6_siblings,
&sibling->fib6_siblings);
break;
}
sibling = rcu_dereference_protected(sibling->fib6_next,
lockdep_is_held(&rt->fib6_table->tb6_lock));
if (rt->fib6_flags & RTF_REJECT) {
NL_SET_ERR_MSG(extack,
"Can not append a REJECT route");
return -EINVAL;
} else if (match->fib6_flags & RTF_REJECT) {
NL_SET_ERR_MSG(extack,
"Can not append to a REJECT route");
return -EINVAL;
}
rt->fib6_nsiblings = match->fib6_nsiblings;
list_add_tail(&rt->fib6_siblings, &match->fib6_siblings);
match->fib6_nsiblings++;

/* For each sibling in the list, increment the counter of
* siblings. BUG() if counters does not match, list of siblings
* is broken!
*/
fib6_nsiblings = 0;
list_for_each_entry_safe(sibling, temp_sibling,
&rt->fib6_siblings, fib6_siblings) {
&match->fib6_siblings, fib6_siblings) {
sibling->fib6_nsiblings++;
BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
fib6_nsiblings++;
BUG_ON(sibling->fib6_nsiblings != match->fib6_nsiblings);
}
BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
rt6_multipath_rebalance(temp_sibling);

rt6_multipath_rebalance(match);
}

/*
* insert node
*/
if (!replace) {
enum fib_event_type event;

if (!add)
pr_warn("NLM_F_CREATE should be set when creating new route\n");

add:
nlflags |= NLM_F_CREATE;

err = call_fib6_entry_notifiers(info->nl_net,
FIB_EVENT_ENTRY_ADD,
rt, extack);
event = append ? FIB_EVENT_ENTRY_APPEND : FIB_EVENT_ENTRY_ADD;
err = call_fib6_entry_notifiers(info->nl_net, event, rt,
extack);
if (err)
return err;

Expand All @@ -1086,7 +1062,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
}

} else {
int nsiblings;
struct fib6_info *tmp;

if (!found) {
if (add)
Expand All @@ -1101,48 +1077,57 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
if (err)
return err;

/* if route being replaced has siblings, set tmp to
* last one, otherwise tmp is current route. this is
* used to set fib6_next for new route
*/
if (iter->fib6_nsiblings)
tmp = list_last_entry(&iter->fib6_siblings,
struct fib6_info,
fib6_siblings);
else
tmp = iter;

/* insert new route */
atomic_inc(&rt->fib6_ref);
rcu_assign_pointer(rt->fib6_node, fn);
rt->fib6_next = iter->fib6_next;
rt->fib6_next = tmp->fib6_next;
rcu_assign_pointer(*ins, rt);

if (!info->skip_notify)
inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
if (!(fn->fn_flags & RTN_RTINFO)) {
info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
fn->fn_flags |= RTN_RTINFO;
}
nsiblings = iter->fib6_nsiblings;
iter->fib6_node = NULL;
fib6_purge_rt(iter, fn, info->nl_net);
if (rcu_access_pointer(fn->rr_ptr) == iter)
fn->rr_ptr = NULL;
fib6_info_release(iter);

if (nsiblings) {
/* delete old route */
rt = iter;

if (rt->fib6_nsiblings) {
struct fib6_info *tmp;

/* Replacing an ECMP route, remove all siblings */
ins = &rt->fib6_next;
iter = rcu_dereference_protected(*ins,
lockdep_is_held(&rt->fib6_table->tb6_lock));
while (iter) {
if (iter->fib6_metric > rt->fib6_metric)
break;
if (rt6_qualify_for_ecmp(iter)) {
*ins = iter->fib6_next;
iter->fib6_node = NULL;
fib6_purge_rt(iter, fn, info->nl_net);
if (rcu_access_pointer(fn->rr_ptr) == iter)
fn->rr_ptr = NULL;
fib6_info_release(iter);
nsiblings--;
info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
} else {
ins = &iter->fib6_next;
}
iter = rcu_dereference_protected(*ins,
lockdep_is_held(&rt->fib6_table->tb6_lock));
list_for_each_entry_safe(iter, tmp, &rt->fib6_siblings,
fib6_siblings) {
iter->fib6_node = NULL;
fib6_purge_rt(iter, fn, info->nl_net);
if (rcu_access_pointer(fn->rr_ptr) == iter)
fn->rr_ptr = NULL;
fib6_info_release(iter);

rt->fib6_nsiblings--;
info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
}
WARN_ON(nsiblings != 0);
}

WARN_ON(rt->fib6_nsiblings != 0);

rt->fib6_node = NULL;
fib6_purge_rt(rt, fn, info->nl_net);
if (rcu_access_pointer(fn->rr_ptr) == rt)
fn->rr_ptr = NULL;
fib6_info_release(rt);
}

return 0;
Expand Down
3 changes: 2 additions & 1 deletion net/ipv6/route.c
Original file line number Diff line number Diff line change
Expand Up @@ -3791,7 +3791,7 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
lockdep_is_held(&rt->fib6_table->tb6_lock));
while (iter) {
if (iter->fib6_metric == rt->fib6_metric &&
rt6_qualify_for_ecmp(iter))
iter->fib6_nsiblings)
return iter;
iter = rcu_dereference_protected(iter->fib6_next,
lockdep_is_held(&rt->fib6_table->tb6_lock));
Expand Down Expand Up @@ -4381,6 +4381,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
*/
cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
NLM_F_REPLACE);
cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND;
nhn++;
}

Expand Down

0 comments on commit f34436a

Please sign in to comment.