Skip to content

Commit

Permalink
Merge branch 'ipv6-ipv4-nexthop-align'
Browse files Browse the repository at this point in the history
Ido Schimmel says:

====================
ipv6: Align nexthop behaviour with IPv4

This set tries to eliminate some differences between IPv4's and IPv6's
treatment of nexthops. These differences are most likely a side effect
of IPv6's data structures (specifically 'rt6_info') that incorporate
both the route and the nexthop and the late addition of ECMP support in
commit 51ebd31 ("ipv6: add support of equal cost multipath
(ECMP)").

IPv4 and IPv6 do not react the same to certain netdev events. For
example, upon carrier change, affected IPv4 nexthops are marked using the
RTNH_F_LINKDOWN flag and the nexthop group is rebalanced accordingly.
IPv6, on the other hand, does nothing, which forces us to perform a
carrier check during route lookup and dump. This makes it difficult to
introduce features such as non-equal-cost multipath that are built on
top of this set [1].

In addition, when a netdev is put administratively down, IPv4 nexthops
are marked using the RTNH_F_DEAD flag, whereas IPv6 simply flushes all
the routes using these nexthops. To be consistent with IPv4, multipath
routes should only be flushed when all nexthops in the group are
considered dead.

The first 12 patches introduce non-functional changes that store the
RTNH_F_DEAD and RTNH_F_LINKDOWN flags in IPv6 routes based on netdev
events, in a similar fashion to IPv4. This allows us to remove the
carrier check performed during route lookup and dump.

The next three patches make sure we only flush a multipath route when
all of its nexthops are dead.

The last three patches add test cases for IPv4/IPv6 FIB. These verify that
both address families react similarly to netdev events.

Finally, this series also serves as a good first step towards David
Ahern's goal of treating nexthops as standalone objects [2], as it makes
the code more in line with IPv4 where the nexthop and the nexthop group
are separate objects from the route itself.

1. https://github.com/idosch/linux/tree/ipv6-nexthops
2. http://vger.kernel.org/netconf2017_files/nexthop-objects.pdf

Changes since RFC (feedback from David Ahern):
* Remove redundant declaration of rt6_ifdown() in patch 4 and adjust
comment referencing it accordingly
* Drop patch to flush multipath routes upon NETDEV_UNREGISTER. Reword
cover letter accordingly
* Use a temporary variable to make code more readable in patch 15
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Jan 8, 2018
2 parents 7f0b800 + 82e45b6 commit f66faae
Show file tree
Hide file tree
Showing 7 changed files with 618 additions and 42 deletions.
4 changes: 3 additions & 1 deletion include/net/ip6_fib.h
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,8 @@ struct rt6_info {
unsigned short rt6i_nfheader_len;
u8 rt6i_protocol;
u8 exception_bucket_flushed:1,
unused:7;
should_flush:1,
unused:6;
};

#define for_each_fib6_node_rt_rcu(fn) \
Expand Down Expand Up @@ -404,6 +405,7 @@ unsigned int fib6_tables_seq_read(struct net *net);
int fib6_tables_dump(struct net *net, struct notifier_block *nb);

void fib6_update_sernum(struct rt6_info *rt);
void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
int fib6_rules_init(void);
Expand Down
4 changes: 3 additions & 1 deletion include/net/ip6_route.h
Original file line number Diff line number Diff line change
Expand Up @@ -165,10 +165,12 @@ struct rt6_rtnl_dump_arg {
};

int rt6_dump_route(struct rt6_info *rt, void *p_arg);
void rt6_ifdown(struct net *net, struct net_device *dev);
void rt6_mtu_change(struct net_device *dev, unsigned int mtu);
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp);
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway);
void rt6_sync_up(struct net_device *dev, unsigned int nh_flags);
void rt6_disable_ip(struct net_device *dev, unsigned long event);
void rt6_sync_down_dev(struct net_device *dev, unsigned long event);

static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb)
{
Expand Down
9 changes: 7 additions & 2 deletions net/ipv6/addrconf.c
Original file line number Diff line number Diff line change
Expand Up @@ -3438,6 +3438,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
} else if (event == NETDEV_CHANGE) {
if (!addrconf_link_ready(dev)) {
/* device is still not ready. */
rt6_sync_down_dev(dev, event);
break;
}

Expand All @@ -3449,6 +3450,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
* multicast snooping switches
*/
ipv6_mc_up(idev);
rt6_sync_up(dev, RTNH_F_LINKDOWN);
break;
}
idev->if_flags |= IF_READY;
Expand Down Expand Up @@ -3484,6 +3486,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
if (run_pending)
addrconf_dad_run(idev);

/* Device has an address by now */
rt6_sync_up(dev, RTNH_F_DEAD);

/*
* If the MTU changed during the interface down,
* when the interface up, the changed MTU must be
Expand Down Expand Up @@ -3577,6 +3582,7 @@ static bool addr_is_local(const struct in6_addr *addr)

static int addrconf_ifdown(struct net_device *dev, int how)
{
unsigned long event = how ? NETDEV_UNREGISTER : NETDEV_DOWN;
struct net *net = dev_net(dev);
struct inet6_dev *idev;
struct inet6_ifaddr *ifa, *tmp;
Expand All @@ -3586,8 +3592,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)

ASSERT_RTNL();

rt6_ifdown(net, dev);
neigh_ifdown(&nd_tbl, dev);
rt6_disable_ip(dev, event);

idev = __in6_dev_get(dev);
if (!idev)
Expand Down
28 changes: 18 additions & 10 deletions net/ipv6/ip6_fib.c
Original file line number Diff line number Diff line change
Expand Up @@ -107,16 +107,13 @@ enum {

void fib6_update_sernum(struct rt6_info *rt)
{
struct fib6_table *table = rt->rt6i_table;
struct net *net = dev_net(rt->dst.dev);
struct fib6_node *fn;

spin_lock_bh(&table->tb6_lock);
fn = rcu_dereference_protected(rt->rt6i_node,
lockdep_is_held(&table->tb6_lock));
lockdep_is_held(&rt->rt6i_table->tb6_lock));
if (fn)
fn->fn_sernum = fib6_new_sernum(net);
spin_unlock_bh(&table->tb6_lock);
}

/*
Expand Down Expand Up @@ -1102,8 +1099,8 @@ void fib6_force_start_gc(struct net *net)
jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
}

static void fib6_update_sernum_upto_root(struct rt6_info *rt,
int sernum)
static void __fib6_update_sernum_upto_root(struct rt6_info *rt,
int sernum)
{
struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node,
lockdep_is_held(&rt->rt6i_table->tb6_lock));
Expand All @@ -1117,6 +1114,11 @@ static void fib6_update_sernum_upto_root(struct rt6_info *rt,
}
}

void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt)
{
__fib6_update_sernum_upto_root(rt, fib6_new_sernum(net));
}

/*
* Add routing information to the routing tree.
* <destination addr>/<source addr>
Expand Down Expand Up @@ -1230,7 +1232,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,

err = fib6_add_rt2node(fn, rt, info, mxc, extack);
if (!err) {
fib6_update_sernum_upto_root(rt, sernum);
__fib6_update_sernum_upto_root(rt, sernum);
fib6_start_gc(info->nl_net, rt);
}

Expand Down Expand Up @@ -1887,7 +1889,7 @@ static int fib6_clean_node(struct fib6_walker *w)

for_each_fib6_walker_rt(w) {
res = c->func(rt, c->arg);
if (res < 0) {
if (res == -1) {
w->leaf = rt;
res = fib6_del(rt, &info);
if (res) {
Expand All @@ -1900,6 +1902,12 @@ static int fib6_clean_node(struct fib6_walker *w)
continue;
}
return 0;
} else if (res == -2) {
if (WARN_ON(!rt->rt6i_nsiblings))
continue;
rt = list_last_entry(&rt->rt6i_siblings,
struct rt6_info, rt6i_siblings);
continue;
}
WARN_ON(res != 0);
}
Expand All @@ -1911,7 +1919,8 @@ static int fib6_clean_node(struct fib6_walker *w)
* Convenient frontend to tree walker.
*
* func is called on each route.
* It may return -1 -> delete this route.
* It may return -2 -> skip multipath route.
* -1 -> delete this route.
* 0 -> continue walking
*/

Expand Down Expand Up @@ -2103,7 +2112,6 @@ static void fib6_net_exit(struct net *net)
{
unsigned int i;

rt6_ifdown(net, NULL);
del_timer_sync(&net->ipv6.ip6_fib_timer);

for (i = 0; i < FIB6_TABLE_HASHSZ; i++) {
Expand Down
Loading

0 comments on commit f66faae

Please sign in to comment.