Skip to content

Commit

Permalink
net/ipv6: Add knob to skip DELROUTE message on device down
Browse files Browse the repository at this point in the history
Another difference between IPv4 and IPv6 is the generation of RTM_DELROUTE
notifications when a device is taken down (admin down) or deleted. IPv4
does not generate a message for routes evicted by the down or delete;
IPv6 does. A NOS at scale really needs to avoid these messages and have
IPv4 and IPv6 behave similarly, relying on userspace to handle link
notifications and evict the routes.

At this point existing user behavior needs to be preserved. Since
notifications are a global action (not per app) the only way to preserve
existing behavior and allow the messages to be skipped is to add a new
sysctl (net/ipv6/route/skip_notify_on_dev_down) which can be set to
disable the notifications.

IPv6 route code already supports the option to skip the message (it is
used for multipath routes for example). Besides the new sysctl we need
to pass the skip_notify setting through the generic fib6_clean and
fib6_walk functions to fib6_clean_node and to set skip_notify on calls
to __ip6_del_rt for the addrconf_ifdown path.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David Ahern authored and David S. Miller committed Oct 12, 2018
1 parent 7cc2d50 commit 7c6bb7d
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 6 deletions.
8 changes: 8 additions & 0 deletions Documentation/networking/ip-sysctl.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1442,6 +1442,14 @@ max_hbh_length - INTEGER
header.
Default: INT_MAX (unlimited)

skip_notify_on_dev_down - BOOLEAN
Controls whether an RTM_DELROUTE message is generated for routes
removed when a device is taken down or deleted. IPv4 does not
generate this message; IPv6 does by default. Setting this sysctl
to true skips the message, making IPv4 and IPv6 on par in relying
on userspace caches to track link events and evict routes.
Default: false (generate message)

IPv6 Fragmentation:

ip6frag_high_thresh - INTEGER
Expand Down
3 changes: 3 additions & 0 deletions include/net/ip6_fib.h
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,9 @@ struct fib6_node *fib6_locate(struct fib6_node *root,

void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *arg),
void *arg);
void fib6_clean_all_skip_notify(struct net *net,
int (*func)(struct fib6_info *, void *arg),
void *arg);

int fib6_add(struct fib6_node *root, struct fib6_info *rt,
struct nl_info *info, struct netlink_ext_ack *extack);
Expand Down
1 change: 1 addition & 0 deletions include/net/netns/ipv6.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ struct netns_sysctl_ipv6 {
int max_dst_opts_len;
int max_hbh_opts_len;
int seg6_flowlabel;
/* Non-zero: do not send RTM_DELROUTE for routes evicted when a device
 * goes down or is deleted. Declared as int rather than bool because the
 * sysctl entry for this knob uses .maxlen = sizeof(int) with an integer
 * proc handler, which reads and writes a full int at this address.
 */
int skip_notify_on_dev_down;
};

struct netns_ipv6 {
Expand Down
20 changes: 15 additions & 5 deletions net/ipv6/ip6_fib.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ struct fib6_cleaner {
int (*func)(struct fib6_info *, void *arg);
int sernum;
void *arg;
bool skip_notify;
};

#ifdef CONFIG_IPV6_SUBTREES
Expand Down Expand Up @@ -1956,6 +1957,7 @@ static int fib6_clean_node(struct fib6_walker *w)
struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
struct nl_info info = {
.nl_net = c->net,
.skip_notify = c->skip_notify,
};

if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
Expand Down Expand Up @@ -2007,7 +2009,7 @@ static int fib6_clean_node(struct fib6_walker *w)

static void fib6_clean_tree(struct net *net, struct fib6_node *root,
int (*func)(struct fib6_info *, void *arg),
int sernum, void *arg)
int sernum, void *arg, bool skip_notify)
{
struct fib6_cleaner c;

Expand All @@ -2019,13 +2021,14 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root,
c.sernum = sernum;
c.arg = arg;
c.net = net;
c.skip_notify = skip_notify;

fib6_walk(net, &c.w);
}

static void __fib6_clean_all(struct net *net,
int (*func)(struct fib6_info *, void *),
int sernum, void *arg)
int sernum, void *arg, bool skip_notify)
{
struct fib6_table *table;
struct hlist_head *head;
Expand All @@ -2037,7 +2040,7 @@ static void __fib6_clean_all(struct net *net,
hlist_for_each_entry_rcu(table, head, tb6_hlist) {
spin_lock_bh(&table->tb6_lock);
fib6_clean_tree(net, &table->tb6_root,
func, sernum, arg);
func, sernum, arg, skip_notify);
spin_unlock_bh(&table->tb6_lock);
}
}
Expand All @@ -2047,14 +2050,21 @@ static void __fib6_clean_all(struct net *net,
void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *),
void *arg)
{
__fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg);
__fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false);
}

/*
 * fib6_clean_all_skip_notify - walk every FIB6 table like fib6_clean_all(),
 * but with notification suppressed.
 *
 * @net:  network namespace whose tables are walked
 * @func: per-route callback handed through to the tree cleaner
 * @arg:  opaque cookie forwarded to @func
 *
 * The skip flag ends up in the nl_info that fib6_clean_node() builds
 * (.skip_notify), so routes removed during this walk do not generate a
 * netlink delete message. The tree serial number is left unchanged.
 */
void fib6_clean_all_skip_notify(struct net *net,
				int (*func)(struct fib6_info *, void *),
				void *arg)
{
	const bool skip_notify = true;

	__fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, skip_notify);
}

static void fib6_flush_trees(struct net *net)
{
int new_sernum = fib6_new_sernum(net);

__fib6_clean_all(net, NULL, new_sernum, NULL);
__fib6_clean_all(net, NULL, new_sernum, NULL, false);
}

/*
Expand Down
20 changes: 19 additions & 1 deletion net/ipv6/route.c
Original file line number Diff line number Diff line change
Expand Up @@ -4026,8 +4026,12 @@ void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
.event = event,
},
};
struct net *net = dev_net(dev);

fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
if (net->ipv6.sysctl.skip_notify_on_dev_down)
fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
else
fib6_clean_all(net, fib6_ifdown, &arg);
}

void rt6_disable_ip(struct net_device *dev, unsigned long event)
Expand Down Expand Up @@ -5031,6 +5035,9 @@ int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
return 0;
}

/* Constant 0 and 1, referenced through .extra1 / .extra2 by the
 * skip_notify_on_dev_down entry in ipv6_route_table_template below.
 */
static int zero;
static int one = 1;

static struct ctl_table ipv6_route_table_template[] = {
{
.procname = "flush",
Expand Down Expand Up @@ -5102,6 +5109,15 @@ static struct ctl_table ipv6_route_table_template[] = {
.mode = 0644,
.proc_handler = proc_dointvec_ms_jiffies,
},
{
	/* net/ipv6/route/skip_notify_on_dev_down: 0 = send RTM_DELROUTE
	 * for routes removed on device down/delete, 1 = skip it.
	 *
	 * proc_dointvec_minmax is required here: plain proc_dointvec
	 * ignores .extra1/.extra2, so any integer would be accepted.
	 *
	 * NOTE(review): .maxlen is sizeof(int) and the handler stores an
	 * int, so the backing field should be an int rather than a bool;
	 * confirm against struct netns_sysctl_ipv6.
	 */
	.procname = "skip_notify_on_dev_down",
	.data = &init_net.ipv6.sysctl.skip_notify_on_dev_down,
	.maxlen = sizeof(int),
	.mode = 0644,
	.proc_handler = proc_dointvec_minmax,
	.extra1 = &zero,
	.extra2 = &one,
},
{ }
};

Expand All @@ -5125,6 +5141,7 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

/* Don't export sysctls to unprivileged users */
if (net->user_ns != &init_user_ns)
Expand Down Expand Up @@ -5189,6 +5206,7 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
net->ipv6.sysctl.skip_notify_on_dev_down = 0;

net->ipv6.ip6_rt_gc_expire = 30*HZ;

Expand Down

0 comments on commit 7c6bb7d

Please sign in to comment.