From f40b6ae2b612446dc970d7b51eeec47bd1619f82 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 22 May 2019 20:27:55 -0700 Subject: [PATCH 1/7] ipv6: Move pcpu cached routes to fib6_nh rt6_info are specific instances of a fib entry and are tied to a device and gateway - ie., a nexthop. Before nexthop objects, IPv6 fib entries have separate fib6_info for each nexthop in a multipath route, so the location of the pcpu cache in the fib6_info struct worked. However, with nexthop objects a fib6_info can point to a set of nexthops (yet another alignment of ipv6 with ipv4). Accordingly, the pcpu cache needs to be moved to the fib6_nh struct so the cached entries are local to the nexthop specification used to create the rt6_info. Initialization and free of the pcpu entries moved to fib6_nh_init and fib6_nh_release. Change in location only, from fib6_info down to fib6_nh; no other functional change intended. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 3 ++- net/ipv6/addrconf.c | 6 +++--- net/ipv6/ip6_fib.c | 34 ++++++---------------------------- net/ipv6/route.c | 29 +++++++++++++++++++++++++++-- 4 files changed, 38 insertions(+), 34 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 0d0d06b1cd26d..38e87ef81b7ee 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -131,6 +131,8 @@ struct fib6_nh { #ifdef CONFIG_IPV6_ROUTER_PREF unsigned long last_probe; #endif + + struct rt6_info * __percpu *rt6i_pcpu; }; struct fib6_info { @@ -156,7 +158,6 @@ struct fib6_info { struct rt6key fib6_src; struct rt6key fib6_prefsrc; - struct rt6_info * __percpu *rt6i_pcpu; struct rt6_exception_bucket __rcu *rt6i_exception_bucket; u32 fib6_metric; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f96d1de79509d..4bc35dd02b568 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6341,16 +6341,16 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) list_for_each_entry(ifa, &idev->addr_list, if_list) { spin_lock(&ifa->lock); if (ifa->rt) { - struct fib6_info *rt = ifa->rt; + struct fib6_nh *nh = &ifa->rt->fib6_nh; int cpu; rcu_read_lock(); ifa->rt->dst_nopolicy = val ? true : false; - if (rt->rt6i_pcpu) { + if (nh->rt6i_pcpu) { for_each_possible_cpu(cpu) { struct rt6_info **rtp; - rtp = per_cpu_ptr(rt->rt6i_pcpu, cpu); + rtp = per_cpu_ptr(nh->rt6i_pcpu, cpu); addrconf_set_nopolicy(*rtp, val); } } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 7958cf91895a5..274f1243866f2 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -155,12 +155,6 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) if (!f6i) return NULL; - f6i->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); - if (!f6i->rt6i_pcpu) { - kfree(f6i); - return NULL; - } - INIT_LIST_HEAD(&f6i->fib6_siblings); refcount_set(&f6i->fib6_ref, 1); @@ -177,25 +171,6 @@ void fib6_info_destroy_rcu(struct rcu_head *head) bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1); kfree(bucket); - if (f6i->rt6i_pcpu) { - int cpu; - - for_each_possible_cpu(cpu) { - struct rt6_info **ppcpu_rt; - struct rt6_info *pcpu_rt; - - ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu); - pcpu_rt = *ppcpu_rt; - if (pcpu_rt) { - dst_dev_put(&pcpu_rt->dst); - dst_release(&pcpu_rt->dst); - *ppcpu_rt = NULL; - } - } - - free_percpu(f6i->rt6i_pcpu); - } - fib6_nh_release(&f6i->fib6_nh); ip_fib_metrics_put(f6i->fib6_metrics); @@ -902,8 +877,12 @@ static struct fib6_node *fib6_add_1(struct net *net, static void fib6_drop_pcpu_from(struct fib6_info *f6i, const struct fib6_table *table) { + struct fib6_nh *fib6_nh = &f6i->fib6_nh; int cpu; + if (!fib6_nh->rt6i_pcpu) + return; + /* Make sure rt6_make_pcpu_route() wont add other percpu routes * while we are cleaning them here. */ @@ -917,7 +896,7 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, struct rt6_info **ppcpu_rt; struct rt6_info *pcpu_rt; - ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu); + ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); pcpu_rt = *ppcpu_rt; if (pcpu_rt) { struct fib6_info *from; @@ -933,8 +912,7 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, { struct fib6_table *table = rt->fib6_table; - if (rt->rt6i_pcpu) - fib6_drop_pcpu_from(rt, table); + fib6_drop_pcpu_from(rt, table); if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 5f0661c18624d..e404813c9844a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1270,7 +1270,7 @@ static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res) { struct rt6_info *pcpu_rt, **p; - p = this_cpu_ptr(res->f6i->rt6i_pcpu); + p = this_cpu_ptr(res->nh->rt6i_pcpu); pcpu_rt = *p; if (pcpu_rt) @@ -1291,7 +1291,7 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net, } dst_hold(&pcpu_rt->dst); - p = this_cpu_ptr(res->f6i->rt6i_pcpu); + p = this_cpu_ptr(res->nh->rt6i_pcpu); prev = cmpxchg(p, NULL, pcpu_rt); BUG_ON(prev); @@ -3068,6 +3068,12 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, !netif_carrier_ok(dev)) fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; + fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); + if (!fib6_nh->rt6i_pcpu) { + err = -ENOMEM; + goto out; + } + err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap, cfg->fc_encap_type, cfg, gfp_flags, extack); if (err) @@ -3092,6 +3098,25 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, void fib6_nh_release(struct fib6_nh *fib6_nh) { + if (fib6_nh->rt6i_pcpu) { + int cpu; + + for_each_possible_cpu(cpu) { + struct rt6_info **ppcpu_rt; + struct rt6_info *pcpu_rt; + + ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); + pcpu_rt = *ppcpu_rt; + if (pcpu_rt) { + dst_dev_put(&pcpu_rt->dst); + dst_release(&pcpu_rt->dst); + *ppcpu_rt = NULL; + } + } + + free_percpu(fib6_nh->rt6i_pcpu); + } + fib_nh_common_release(&fib6_nh->nh_common); } From 7d88d8b5570b6adba39289487138cff38f84336d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 22 May 2019 20:27:56 -0700 Subject: [PATCH 2/7] ipv6: Refactor fib6_drop_pcpu_from Move the existing pcpu walk in fib6_drop_pcpu_from to a new helper, __fib6_drop_pcpu_from, that can be invoked per fib6_nh with a reference to the from entries that need to be evicted. If the passed in 'from' is non-NULL then only entries associated with that fib6_info are removed (e.g., case where fib entry is deleted); if the 'from' is NULL are entries are flushed (e.g., fib6_nh is deleted). For fib6_info entries with builtin fib6_nh (ie., current code) there is no change in behavior. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 274f1243866f2..178a9c2d2d346 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -874,21 +874,15 @@ static struct fib6_node *fib6_add_1(struct net *net, return ln; } -static void fib6_drop_pcpu_from(struct fib6_info *f6i, - const struct fib6_table *table) +static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, + const struct fib6_info *match, + const struct fib6_table *table) { - struct fib6_nh *fib6_nh = &f6i->fib6_nh; int cpu; if (!fib6_nh->rt6i_pcpu) return; - /* Make sure rt6_make_pcpu_route() wont add other percpu routes - * while we are cleaning them here. - */ - f6i->fib6_destroying = 1; - mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ - /* release the reference to this fib entry from * all of its cached pcpu routes */ @@ -898,7 +892,13 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); pcpu_rt = *ppcpu_rt; - if (pcpu_rt) { + + /* only dropping the 'from' reference if the cached route + * is using 'match'. The cached pcpu_rt->from only changes + * from a fib6_info to NULL (ip6_dst_destroy); it can never + * change from one fib6_info reference to another + */ + if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) { struct fib6_info *from; from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL); @@ -907,6 +907,21 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, } } +static void fib6_drop_pcpu_from(struct fib6_info *f6i, + const struct fib6_table *table) +{ + struct fib6_nh *fib6_nh; + + /* Make sure rt6_make_pcpu_route() wont add other percpu routes + * while we are cleaning them here. + */ + f6i->fib6_destroying = 1; + mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ + + fib6_nh = &f6i->fib6_nh; + __fib6_drop_pcpu_from(fib6_nh, f6i, table); +} + static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { From c0b220cf7d80eb8a36ae9f12cae6df4577a6814e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 22 May 2019 20:27:57 -0700 Subject: [PATCH 3/7] ipv6: Refactor exception functions Before moving exception bucket from fib6_info to fib6_nh, refactor rt6_flush_exceptions, rt6_remove_exception_rt, rt6_mtu_change_route, and rt6_update_exception_stamp_rt. In all 3 cases, move the primary logic into a new helper that starts with fib6_nh_. The latter 3 functions still take a fib6_info; this will be changed to fib6_nh in the next patch. In the case of rt6_mtu_change_route, move the fib6_metric_locked out as a standalone check - no need to call the new function if the fib entry has the mtu locked. Also, add fib6_info to rt6_mtu_change_arg as a way of passing the fib entry to the new helper. No functional change intended. The goal here is to make the next patch easier to review by moving existing lookup logic for each to new helpers. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 134 ++++++++++++++++++++++++++++++----------------- 1 file changed, 86 insertions(+), 48 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e404813c9844a..8bfaa7349e10e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1542,7 +1542,7 @@ static int rt6_insert_exception(struct rt6_info *nrt, return err; } -void rt6_flush_exceptions(struct fib6_info *rt) +static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; @@ -1551,9 +1551,9 @@ void rt6_flush_exceptions(struct fib6_info *rt) spin_lock_bh(&rt6_exception_lock); /* Prevent rt6_insert_exception() to recreate the bucket list */ - rt->exception_bucket_flushed = 1; + from->exception_bucket_flushed = 1; - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + bucket = rcu_dereference_protected(from->rt6i_exception_bucket, lockdep_is_held(&rt6_exception_lock)); if (!bucket) goto out; @@ -1569,6 +1569,11 @@ void rt6_flush_exceptions(struct fib6_info *rt) spin_unlock_bh(&rt6_exception_lock); } +void rt6_flush_exceptions(struct fib6_info *f6i) +{ + fib6_nh_flush_exceptions(&f6i->fib6_nh, f6i); +} + /* Find cached rt in the hash table inside passed in rt * Caller has to hold rcu_read_lock() */ @@ -1615,19 +1620,14 @@ static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, } /* Remove the passed in cached rt from the hash table that contains it */ -static int rt6_remove_exception_rt(struct rt6_info *rt) +static int fib6_nh_remove_exception(const struct fib6_info *from, int plen, + const struct rt6_info *rt) { + const struct in6_addr *src_key = NULL; struct rt6_exception_bucket *bucket; - struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; - struct fib6_info *from; int err; - from = rcu_dereference(rt->from); - if (!from || - !(rt->rt6i_flags & RTF_CACHE)) - return -EINVAL; - if (!rcu_access_pointer(from->rt6i_exception_bucket)) return -ENOENT; @@ -1635,13 +1635,12 @@ static int rt6_remove_exception_rt(struct rt6_info *rt) bucket = rcu_dereference_protected(from->rt6i_exception_bucket, lockdep_is_held(&rt6_exception_lock)); #ifdef CONFIG_IPV6_SUBTREES - /* rt6i_src.plen != 0 indicates 'from' is in subtree - * and exception table is indexed by a hash of - * both rt6i_dst and rt6i_src. + /* plen != 0 indicates 'from' is in subtree and exception + * table is indexed by a hash of both rt6i_dst and rt6i_src. * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ - if (from->fib6_src.plen) + if (plen) src_key = &rt->rt6i_src.addr; #endif rt6_ex = __rt6_find_exception_spinlock(&bucket, @@ -1658,31 +1657,37 @@ static int rt6_remove_exception_rt(struct rt6_info *rt) return err; } +static int rt6_remove_exception_rt(struct rt6_info *rt) +{ + struct fib6_info *from; + + from = rcu_dereference(rt->from); + if (!from || + !(rt->rt6i_flags & RTF_CACHE)) + return -EINVAL; + + return fib6_nh_remove_exception(from, from->fib6_src.plen, rt); +} + /* Find rt6_ex which contains the passed in rt cache and * refresh its stamp */ -static void rt6_update_exception_stamp_rt(struct rt6_info *rt) +static void fib6_nh_update_exception(const struct fib6_info *from, int plen, + const struct rt6_info *rt) { + const struct in6_addr *src_key = NULL; struct rt6_exception_bucket *bucket; - struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; - struct fib6_info *from; - - rcu_read_lock(); - from = rcu_dereference(rt->from); - if (!from || !(rt->rt6i_flags & RTF_CACHE)) - goto unlock; bucket = rcu_dereference(from->rt6i_exception_bucket); #ifdef CONFIG_IPV6_SUBTREES - /* rt6i_src.plen != 0 indicates 'from' is in subtree - * and exception table is indexed by a hash of - * both rt6i_dst and rt6i_src. + /* plen != 0 indicates 'from' is in subtree and exception + * table is indexed by a hash of both rt6i_dst and rt6i_src. * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ - if (from->fib6_src.plen) + if (plen) src_key = &rt->rt6i_src.addr; #endif rt6_ex = __rt6_find_exception_rcu(&bucket, @@ -1690,7 +1695,19 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) src_key); if (rt6_ex) rt6_ex->stamp = jiffies; +} + +static void rt6_update_exception_stamp_rt(struct rt6_info *rt) +{ + struct fib6_info *from; + rcu_read_lock(); + + from = rcu_dereference(rt->from); + if (!from || !(rt->rt6i_flags & RTF_CACHE)) + goto unlock; + + fib6_nh_update_exception(from, from->fib6_src.plen, rt); unlock: rcu_read_unlock(); } @@ -1827,9 +1844,9 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, gc_args->more++; } -void rt6_age_exceptions(struct fib6_info *rt, - struct fib6_gc_args *gc_args, - unsigned long now) +static void fib6_nh_age_exceptions(struct fib6_info *rt, + struct fib6_gc_args *gc_args, + unsigned long now) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; @@ -1858,6 +1875,13 @@ void rt6_age_exceptions(struct fib6_info *rt, rcu_read_unlock_bh(); } +void rt6_age_exceptions(struct fib6_info *rt, + struct fib6_gc_args *gc_args, + unsigned long now) +{ + fib6_nh_age_exceptions(rt, gc_args, now); +} + /* must be called with rcu lock held */ int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, struct fib6_result *res, int strict) @@ -4198,9 +4222,36 @@ void rt6_disable_ip(struct net_device *dev, unsigned long event) struct rt6_mtu_change_arg { struct net_device *dev; unsigned int mtu; + struct fib6_info *f6i; }; -static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) +static int fib6_nh_mtu_change(struct fib6_info *f6i, void *_arg) +{ + struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg; + struct fib6_nh *nh = &f6i->fib6_nh; + + /* For administrative MTU increase, there is no way to discover + * IPv6 PMTU increase, so PMTU increase should be updated here. + * Since RFC 1981 doesn't include administrative MTU increase + * update PMTU increase is a MUST. (i.e. jumbo frame) + */ + if (nh->fib_nh_dev == arg->dev) { + struct inet6_dev *idev = __in6_dev_get(arg->dev); + u32 mtu = f6i->fib6_pmtu; + + if (mtu >= arg->mtu || + (mtu < arg->mtu && mtu == idev->cnf.mtu6)) + fib6_metric_set(f6i, RTAX_MTU, arg->mtu); + + spin_lock_bh(&rt6_exception_lock); + rt6_exceptions_update_pmtu(idev, f6i, arg->mtu); + spin_unlock_bh(&rt6_exception_lock); + } + + return 0; +} + +static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg) { struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; struct inet6_dev *idev; @@ -4215,24 +4266,11 @@ static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) if (!idev) return 0; - /* For administrative MTU increase, there is no way to discover - IPv6 PMTU increase, so PMTU increase should be updated here. - Since RFC 1981 doesn't include administrative MTU increase - update PMTU increase is a MUST. (i.e. jumbo frame) - */ - if (rt->fib6_nh.fib_nh_dev == arg->dev && - !fib6_metric_locked(rt, RTAX_MTU)) { - u32 mtu = rt->fib6_pmtu; - - if (mtu >= arg->mtu || - (mtu < arg->mtu && mtu == idev->cnf.mtu6)) - fib6_metric_set(rt, RTAX_MTU, arg->mtu); + if (fib6_metric_locked(f6i, RTAX_MTU)) + return 0; - spin_lock_bh(&rt6_exception_lock); - rt6_exceptions_update_pmtu(idev, rt, arg->mtu); - spin_unlock_bh(&rt6_exception_lock); - } - return 0; + arg->f6i = f6i; + return fib6_nh_mtu_change(f6i, arg); } void rt6_mtu_change(struct net_device *dev, unsigned int mtu) From cc5c073a693fa6ed7a207b0436114f68cce72434 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 22 May 2019 20:27:58 -0700 Subject: [PATCH 4/7] ipv6: Move exception bucket to fib6_nh Similar to the pcpu routes exceptions are really per nexthop, so move rt6i_exception_bucket from fib6_info to fib6_nh. To avoid additional increases to the size of fib6_nh for a 1-bit flag, use the lowest bit in the allocated memory pointer for the flushed flag. Add helpers for retrieving the bucket pointer to mask off the flag. The cleanup of the exception bucket is moved to fib6_nh_release. fib6_nh_flush_exceptions can now be called from 2 contexts: 1. deleting a fib entry 2. deleting a fib6_nh For 1., fib6_nh_flush_exceptions is called for a specific fib6_info that is getting deleted. All exceptions in the cache using the entry are deleted. For 2, the fib6_nh itself is getting destroyed so fib6_nh_flush_exceptions is called for a NULL fib6_info which means flush all entries. The pmtu.sh selftest exercises the affected code paths - from creating exceptions to cleaning them up on device delete. All tests pass without any rcu locking or memleak warnings. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 8 +- net/ipv6/ip6_fib.c | 6 -- net/ipv6/route.c | 185 ++++++++++++++++++++++++++++-------------- 3 files changed, 126 insertions(+), 73 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 38e87ef81b7ee..6b4852cf2fc24 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -133,6 +133,7 @@ struct fib6_nh { #endif struct rt6_info * __percpu *rt6i_pcpu; + struct rt6_exception_bucket __rcu *rt6i_exception_bucket; }; struct fib6_info { @@ -158,18 +159,15 @@ struct fib6_info { struct rt6key fib6_src; struct rt6key fib6_prefsrc; - struct rt6_exception_bucket __rcu *rt6i_exception_bucket; - u32 fib6_metric; u8 fib6_protocol; u8 fib6_type; - u8 exception_bucket_flushed:1, - should_flush:1, + u8 should_flush:1, dst_nocount:1, dst_nopolicy:1, dst_host:1, fib6_destroying:1, - unused:2; + unused:3; struct fib6_nh fib6_nh; struct rcu_head rcu; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 178a9c2d2d346..87ac82f850d29 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -164,17 +164,11 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) void fib6_info_destroy_rcu(struct rcu_head *head) { struct fib6_info *f6i = container_of(head, struct fib6_info, rcu); - struct rt6_exception_bucket *bucket; WARN_ON(f6i->fib6_node); - bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1); - kfree(bucket); - fib6_nh_release(&f6i->fib6_nh); - ip_fib_metrics_put(f6i->fib6_metrics); - kfree(f6i); } EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8bfaa7349e10e..b01118a3c42e8 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1461,25 +1461,74 @@ static unsigned int fib6_mtu(const struct fib6_result *res) return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); } +#define FIB6_EXCEPTION_BUCKET_FLUSHED 0x1UL + +/* used when the flushed bit is not relevant, only access to the bucket + * (ie., all bucket users except rt6_insert_exception); + * + * called under rcu lock; sometimes called with rt6_exception_lock held + */ +static +struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh, + spinlock_t *lock) +{ + struct rt6_exception_bucket *bucket; + + if (lock) + bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, + lockdep_is_held(lock)); + else + bucket = rcu_dereference(nh->rt6i_exception_bucket); + + /* remove bucket flushed bit if set */ + if (bucket) { + unsigned long p = (unsigned long)bucket; + + p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED; + bucket = (struct rt6_exception_bucket *)p; + } + + return bucket; +} + +static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket) +{ + unsigned long p = (unsigned long)bucket; + + return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED); +} + +/* called with rt6_exception_lock held */ +static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh, + spinlock_t *lock) +{ + struct rt6_exception_bucket *bucket; + unsigned long p; + + bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, + lockdep_is_held(lock)); + + p = (unsigned long)bucket; + p |= FIB6_EXCEPTION_BUCKET_FLUSHED; + bucket = (struct rt6_exception_bucket *)p; + rcu_assign_pointer(nh->rt6i_exception_bucket, bucket); +} + static int rt6_insert_exception(struct rt6_info *nrt, const struct fib6_result *res) { struct net *net = dev_net(nrt->dst.dev); struct rt6_exception_bucket *bucket; + struct fib6_info *f6i = res->f6i; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; - struct fib6_info *f6i = res->f6i; + struct fib6_nh *nh = res->nh; int err = 0; spin_lock_bh(&rt6_exception_lock); - if (f6i->exception_bucket_flushed) { - err = -EINVAL; - goto out; - } - - bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); + bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); if (!bucket) { bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), GFP_ATOMIC); @@ -1487,7 +1536,10 @@ static int rt6_insert_exception(struct rt6_info *nrt, err = -ENOMEM; goto out; } - rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket); + rcu_assign_pointer(nh->rt6i_exception_bucket, bucket); + } else if (fib6_nh_excptn_bucket_flushed(bucket)) { + err = -EINVAL; + goto out; } #ifdef CONFIG_IPV6_SUBTREES @@ -1550,21 +1602,24 @@ static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from) int i; spin_lock_bh(&rt6_exception_lock); - /* Prevent rt6_insert_exception() to recreate the bucket list */ - from->exception_bucket_flushed = 1; - bucket = rcu_dereference_protected(from->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (!bucket) goto out; + /* Prevent rt6_insert_exception() to recreate the bucket list */ + if (!from) + fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock); + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { - hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) - rt6_remove_exception(bucket, rt6_ex); - WARN_ON_ONCE(bucket->depth); + hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) { + if (!from || + rcu_access_pointer(rt6_ex->rt6i->from) == from) + rt6_remove_exception(bucket, rt6_ex); + } + WARN_ON_ONCE(!from && bucket->depth); bucket++; } - out: spin_unlock_bh(&rt6_exception_lock); } @@ -1602,7 +1657,7 @@ static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, src_key = saddr; find_ex: #endif - bucket = rcu_dereference(res->f6i->rt6i_exception_bucket); + bucket = fib6_nh_get_excptn_bucket(res->nh, NULL); rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) @@ -1620,7 +1675,7 @@ static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, } /* Remove the passed in cached rt from the hash table that contains it */ -static int fib6_nh_remove_exception(const struct fib6_info *from, int plen, +static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen, const struct rt6_info *rt) { const struct in6_addr *src_key = NULL; @@ -1628,15 +1683,16 @@ static int fib6_nh_remove_exception(const struct fib6_info *from, int plen, struct rt6_exception *rt6_ex; int err; - if (!rcu_access_pointer(from->rt6i_exception_bucket)) + if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return -ENOENT; spin_lock_bh(&rt6_exception_lock); - bucket = rcu_dereference_protected(from->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); + #ifdef CONFIG_IPV6_SUBTREES - /* plen != 0 indicates 'from' is in subtree and exception - * table is indexed by a hash of both rt6i_dst and rt6i_src. + /* rt6i_src.plen != 0 indicates 'from' is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ @@ -1662,37 +1718,35 @@ static int rt6_remove_exception_rt(struct rt6_info *rt) struct fib6_info *from; from = rcu_dereference(rt->from); - if (!from || - !(rt->rt6i_flags & RTF_CACHE)) + if (!from || !(rt->rt6i_flags & RTF_CACHE)) return -EINVAL; - return fib6_nh_remove_exception(from, from->fib6_src.plen, rt); + return fib6_nh_remove_exception(&from->fib6_nh, + from->fib6_src.plen, rt); } /* Find rt6_ex which contains the passed in rt cache and * refresh its stamp */ -static void fib6_nh_update_exception(const struct fib6_info *from, int plen, +static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen, const struct rt6_info *rt) { const struct in6_addr *src_key = NULL; struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; - bucket = rcu_dereference(from->rt6i_exception_bucket); - + bucket = fib6_nh_get_excptn_bucket(nh, NULL); #ifdef CONFIG_IPV6_SUBTREES - /* plen != 0 indicates 'from' is in subtree and exception - * table is indexed by a hash of both rt6i_dst and rt6i_src. + /* rt6i_src.plen != 0 indicates 'from' is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ if (plen) src_key = &rt->rt6i_src.addr; #endif - rt6_ex = __rt6_find_exception_rcu(&bucket, - &rt->rt6i_dst.addr, - src_key); + rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key); if (rt6_ex) rt6_ex->stamp = jiffies; } @@ -1707,7 +1761,7 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) if (!from || !(rt->rt6i_flags & RTF_CACHE)) goto unlock; - fib6_nh_update_exception(from, from->fib6_src.plen, rt); + fib6_nh_update_exception(&from->fib6_nh, from->fib6_src.plen, rt); unlock: rcu_read_unlock(); } @@ -1735,15 +1789,13 @@ static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, } static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, - struct fib6_info *rt, int mtu) + const struct fib6_nh *nh, int mtu) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; int i; - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); - + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (!bucket) return; @@ -1765,21 +1817,19 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) -static void rt6_exceptions_clean_tohost(struct fib6_info *rt, - struct in6_addr *gateway) +static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh, + const struct in6_addr *gateway) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct hlist_node *tmp; int i; - if (!rcu_access_pointer(rt->rt6i_exception_bucket)) + if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return; spin_lock_bh(&rt6_exception_lock); - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); - + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (bucket) { for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, @@ -1844,7 +1894,7 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, gc_args->more++; } -static void fib6_nh_age_exceptions(struct fib6_info *rt, +static void fib6_nh_age_exceptions(const struct fib6_nh *nh, struct fib6_gc_args *gc_args, unsigned long now) { @@ -1853,14 +1903,12 @@ static void fib6_nh_age_exceptions(struct fib6_info *rt, struct hlist_node *tmp; int i; - if (!rcu_access_pointer(rt->rt6i_exception_bucket)) + if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return; rcu_read_lock_bh(); spin_lock(&rt6_exception_lock); - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); - + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (bucket) { for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, @@ -1875,11 +1923,11 @@ static void fib6_nh_age_exceptions(struct fib6_info *rt, rcu_read_unlock_bh(); } -void rt6_age_exceptions(struct fib6_info *rt, +void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args, unsigned long now) { - fib6_nh_age_exceptions(rt, gc_args, now); + fib6_nh_age_exceptions(&f6i->fib6_nh, gc_args, now); } /* must be called with rcu lock held */ @@ -3122,6 +3170,19 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, void fib6_nh_release(struct fib6_nh *fib6_nh) { + struct rt6_exception_bucket *bucket; + + rcu_read_lock(); + + fib6_nh_flush_exceptions(fib6_nh, NULL); + bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL); + if (bucket) { + rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL); + kfree(bucket); + } + + rcu_read_unlock(); + if (fib6_nh->rt6i_pcpu) { int cpu; @@ -3411,9 +3472,11 @@ static int ip6_route_del(struct fib6_config *cfg, for_each_fib6_node_rt_rcu(fn) { struct fib6_nh *nh; + nh = &rt->fib6_nh; if (cfg->fc_flags & RTF_CACHE) { struct fib6_result res = { .f6i = rt, + .nh = nh, }; int rc; @@ -3430,7 +3493,6 @@ static int ip6_route_del(struct fib6_config *cfg, continue; } - nh = &rt->fib6_nh; if (cfg->fc_ifindex && (!nh->fib_nh_dev || nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) @@ -3947,18 +4009,17 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) static int fib6_clean_tohost(struct fib6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; + struct fib6_nh *nh = &rt->fib6_nh; if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && - rt->fib6_nh.fib_nh_gw_family && - ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) { + nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6)) return -1; - } /* Further clean up cached routes in exception table. * This is needed because cached route may have a different * gateway than its 'parent' in the case of an ip redirect. */ - rt6_exceptions_clean_tohost(rt, gateway); + fib6_nh_exceptions_clean_tohost(nh, gateway); return 0; } @@ -4225,10 +4286,10 @@ struct rt6_mtu_change_arg { struct fib6_info *f6i; }; -static int fib6_nh_mtu_change(struct fib6_info *f6i, void *_arg) +static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg) { struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg; - struct fib6_nh *nh = &f6i->fib6_nh; + struct fib6_info *f6i = arg->f6i; /* For administrative MTU increase, there is no way to discover * IPv6 PMTU increase, so PMTU increase should be updated here. @@ -4244,7 +4305,7 @@ static int fib6_nh_mtu_change(struct fib6_info *f6i, void *_arg) fib6_metric_set(f6i, RTAX_MTU, arg->mtu); spin_lock_bh(&rt6_exception_lock); - rt6_exceptions_update_pmtu(idev, f6i, arg->mtu); + rt6_exceptions_update_pmtu(idev, nh, arg->mtu); spin_unlock_bh(&rt6_exception_lock); } @@ -4270,7 +4331,7 @@ static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg) return 0; arg->f6i = f6i; - return fib6_nh_mtu_change(f6i, arg); + return fib6_nh_mtu_change(&f6i->fib6_nh, arg); } void rt6_mtu_change(struct net_device *dev, unsigned int mtu) From 1cf844c747d5424abe76f7b599c00b1ac17d3fce Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 22 May 2019 20:27:59 -0700 Subject: [PATCH 5/7] ipv6: Make fib6_nh optional at the end of fib6_info Move fib6_nh to the end of fib6_info and make it an array of size 0. Pass a flag to fib6_info_alloc indicating if the allocation needs to add space for a fib6_nh. The current code path always has a fib6_nh allocated with a fib6_info; with nexthop objects they will be separate. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- .../ethernet/mellanox/mlxsw/spectrum_router.c | 31 ++-- include/net/ip6_fib.h | 6 +- include/net/ip6_route.h | 4 +- net/ipv6/addrconf.c | 6 +- net/ipv6/ip6_fib.c | 18 ++- net/ipv6/ndisc.c | 8 +- net/ipv6/route.c | 134 +++++++++--------- 7 files changed, 106 insertions(+), 101 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 1cda8a248b12e..0ec52be7cc333 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -2886,7 +2886,7 @@ mlxsw_sp_nexthop6_group_cmp(const struct mlxsw_sp_nexthop_group *nh_grp, return false; list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) { - struct fib6_nh *fib6_nh = &mlxsw_sp_rt6->rt->fib6_nh; + struct fib6_nh *fib6_nh = mlxsw_sp_rt6->rt->fib6_nh; struct in6_addr *gw; int ifindex, weight; @@ -2958,7 +2958,7 @@ mlxsw_sp_nexthop6_group_hash(struct mlxsw_sp_fib6_entry *fib6_entry, u32 seed) struct net_device *dev; list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) { - dev = mlxsw_sp_rt6->rt->fib6_nh.fib_nh_dev; + dev = mlxsw_sp_rt6->rt->fib6_nh->fib_nh_dev; val ^= dev->ifindex; } @@ -3960,9 +3960,9 @@ mlxsw_sp_rt6_nexthop(struct mlxsw_sp_nexthop_group *nh_grp, struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i]; struct fib6_info *rt = mlxsw_sp_rt6->rt; - if (nh->rif && nh->rif->dev == rt->fib6_nh.fib_nh_dev && + if (nh->rif && nh->rif->dev == rt->fib6_nh->fib_nh_dev && ipv6_addr_equal((const struct in6_addr *) &nh->gw_addr, - &rt->fib6_nh.fib_nh_gw6)) + &rt->fib6_nh->fib_nh_gw6)) return nh; continue; } @@ -4022,13 +4022,13 @@ mlxsw_sp_fib6_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry) if (fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_LOCAL || fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_BLACKHOLE) { list_first_entry(&fib6_entry->rt6_list, struct mlxsw_sp_rt6, - list)->rt->fib6_nh.fib_nh_flags |= RTNH_F_OFFLOAD; + list)->rt->fib6_nh->fib_nh_flags |= RTNH_F_OFFLOAD; return; } list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) { struct mlxsw_sp_nexthop_group *nh_grp = fib_entry->nh_group; - struct fib6_nh *fib6_nh = &mlxsw_sp_rt6->rt->fib6_nh; + struct fib6_nh *fib6_nh = mlxsw_sp_rt6->rt->fib6_nh; struct mlxsw_sp_nexthop *nh; nh = mlxsw_sp_rt6_nexthop(nh_grp, mlxsw_sp_rt6); @@ -4050,7 +4050,7 @@ mlxsw_sp_fib6_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry) list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) { struct fib6_info *rt = mlxsw_sp_rt6->rt; - rt->fib6_nh.fib_nh_flags &= ~RTNH_F_OFFLOAD; + rt->fib6_nh->fib_nh_flags &= ~RTNH_F_OFFLOAD; } } @@ -4928,7 +4928,8 @@ static void mlxsw_sp_rt6_destroy(struct mlxsw_sp_rt6 *mlxsw_sp_rt6) static bool mlxsw_sp_fib6_rt_can_mp(const struct fib6_info *rt) { /* RTF_CACHE routes are ignored */ - return !(rt->fib6_flags & RTF_ADDRCONF) && rt->fib6_nh.fib_nh_gw_family; + return !(rt->fib6_flags & RTF_ADDRCONF) && + rt->fib6_nh->fib_nh_gw_family; } static struct fib6_info * @@ -4987,8 +4988,8 @@ static bool mlxsw_sp_nexthop6_ipip_type(const struct mlxsw_sp *mlxsw_sp, const struct fib6_info *rt, enum mlxsw_sp_ipip_type *ret) { - return rt->fib6_nh.fib_nh_dev && - mlxsw_sp_netdev_ipip_type(mlxsw_sp, rt->fib6_nh.fib_nh_dev, ret); + return rt->fib6_nh->fib_nh_dev && + mlxsw_sp_netdev_ipip_type(mlxsw_sp, rt->fib6_nh->fib_nh_dev, ret); } static int mlxsw_sp_nexthop6_type_init(struct mlxsw_sp *mlxsw_sp, @@ -4998,7 +4999,7 @@ static int mlxsw_sp_nexthop6_type_init(struct mlxsw_sp *mlxsw_sp, { const struct mlxsw_sp_ipip_ops *ipip_ops; struct mlxsw_sp_ipip_entry *ipip_entry; - struct net_device *dev = rt->fib6_nh.fib_nh_dev; + struct net_device *dev = rt->fib6_nh->fib_nh_dev; struct mlxsw_sp_rif *rif; int err; @@ -5041,11 +5042,11 @@ static int mlxsw_sp_nexthop6_init(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_nexthop *nh, const struct fib6_info *rt) { - struct net_device *dev = rt->fib6_nh.fib_nh_dev; + struct net_device *dev = rt->fib6_nh->fib_nh_dev; nh->nh_grp = nh_grp; - nh->nh_weight = rt->fib6_nh.fib_nh_weight; - memcpy(&nh->gw_addr, &rt->fib6_nh.fib_nh_gw6, sizeof(nh->gw_addr)); + nh->nh_weight = rt->fib6_nh->fib_nh_weight; + memcpy(&nh->gw_addr, &rt->fib6_nh->fib_nh_gw6, sizeof(nh->gw_addr)); mlxsw_sp_nexthop_counter_alloc(mlxsw_sp, nh); list_add_tail(&nh->router_list_node, &mlxsw_sp->router->nexthop_list); @@ -5068,7 +5069,7 @@ static void mlxsw_sp_nexthop6_fini(struct mlxsw_sp *mlxsw_sp, static bool mlxsw_sp_rt6_is_gateway(const struct mlxsw_sp *mlxsw_sp, const struct fib6_info *rt) { - return rt->fib6_nh.fib_nh_gw_family || + return rt->fib6_nh->fib_nh_gw_family || mlxsw_sp_nexthop6_ipip_type(mlxsw_sp, rt, NULL); } diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 6b4852cf2fc24..ebe5d65f97e09 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -169,8 +169,8 @@ struct fib6_info { fib6_destroying:1, unused:3; - struct fib6_nh fib6_nh; struct rcu_head rcu; + struct fib6_nh fib6_nh[0]; }; struct rt6_info { @@ -280,7 +280,7 @@ static inline void ip6_rt_put(struct rt6_info *rt) dst_release(&rt->dst); } -struct fib6_info *fib6_info_alloc(gfp_t gfp_flags); +struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh); void fib6_info_destroy_rcu(struct rcu_head *head); static inline void fib6_info_hold(struct fib6_info *f6i) @@ -443,7 +443,7 @@ void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr) static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i) { - return f6i->fib6_nh.fib_nh_dev; + return f6i->fib6_nh->fib_nh_dev; } int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 4790beaa86e01..a6ce6ea856b94 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -70,7 +70,7 @@ static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i) { /* the RTF_ADDRCONF flag filters out RA's */ return !(f6i->fib6_flags & RTF_ADDRCONF) && - f6i->fib6_nh.fib_nh_gw_family; + f6i->fib6_nh->fib_nh_gw_family; } void ip6_route_input(struct sk_buff *skb); @@ -275,7 +275,7 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt, static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b) { - struct fib6_nh *nha = &a->fib6_nh, *nhb = &b->fib6_nh; + struct fib6_nh *nha = a->fib6_nh, *nhb = b->fib6_nh; return nha->fib_nh_dev == nhb->fib_nh_dev && ipv6_addr_equal(&nha->fib_nh_gw6, &nhb->fib_nh_gw6) && diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 4bc35dd02b568..683613e7355bc 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2421,9 +2421,9 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, goto out; for_each_fib6_node_rt_rcu(fn) { - if (rt->fib6_nh.fib_nh_dev->ifindex != dev->ifindex) + if (rt->fib6_nh->fib_nh_dev->ifindex != dev->ifindex) continue; - if (no_gw && rt->fib6_nh.fib_nh_gw_family) + if (no_gw && rt->fib6_nh->fib_nh_gw_family) continue; if ((rt->fib6_flags & flags) != flags) continue; @@ -6341,7 +6341,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) list_for_each_entry(ifa, &idev->addr_list, if_list) { spin_lock(&ifa->lock); if (ifa->rt) { - struct fib6_nh *nh = &ifa->rt->fib6_nh; + struct fib6_nh *nh = ifa->rt->fib6_nh; int cpu; rcu_read_lock(); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 87ac82f850d29..cdfb8500ccae3 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -147,11 +147,15 @@ static __be32 addr_bit_set(const void *token, int fn_bit) addr[fn_bit >> 5]; } -struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) +struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) { struct fib6_info *f6i; + size_t sz = sizeof(*f6i); - f6i = kzalloc(sizeof(*f6i), gfp_flags); + if (with_fib6_nh) + sz += sizeof(struct fib6_nh); + + f6i = kzalloc(sz, gfp_flags); if (!f6i) return NULL; @@ -167,7 +171,7 @@ void fib6_info_destroy_rcu(struct rcu_head *head) WARN_ON(f6i->fib6_node); - fib6_nh_release(&f6i->fib6_nh); + fib6_nh_release(f6i->fib6_nh); ip_fib_metrics_put(f6i->fib6_metrics); kfree(f6i); } @@ -912,7 +916,7 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, f6i->fib6_destroying = 1; mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ - fib6_nh = &f6i->fib6_nh; + fib6_nh = f6i->fib6_nh; __fib6_drop_pcpu_from(fib6_nh, f6i, table); } @@ -2301,14 +2305,14 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif - if (rt->fib6_nh.fib_nh_gw_family) { + if (rt->fib6_nh->fib_nh_gw_family) { flags |= RTF_GATEWAY; - seq_printf(seq, "%pi6", &rt->fib6_nh.fib_nh_gw6); + seq_printf(seq, "%pi6", &rt->fib6_nh->fib_nh_gw6); } else { seq_puts(seq, "00000000000000000000000000000000"); } - dev = rt->fib6_nh.fib_nh_dev; + dev = rt->fib6_nh->fib_nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", rt->fib6_metric, refcount_read(&rt->fib6_ref), 0, flags, dev ? dev->name : ""); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 4c8e2ea8bf193..f874dde1ee85b 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1293,8 +1293,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev); if (rt) { - neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6, - rt->fib6_nh.fib_nh_dev, NULL, + neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6, + rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, @@ -1323,8 +1323,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } - neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6, - rt->fib6_nh.fib_nh_dev, NULL, + neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6, + rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b01118a3c42e8..f248ce8071168 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -441,12 +441,12 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, if (!fl6->mp_hash) fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); - if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound)) + if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound)) goto out; list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, fib6_siblings) { - const struct fib6_nh *nh = &sibling->fib6_nh; + const struct fib6_nh *nh = sibling->fib6_nh; int nh_upper_bound; nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound); @@ -460,7 +460,7 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, out: res->f6i = match; - res->nh = &match->fib6_nh; + res->nh = match->fib6_nh; } /* @@ -496,13 +496,13 @@ static void rt6_device_match(struct net *net, struct fib6_result *res, struct fib6_nh *nh; if (!oif && ipv6_addr_any(saddr)) { - nh = &f6i->fib6_nh; + nh = f6i->fib6_nh; if (!(nh->fib_nh_flags & RTNH_F_DEAD)) goto out; } for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) { - nh = &spf6i->fib6_nh; + nh = spf6i->fib6_nh; if (__rt6_device_match(net, nh, saddr, oif, flags)) { res->f6i = spf6i; goto out; @@ -511,14 +511,14 @@ static void rt6_device_match(struct net *net, struct fib6_result *res, if (oif && flags & RT6_LOOKUP_F_IFACE) { res->f6i = net->ipv6.fib6_null_entry; - nh = &res->f6i->fib6_nh; + nh = res->f6i->fib6_nh; goto out; } - nh = &f6i->fib6_nh; + nh = f6i->fib6_nh; if (nh->fib_nh_flags & RTNH_F_DEAD) { res->f6i = net->ipv6.fib6_null_entry; - nh = &res->f6i->fib6_nh; + nh = res->f6i->fib6_nh; } out: res->nh = nh; @@ -714,7 +714,7 @@ static void __find_rr_leaf(struct fib6_info *f6i_start, if (fib6_check_expired(f6i)) continue; - nh = &f6i->fib6_nh; + nh = f6i->fib6_nh; if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) { res->f6i = f6i; res->nh = nh; @@ -796,7 +796,7 @@ static void rt6_select(struct net *net, struct fib6_node *fn, int oif, out: if (!res->f6i) { res->f6i = net->ipv6.fib6_null_entry; - res->nh = &res->f6i->fib6_nh; + res->nh = res->f6i->fib6_nh; res->fib6_flags = res->f6i->fib6_flags; res->fib6_type = res->f6i->fib6_type; } @@ -1626,7 +1626,7 @@ static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from) void rt6_flush_exceptions(struct fib6_info *f6i) { - fib6_nh_flush_exceptions(&f6i->fib6_nh, f6i); + fib6_nh_flush_exceptions(f6i->fib6_nh, f6i); } /* Find cached rt in the hash table inside passed in rt @@ -1721,7 +1721,7 @@ static int rt6_remove_exception_rt(struct rt6_info *rt) if (!from || !(rt->rt6i_flags & RTF_CACHE)) return -EINVAL; - return fib6_nh_remove_exception(&from->fib6_nh, + return fib6_nh_remove_exception(from->fib6_nh, from->fib6_src.plen, rt); } @@ -1761,7 +1761,7 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) if (!from || !(rt->rt6i_flags & RTF_CACHE)) goto unlock; - fib6_nh_update_exception(&from->fib6_nh, from->fib6_src.plen, rt); + fib6_nh_update_exception(from->fib6_nh, from->fib6_src.plen, rt); unlock: rcu_read_unlock(); } @@ -1927,7 +1927,7 @@ void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args, unsigned long now) { - fib6_nh_age_exceptions(&f6i->fib6_nh, gc_args, now); + fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now); } /* must be called with rcu lock held */ @@ -2456,7 +2456,7 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, rcu_read_unlock(); return; } - res.nh = &res.f6i->fib6_nh; + res.nh = res.f6i->fib6_nh; res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; @@ -2599,7 +2599,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, restart: for_each_fib6_node_rt_rcu(fn) { res.f6i = rt; - res.nh = &rt->fib6_nh; + res.nh = rt->fib6_nh; if (fib6_check_expired(rt)) continue; @@ -2623,7 +2623,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, } res.f6i = rt; - res.nh = &rt->fib6_nh; + res.nh = rt->fib6_nh; out: if (ret) { ip6_hold_safe(net, &ret); @@ -3264,7 +3264,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; err = -ENOMEM; - rt = fib6_info_alloc(gfp_flags); + rt = fib6_info_alloc(gfp_flags, true); if (!rt) goto out; @@ -3304,7 +3304,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); rt->fib6_src.plen = cfg->fc_src_len; #endif - err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack); + err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack); if (err) goto out; @@ -3312,7 +3312,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, * they would result in kernel looping; promote them to reject routes */ addr_type = ipv6_addr_type(&cfg->fc_dst); - if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type)) + if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, addr_type)) rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; if (!ipv6_addr_any(&cfg->fc_prefsrc)) { @@ -3472,7 +3472,7 @@ static int ip6_route_del(struct fib6_config *cfg, for_each_fib6_node_rt_rcu(fn) { struct fib6_nh *nh; - nh = &rt->fib6_nh; + nh = rt->fib6_nh; if (cfg->fc_flags & RTF_CACHE) { struct fib6_result res = { .f6i = rt, @@ -3614,7 +3614,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu if (!res.f6i) goto out; - res.nh = &res.f6i->fib6_nh; + res.nh = res.f6i->fib6_nh; res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL); @@ -3666,12 +3666,12 @@ static struct fib6_info *rt6_get_route_info(struct net *net, goto out; for_each_fib6_node_rt_rcu(fn) { - if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex) + if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex) continue; if (!(rt->fib6_flags & RTF_ROUTEINFO) || - !rt->fib6_nh.fib_nh_gw_family) + !rt->fib6_nh->fib_nh_gw_family) continue; - if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr)) + if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr)) continue; if (!fib6_info_hold_safe(rt)) continue; @@ -3729,7 +3729,7 @@ struct fib6_info *rt6_get_dflt_router(struct net *net, rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { - struct fib6_nh *nh = &rt->fib6_nh; + struct fib6_nh *nh = rt->fib6_nh; if (dev == nh->fib_nh_dev && ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && @@ -3981,7 +3981,7 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) struct net *net = ((struct arg_dev_net_ip *)arg)->net; struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; - if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) && + if (((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) && rt != net->ipv6.fib6_null_entry && ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { spin_lock_bh(&rt6_exception_lock); @@ -4009,7 +4009,7 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) static int fib6_clean_tohost(struct fib6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; - struct fib6_nh *nh = &rt->fib6_nh; + struct fib6_nh *nh = rt->fib6_nh; if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6)) @@ -4059,9 +4059,9 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) static bool rt6_is_dead(const struct fib6_info *rt) { - if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD || - (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN && - ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev))) + if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD || + (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN && + ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev))) return true; return false; @@ -4073,11 +4073,11 @@ static int rt6_multipath_total_weight(const struct fib6_info *rt) int total = 0; if (!rt6_is_dead(rt)) - total += rt->fib6_nh.fib_nh_weight; + total += rt->fib6_nh->fib_nh_weight; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { if (!rt6_is_dead(iter)) - total += iter->fib6_nh.fib_nh_weight; + total += iter->fib6_nh->fib_nh_weight; } return total; @@ -4088,11 +4088,11 @@ static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) int upper_bound = -1; if (!rt6_is_dead(rt)) { - *weight += rt->fib6_nh.fib_nh_weight; + *weight += rt->fib6_nh->fib_nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, total) - 1; } - atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound); + atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound); } static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) @@ -4136,8 +4136,8 @@ static int fib6_ifup(struct fib6_info *rt, void *p_arg) struct net *net = dev_net(arg->dev); if (rt != net->ipv6.fib6_null_entry && - rt->fib6_nh.fib_nh_dev == arg->dev) { - rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags; + rt->fib6_nh->fib_nh_dev == arg->dev) { + rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags; fib6_update_sernum_upto_root(net, rt); rt6_multipath_rebalance(rt); } @@ -4165,10 +4165,10 @@ static bool rt6_multipath_uses_dev(const struct fib6_info *rt, { struct fib6_info *iter; - if (rt->fib6_nh.fib_nh_dev == dev) + if (rt->fib6_nh->fib_nh_dev == dev) return true; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if (iter->fib6_nh.fib_nh_dev == dev) + if (iter->fib6_nh->fib_nh_dev == dev) return true; return false; @@ -4189,12 +4189,12 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, struct fib6_info *iter; unsigned int dead = 0; - if (rt->fib6_nh.fib_nh_dev == down_dev || - rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD) + if (rt->fib6_nh->fib_nh_dev == down_dev || + rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD) dead++; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if (iter->fib6_nh.fib_nh_dev == down_dev || - iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD) + if (iter->fib6_nh->fib_nh_dev == down_dev || + iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD) dead++; return dead; @@ -4206,11 +4206,11 @@ static void rt6_multipath_nh_flags_set(struct fib6_info *rt, { struct fib6_info *iter; - if (rt->fib6_nh.fib_nh_dev == dev) - rt->fib6_nh.fib_nh_flags |= nh_flags; + if (rt->fib6_nh->fib_nh_dev == dev) + rt->fib6_nh->fib_nh_flags |= nh_flags; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if (iter->fib6_nh.fib_nh_dev == dev) - iter->fib6_nh.fib_nh_flags |= nh_flags; + if (iter->fib6_nh->fib_nh_dev == dev) + iter->fib6_nh->fib_nh_flags |= nh_flags; } /* called with write lock held for table with rt */ @@ -4225,12 +4225,12 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) switch (arg->event) { case NETDEV_UNREGISTER: - return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0; + return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; case NETDEV_DOWN: if (rt->should_flush) return -1; if (!rt->fib6_nsiblings) - return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0; + return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; if (rt6_multipath_uses_dev(rt, dev)) { unsigned int count; @@ -4246,10 +4246,10 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) } return -2; case NETDEV_CHANGE: - if (rt->fib6_nh.fib_nh_dev != dev || + if (rt->fib6_nh->fib_nh_dev != dev || rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) break; - rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN; + rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; rt6_multipath_rebalance(rt); break; } @@ -4331,7 +4331,7 @@ static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg) return 0; arg->f6i = f6i; - return fib6_nh_mtu_change(&f6i->fib6_nh, arg); + return fib6_nh_mtu_change(f6i->fib6_nh, arg); } void rt6_mtu_change(struct net_device *dev, unsigned int mtu) @@ -4611,7 +4611,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, goto cleanup; } - rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1; + rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; err = ip6_route_info_append(info->nl_net, &rt6_nh_list, rt, &r_cfg); @@ -4778,7 +4778,7 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt) nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ + NLA_ALIGN(sizeof(struct rtnexthop)) + nla_total_size(16) /* RTA_GATEWAY */ - + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws); + + lwtunnel_get_encap_size(rt->fib6_nh->fib_nh_lws); nexthop_len *= rt->fib6_nsiblings; } @@ -4796,7 +4796,7 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt) + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ + nla_total_size(1) /* RTA_PREF */ - + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws) + + lwtunnel_get_encap_size(rt->fib6_nh->fib_nh_lws) + nexthop_len; } @@ -4916,14 +4916,14 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, if (!mp) goto nla_put_failure; - if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common, - rt->fib6_nh.fib_nh_weight) < 0) + if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common, + rt->fib6_nh->fib_nh_weight) < 0) goto nla_put_failure; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { - if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common, - sibling->fib6_nh.fib_nh_weight) < 0) + if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, + sibling->fib6_nh->fib_nh_weight) < 0) goto nla_put_failure; } @@ -4931,7 +4931,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, } else { unsigned char nh_flags = 0; - if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common, + if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, &nh_flags, false) < 0) goto nla_put_failure; @@ -4961,7 +4961,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, static bool fib6_info_uses_dev(const struct fib6_info *f6i, const struct net_device *dev) { - if (f6i->fib6_nh.fib_nh_dev == dev) + if (f6i->fib6_nh->fib_nh_dev == dev) return true; if (f6i->fib6_nsiblings) { @@ -4969,7 +4969,7 @@ static bool fib6_info_uses_dev(const struct fib6_info *f6i, list_for_each_entry_safe(sibling, next_sibling, &f6i->fib6_siblings, fib6_siblings) { - if (sibling->fib6_nh.fib_nh_dev == dev) + if (sibling->fib6_nh->fib_nh_dev == dev) return true; } } @@ -5290,7 +5290,7 @@ static int ip6_route_dev_notify(struct notifier_block *this, return NOTIFY_OK; if (event == NETDEV_REGISTER) { - net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev; + net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev; net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -5484,11 +5484,11 @@ static int __net_init ip6_route_net_init(struct net *net) if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) goto out_ip6_dst_ops; - net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, - sizeof(*net->ipv6.fib6_null_entry), - GFP_KERNEL); + net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true); if (!net->ipv6.fib6_null_entry) goto out_ip6_dst_entries; + memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template, + sizeof(*net->ipv6.fib6_null_entry)); net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, sizeof(*net->ipv6.ip6_null_entry), @@ -5625,7 +5625,7 @@ void __init ip6_route_init_special_entries(void) /* Registering of the loopback is done before this portion of code, * the loopback reference in rt6_info will not be taken, do it * manually for init_net */ - init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev; + init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES From 0fa6efc54780412c3fbd6f6e61a5dfb4ca9bb59c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 22 May 2019 20:28:00 -0700 Subject: [PATCH 6/7] ipv6: Refactor ip6_route_del for cached routes Move the removal of cached routes to a helper, ip6_del_cached_rt, that can be invoked per nexthop. Rename the existig ip6_del_cached_rt to __ip6_del_cached_rt since it is called by ip6_del_cached_rt. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f248ce8071168..fdf5989268839 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3430,7 +3430,7 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) return err; } -static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) +static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) { int rc = -ESRCH; @@ -3446,10 +3446,25 @@ static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) return rc; } +static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt, + struct fib6_nh *nh) +{ + struct fib6_result res = { + .f6i = rt, + .nh = nh, + }; + struct rt6_info *rt_cache; + + rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src); + if (rt_cache) + return __ip6_del_cached_rt(rt_cache, cfg); + + return 0; +} + static int ip6_route_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { - struct rt6_info *rt_cache; struct fib6_table *table; struct fib6_info *rt; struct fib6_node *fn; @@ -3474,21 +3489,12 @@ static int ip6_route_del(struct fib6_config *cfg, nh = rt->fib6_nh; if (cfg->fc_flags & RTF_CACHE) { - struct fib6_result res = { - .f6i = rt, - .nh = nh, - }; int rc; - rt_cache = rt6_find_cached_rt(&res, - &cfg->fc_dst, - &cfg->fc_src); - if (rt_cache) { - rc = ip6_del_cached_rt(rt_cache, cfg); - if (rc != -ESRCH) { - rcu_read_unlock(); - return rc; - } + rc = ip6_del_cached_rt(cfg, rt, nh); + if (rc != -ESRCH) { + rcu_read_unlock(); + return rc; } continue; } From ec8105352869766569f657464c8a78d7bc0e2406 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 22 May 2019 20:28:01 -0700 Subject: [PATCH 7/7] selftests: Add redirect tests Add test for ICMP redirects and exception processing. Test is setup for later addition of tests using nexthop objects for routing. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- tools/testing/selftests/net/icmp_redirect.sh | 455 +++++++++++++++++++ 1 file changed, 455 insertions(+) create mode 100755 tools/testing/selftests/net/icmp_redirect.sh diff --git a/tools/testing/selftests/net/icmp_redirect.sh b/tools/testing/selftests/net/icmp_redirect.sh new file mode 100755 index 0000000000000..e8d8496f5b16c --- /dev/null +++ b/tools/testing/selftests/net/icmp_redirect.sh @@ -0,0 +1,455 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# redirect test +# +# .253 +----+ +# +----| r1 | +# | +----+ +# +----+ | |.1 +# | h1 |--------------+ | 10.1.1.0/30 2001:db8:1::0/126 +# +----+ .1 | |.2 +# 172.16.1/24 | +----+ +----+ +# 2001:db8:16:1/64 +----| r2 |-------------------| h2 | +# .254 +----+ .254 .2 +----+ +# 172.16.2/24 +# 2001:db8:16:2/64 +# +# Route from h1 to h2 goes through r1, eth1 - connection between r1 and r2. +# Route on r1 changed to go to r2 via eth0. This causes a redirect to be sent +# from r1 to h1 telling h1 to use r2 when talking to h2. + +VERBOSE=0 +PAUSE_ON_FAIL=no + +H1_N1_IP=172.16.1.1 +R1_N1_IP=172.16.1.253 +R2_N1_IP=172.16.1.254 + +H1_N1_IP6=2001:db8:16:1::1 +R1_N1_IP6=2001:db8:16:1::253 +R2_N1_IP6=2001:db8:16:1::254 + +R1_R2_N1_IP=10.1.1.1 +R2_R1_N1_IP=10.1.1.2 + +R1_R2_N1_IP6=2001:db8:1::1 +R2_R1_N1_IP6=2001:db8:1::2 + +H2_N2=172.16.2.0/24 +H2_N2_6=2001:db8:16:2::/64 +H2_N2_IP=172.16.2.2 +R2_N2_IP=172.16.2.254 +H2_N2_IP6=2001:db8:16:2::2 +R2_N2_IP6=2001:db8:16:2::254 + +VRF=red +VRF_TABLE=1111 + +################################################################################ +# helpers + +log_section() +{ + echo + echo "###########################################################################" + echo "$*" + echo "###########################################################################" + echo +} + +log_test() +{ + local rc=$1 + local expected=$2 + local msg="$3" + + if [ ${rc} -eq ${expected} ]; then + printf "TEST: %-60s [ OK ]\n" "${msg}" + nsuccess=$((nsuccess+1)) + else + ret=1 + nfail=$((nfail+1)) + printf "TEST: %-60s [FAIL]\n" "${msg}" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi + fi +} + +run_cmd() +{ + local cmd="$*" + local out + local rc + + if [ "$VERBOSE" = "1" ]; then + echo "COMMAND: $cmd" + fi + + out=$(eval $cmd 2>&1) + rc=$? + if [ "$VERBOSE" = "1" -a -n "$out" ]; then + echo "$out" + fi + + [ "$VERBOSE" = "1" ] && echo + + return $rc +} + +get_linklocal() +{ + local ns=$1 + local dev=$2 + local addr + + addr=$(ip -netns $ns -6 -br addr show dev ${dev} | \ + awk '{ + for (i = 3; i <= NF; ++i) { + if ($i ~ /^fe80/) + print $i + } + }' + ) + addr=${addr/\/*} + + [ -z "$addr" ] && return 1 + + echo $addr + + return 0 +} + +################################################################################ +# setup and teardown + +cleanup() +{ + local ns + + for ns in h1 h2 r1 r2; do + ip netns del $ns 2>/dev/null + done +} + +create_vrf() +{ + local ns=$1 + + ip -netns ${ns} link add ${VRF} type vrf table ${VRF_TABLE} + ip -netns ${ns} link set ${VRF} up + ip -netns ${ns} route add vrf ${VRF} unreachable default metric 8192 + ip -netns ${ns} -6 route add vrf ${VRF} unreachable default metric 8192 + + ip -netns ${ns} addr add 127.0.0.1/8 dev ${VRF} + ip -netns ${ns} -6 addr add ::1 dev ${VRF} nodad + + ip -netns ${ns} ru del pref 0 + ip -netns ${ns} ru add pref 32765 from all lookup local + ip -netns ${ns} -6 ru del pref 0 + ip -netns ${ns} -6 ru add pref 32765 from all lookup local +} + +setup() +{ + local ns + + # + # create nodes as namespaces + # + for ns in h1 h2 r1 r2; do + ip netns add $ns + ip -netns $ns li set lo up + + case "${ns}" in + h[12]) ip netns exec $ns sysctl -q -w net.ipv4.conf.all.accept_redirects=1 + ip netns exec $ns sysctl -q -w net.ipv6.conf.all.accept_redirects=1 + ip netns exec $ns sysctl -q -w net.ipv6.conf.all.keep_addr_on_down=1 + ;; + r[12]) ip netns exec $ns sysctl -q -w net.ipv4.ip_forward=1 + ip netns exec $ns sysctl -q -w net.ipv4.conf.all.send_redirects=1 + + ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=1 + ip netns exec $ns sysctl -q -w net.ipv6.route.mtu_expires=10 + esac + done + + # + # create interconnects + # + ip -netns h1 li add eth0 type veth peer name r1h1 + ip -netns h1 li set r1h1 netns r1 name eth0 up + + ip -netns h1 li add eth1 type veth peer name r2h1 + ip -netns h1 li set r2h1 netns r2 name eth0 up + + ip -netns h2 li add eth0 type veth peer name r2h2 + ip -netns h2 li set eth0 up + ip -netns h2 li set r2h2 netns r2 name eth2 up + + ip -netns r1 li add eth1 type veth peer name r2r1 + ip -netns r1 li set eth1 up + ip -netns r1 li set r2r1 netns r2 name eth1 up + + # + # h1 + # + if [ "${WITH_VRF}" = "yes" ]; then + create_vrf "h1" + H1_VRF_ARG="vrf ${VRF}" + H1_PING_ARG="-I ${VRF}" + else + H1_VRF_ARG= + H1_PING_ARG= + fi + ip -netns h1 li add br0 type bridge + if [ "${WITH_VRF}" = "yes" ]; then + ip -netns h1 li set br0 vrf ${VRF} up + else + ip -netns h1 li set br0 up + fi + ip -netns h1 addr add dev br0 ${H1_N1_IP}/24 + ip -netns h1 -6 addr add dev br0 ${H1_N1_IP6}/64 nodad + ip -netns h1 li set eth0 master br0 up + ip -netns h1 li set eth1 master br0 up + + # + # h2 + # + ip -netns h2 addr add dev eth0 ${H2_N2_IP}/24 + ip -netns h2 ro add default via ${R2_N2_IP} dev eth0 + ip -netns h2 -6 addr add dev eth0 ${H2_N2_IP6}/64 nodad + ip -netns h2 -6 ro add default via ${R2_N2_IP6} dev eth0 + + # + # r1 + # + ip -netns r1 addr add dev eth0 ${R1_N1_IP}/24 + ip -netns r1 -6 addr add dev eth0 ${R1_N1_IP6}/64 nodad + ip -netns r1 addr add dev eth1 ${R1_R2_N1_IP}/30 + ip -netns r1 -6 addr add dev eth1 ${R1_R2_N1_IP6}/126 nodad + + # + # r2 + # + ip -netns r2 addr add dev eth0 ${R2_N1_IP}/24 + ip -netns r2 -6 addr add dev eth0 ${R2_N1_IP6}/64 nodad + ip -netns r2 addr add dev eth1 ${R2_R1_N1_IP}/30 + ip -netns r2 -6 addr add dev eth1 ${R2_R1_N1_IP6}/126 nodad + ip -netns r2 addr add dev eth2 ${R2_N2_IP}/24 + ip -netns r2 -6 addr add dev eth2 ${R2_N2_IP6}/64 nodad + + sleep 2 + + R1_LLADDR=$(get_linklocal r1 eth0) + if [ $? -ne 0 ]; then + echo "Error: Failed to get link-local address of r1's eth0" + exit 1 + fi + + R2_LLADDR=$(get_linklocal r2 eth0) + if [ $? -ne 0 ]; then + echo "Error: Failed to get link-local address of r2's eth0" + exit 1 + fi +} + +change_h2_mtu() +{ + local mtu=$1 + + run_cmd ip -netns h2 li set eth0 mtu ${mtu} + run_cmd ip -netns r2 li set eth2 mtu ${mtu} +} + +check_exception() +{ + local mtu="$1" + local with_redirect="$2" + local desc="$3" + + # From 172.16.1.101: icmp_seq=1 Redirect Host(New nexthop: 172.16.1.102) + if [ "$VERBOSE" = "1" ]; then + echo "Commands to check for exception:" + run_cmd ip -netns h1 ro get ${H1_VRF_ARG} ${H2_N2_IP} + run_cmd ip -netns h1 -6 ro get ${H1_VRF_ARG} ${H2_N2_IP6} + fi + + if [ -n "${mtu}" ]; then + mtu=" mtu ${mtu}" + fi + if [ "$with_redirect" = "yes" ]; then + ip -netns h1 ro get ${H1_VRF_ARG} ${H2_N2_IP} | \ + grep -q "cache expires [0-9]*sec${mtu}" + elif [ -n "${mtu}" ]; then + ip -netns h1 ro get ${H1_VRF_ARG} ${H2_N2_IP} | \ + grep -q "cache expires [0-9]*sec${mtu}" + else + ip -netns h1 ro get ${H1_VRF_ARG} ${H2_N2_IP} | \ + grep -q "cache" + fi + log_test $? 0 "IPv4: ${desc}" + + if [ "$with_redirect" = "yes" ]; then + ip -netns h1 -6 ro get ${H1_VRF_ARG} ${H2_N2_IP6} | grep -q "${H2_N2_IP6} from :: via ${R2_LLADDR} dev br0.*${mtu}" + else + ip -netns h1 -6 ro get ${H1_VRF_ARG} ${H2_N2_IP6} | grep -q "${mtu}" + fi + log_test $? 0 "IPv6: ${desc}" +} + +run_ping() +{ + local sz=$1 + + run_cmd ip netns exec h1 ping -q -M want -i 0.2 -c 10 -w 2 -s ${sz} ${H1_PING_ARG} ${H2_N2_IP} + run_cmd ip netns exec h1 ${ping6} -q -M want -i 0.2 -c 10 -w 2 -s ${sz} ${H1_PING_ARG} ${H2_N2_IP6} +} + +replace_route_legacy() +{ + # r1 to h2 via r2 and eth0 + run_cmd ip -netns r1 ro replace ${H2_N2} via ${R2_N1_IP} dev eth0 + run_cmd ip -netns r1 -6 ro replace ${H2_N2_6} via ${R2_LLADDR} dev eth0 +} + +initial_route_legacy() +{ + # r1 to h2 via r2 and eth1 + run_cmd ip -netns r1 ro add ${H2_N2} via ${R2_R1_N1_IP} dev eth1 + run_cmd ip -netns r1 -6 ro add ${H2_N2_6} via ${R2_R1_N1_IP6} dev eth1 + + # h1 to h2 via r1 + # - IPv6 redirect only works if gateway is the LLA + run_cmd ip -netns h1 ro add ${H1_VRF_ARG} ${H2_N2} via ${R1_N1_IP} dev br0 + run_cmd ip -netns h1 -6 ro add ${H1_VRF_ARG} ${H2_N2_6} via ${R1_LLADDR} dev br0 +} + +check_connectivity() +{ + local rc + + run_cmd ip netns exec h1 ping -c1 -w1 ${H1_PING_ARG} ${H2_N2_IP} + rc=$? + run_cmd ip netns exec h1 ${ping6} -c1 -w1 ${H1_PING_ARG} ${H2_N2_IP6} + [ $? -ne 0 ] && rc=$? + + return $rc +} + +do_test() +{ + local ttype="$1" + + eval initial_route_${ttype} + + # verify connectivity + check_connectivity + if [ $? -ne 0 ]; then + echo "Error: Basic connectivity is broken" + ret=1 + return + fi + + # redirect exception followed by mtu + eval replace_route_${ttype} + run_ping 64 + check_exception "" "yes" "redirect exception" + + check_connectivity + if [ $? -ne 0 ]; then + echo "Error: Basic connectivity is broken after redirect" + ret=1 + return + fi + + change_h2_mtu 1300 + run_ping 1350 + check_exception "1300" "yes" "redirect exception plus mtu" + + # remove exceptions and restore routing + change_h2_mtu 1500 + ip -netns h1 li set br0 down + ip -netns h1 li set br0 up + eval initial_route_${ttype} + + check_connectivity + if [ $? -ne 0 ]; then + echo "Error: Basic connectivity is broken after reset" + ret=1 + return + fi + check_exception "" "no" "routing reset" + + # MTU exception followed by redirect + change_h2_mtu 1300 + run_ping 1350 + check_exception "1300" "no" "mtu exception" + + eval replace_route_${ttype} + run_ping 64 + check_exception "1300" "yes" "mtu exception plus redirect" + + check_connectivity + if [ $? -ne 0 ]; then + echo "Error: Basic connectivity is broken after redirect" + ret=1 + return + fi +} + +################################################################################ +# usage + +usage() +{ + cat < /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping) + +ret=0 +nsuccess=0 +nfail=0 + +while getopts :pv o +do + case $o in + p) PAUSE_ON_FAIL=yes;; + v) VERBOSE=$(($VERBOSE + 1));; + *) usage; exit 1;; + esac +done + +trap cleanup EXIT + +cleanup +WITH_VRF=no +setup + +log_section "Legacy routing" +do_test "legacy" + +cleanup +log_section "Legacy routing with VRF" +WITH_VRF=yes +setup +do_test "legacy" + +printf "\nTests passed: %3d\n" ${nsuccess} +printf "Tests failed: %3d\n" ${nfail} + +exit $ret