From e5bf1c39e894d754092ff55b06035ed222a359ec Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 27 Feb 2025 20:23:17 -0800 Subject: [PATCH 01/12] ipv4: fib: Use cached net in fib_inetaddr_event(). net is available in fib_inetaddr_event(), let's use it. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250228042328.96624-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_frontend.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 272e42d813230..6730e2034cf8e 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1450,7 +1450,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); - rt_cache_flush(dev_net(dev)); + rt_cache_flush(net); break; case NETDEV_DOWN: fib_del_ifaddr(ifa, NULL); @@ -1461,7 +1461,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, */ fib_disable_ip(dev, event, true); } else { - rt_cache_flush(dev_net(dev)); + rt_cache_flush(net); } break; } From fa336adc100e1d76a50da782c3e9abb1541a72f3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 27 Feb 2025 20:23:18 -0800 Subject: [PATCH 02/12] ipv4: fib: Allocate fib_info_hash[] and fib_info_laddrhash[] by kvcalloc(). Both fib_info_hash[] and fib_info_laddrhash[] are hash tables for struct fib_info and are allocated by kvzmalloc() separately. Let's replace the two kvzmalloc() calls with kvcalloc() to remove the fib_info_laddrhash pointer later. Note that fib_info_hash_alloc() allocates a new hash table based on fib_info_hash_bits because we will remove fib_info_hash_size later. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250228042328.96624-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_semantics.c | 43 +++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d2cee5c314f5e..23aae379ba423 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -357,6 +357,18 @@ static inline unsigned int fib_info_hashfn(struct fib_info *fi) return fib_info_hashfn_result(fi->fib_net, val); } +static struct hlist_head *fib_info_hash_alloc(unsigned int hash_bits) +{ + /* The second half is used for prefsrc */ + return kvcalloc((1 << hash_bits) * 2, sizeof(struct hlist_head *), + GFP_KERNEL); +} + +static void fib_info_hash_free(struct hlist_head *head) +{ + kvfree(head); +} + /* no metrics, only nexthop id */ static struct fib_info *fib_find_info_nh(struct net *net, const struct fib_config *cfg) @@ -1249,9 +1261,9 @@ fib_info_laddrhash_bucket(const struct net *net, __be32 val) } static void fib_info_hash_move(struct hlist_head *new_info_hash, - struct hlist_head *new_laddrhash, unsigned int new_size) { + struct hlist_head *new_laddrhash = new_info_hash + new_size; struct hlist_head *old_info_hash, *old_laddrhash; unsigned int old_size = fib_info_hash_size; unsigned int i; @@ -1293,8 +1305,7 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, } } - kvfree(old_info_hash); - kvfree(old_laddrhash); + fib_info_hash_free(old_info_hash); } __be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc, @@ -1412,22 +1423,18 @@ struct fib_info *fib_create_info(struct fib_config *cfg, err = -ENOBUFS; if (fib_info_cnt >= fib_info_hash_size) { - unsigned int new_size = fib_info_hash_size << 1; struct hlist_head *new_info_hash; - struct hlist_head *new_laddrhash; - size_t bytes; - - if (!new_size) - new_size = 16; - bytes = (size_t)new_size * sizeof(struct hlist_head *); - new_info_hash = kvzalloc(bytes, GFP_KERNEL); - new_laddrhash = kvzalloc(bytes, GFP_KERNEL); - if (!new_info_hash || !new_laddrhash) { - kvfree(new_info_hash); - kvfree(new_laddrhash); - } else { - fib_info_hash_move(new_info_hash, new_laddrhash, new_size); - } + unsigned int new_hash_bits; + + if (!fib_info_hash_bits) + new_hash_bits = 4; + else + new_hash_bits = fib_info_hash_bits + 1; + + new_info_hash = fib_info_hash_alloc(new_hash_bits); + if (new_info_hash) + fib_info_hash_move(new_info_hash, 1 << new_hash_bits); + if (!fib_info_hash_size) goto failure; } From cfc47029fa124e5e04411fb1dbd3726db90fd346 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 27 Feb 2025 20:23:19 -0800 Subject: [PATCH 03/12] ipv4: fib: Allocate fib_info_hash[] during netns initialisation. We will allocate fib_info_hash[] and fib_info_laddrhash[] for each netns. Currently, fib_info_hash[] is allocated when the first route is added. Let's move the first allocation to a new __net_init function. Note that we must call fib4_semantics_exit() in fib_net_exit_batch() because ->exit() is called earlier than ->exit_batch(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250228042328.96624-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/ip_fib.h | 2 ++ net/ipv4/fib_frontend.c | 11 ++++++++++ net/ipv4/fib_semantics.c | 45 ++++++++++++++++++++++++++++------------ 3 files changed, 45 insertions(+), 13 deletions(-) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index a113c11ab56b5..e3864b74e92a6 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -162,6 +162,8 @@ struct fib_info { struct fib_nh fib_nh[] __counted_by(fib_nhs); }; +int __net_init fib4_semantics_init(struct net *net); +void __net_exit fib4_semantics_exit(struct net *net); #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rule; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 6730e2034cf8e..40c062f820f2d 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1615,9 +1615,15 @@ static int __net_init fib_net_init(struct net *net) error = ip_fib_net_init(net); if (error < 0) goto out; + + error = fib4_semantics_init(net); + if (error) + goto out_semantics; + error = nl_fib_lookup_init(net); if (error < 0) goto out_nlfl; + error = fib_proc_init(net); if (error < 0) goto out_proc; @@ -1627,6 +1633,8 @@ static int __net_init fib_net_init(struct net *net) out_proc: nl_fib_lookup_exit(net); out_nlfl: + fib4_semantics_exit(net); +out_semantics: rtnl_lock(); ip_fib_net_exit(net); rtnl_unlock(); @@ -1648,6 +1656,9 @@ static void __net_exit fib_net_exit_batch(struct list_head *net_list) ip_fib_net_exit(net); rtnl_unlock(); + + list_for_each_entry(net, net_list, exit_list) + fib4_semantics_exit(net); } static struct pernet_operations fib_net_ops = { diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 23aae379ba423..b7e2023bf742d 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1420,28 +1420,21 @@ struct fib_info *fib_create_info(struct fib_config *cfg, } #endif - err = -ENOBUFS; - if (fib_info_cnt >= fib_info_hash_size) { + unsigned int new_hash_bits = fib_info_hash_bits + 1; struct hlist_head *new_info_hash; - unsigned int new_hash_bits; - - if (!fib_info_hash_bits) - new_hash_bits = 4; - else - new_hash_bits = fib_info_hash_bits + 1; new_info_hash = fib_info_hash_alloc(new_hash_bits); if (new_info_hash) fib_info_hash_move(new_info_hash, 1 << new_hash_bits); - - if (!fib_info_hash_size) - goto failure; } fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL); - if (!fi) + if (!fi) { + err = -ENOBUFS; goto failure; + } + fi->fib_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack); if (IS_ERR(fi->fib_metrics)) { err = PTR_ERR(fi->fib_metrics); @@ -1862,7 +1855,7 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local) struct fib_info *fi; int ret = 0; - if (!fib_info_laddrhash || local == 0) + if (!local) return 0; head = fib_info_laddrhash_bucket(net, local); @@ -2264,3 +2257,29 @@ void fib_select_path(struct net *net, struct fib_result *res, fl4->saddr = inet_select_addr(l3mdev, 0, RT_SCOPE_LINK); } } + +int __net_init fib4_semantics_init(struct net *net) +{ + unsigned int hash_bits = 4; + + if (!net_eq(net, &init_net)) + return 0; + + fib_info_hash = fib_info_hash_alloc(hash_bits); + if (!fib_info_hash) + return -ENOMEM; + + fib_info_hash_bits = hash_bits; + fib_info_hash_size = 1 << hash_bits; + fib_info_laddrhash = fib_info_hash + fib_info_hash_size; + + return 0; +} + +void __net_exit fib4_semantics_exit(struct net *net) +{ + if (!net_eq(net, &init_net)) + return; + + fib_info_hash_free(fib_info_hash); +} From 84c75e94ecee7d783eb1d4282a66d6d26d848246 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 27 Feb 2025 20:23:20 -0800 Subject: [PATCH 04/12] ipv4: fib: Make fib_info_hashfn() return struct hlist_head. Every time fib_info_hashfn() returns a hash value, we fetch &fib_info_hash[hash]. Let's return the hlist_head pointer from fib_info_hashfn() and rename it to fib_info_hash_bucket() to match a similar function, fib_info_laddrhash_bucket(). Note that we need to move the fib_info_hash assignment earlier in fib_info_hash_move() to use fib_info_hash_bucket() in the for loop. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250228042328.96624-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_semantics.c | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b7e2023bf742d..54749dd7afc8a 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -338,7 +338,7 @@ static unsigned int fib_info_hashfn_result(const struct net *net, return hash_32(val ^ net_hash_mix(net), fib_info_hash_bits); } -static inline unsigned int fib_info_hashfn(struct fib_info *fi) +static struct hlist_head *fib_info_hash_bucket(struct fib_info *fi) { unsigned int val; @@ -354,7 +354,7 @@ static inline unsigned int fib_info_hashfn(struct fib_info *fi) } endfor_nexthops(fi) } - return fib_info_hashfn_result(fi->fib_net, val); + return &fib_info_hash[fib_info_hashfn_result(fi->fib_net, val)]; } static struct hlist_head *fib_info_hash_alloc(unsigned int hash_bits) @@ -404,12 +404,8 @@ static struct fib_info *fib_find_info_nh(struct net *net, static struct fib_info *fib_find_info(struct fib_info *nfi) { - struct hlist_head *head; + struct hlist_head *head = fib_info_hash_bucket(nfi); struct fib_info *fi; - unsigned int hash; - - hash = fib_info_hashfn(nfi); - head = &fib_info_hash[hash]; hlist_for_each_entry(fi, head, fib_hash) { if (!net_eq(fi->fib_net, nfi->fib_net)) @@ -1273,22 +1269,16 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, old_laddrhash = fib_info_laddrhash; fib_info_hash_size = new_size; fib_info_hash_bits = ilog2(new_size); + fib_info_hash = new_info_hash; for (i = 0; i < old_size; i++) { - struct hlist_head *head = &fib_info_hash[i]; + struct hlist_head *head = &old_info_hash[i]; struct hlist_node *n; struct fib_info *fi; - hlist_for_each_entry_safe(fi, n, head, fib_hash) { - struct hlist_head *dest; - unsigned int new_hash; - - new_hash = fib_info_hashfn(fi); - dest = &new_info_hash[new_hash]; - hlist_add_head(&fi->fib_hash, dest); - } + hlist_for_each_entry_safe(fi, n, head, fib_hash) + hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi)); } - fib_info_hash = new_info_hash; fib_info_laddrhash = new_laddrhash; for (i = 0; i < old_size; i++) { @@ -1572,8 +1562,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg, refcount_set(&fi->fib_clntref, 1); fib_info_cnt++; - hlist_add_head(&fi->fib_hash, - &fib_info_hash[fib_info_hashfn(fi)]); + hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi)); + if (fi->fib_prefsrc) { struct hlist_head *head; From 0dbca8c269ba786cde720d159f3c7dcbc2166f34 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 27 Feb 2025 20:23:21 -0800 Subject: [PATCH 05/12] ipv4: fib: Remove fib_info_laddrhash pointer. We will allocate the fib_info hash tables per netns. There are 5 global variables for fib_info hash tables: fib_info_hash, fib_info_laddrhash, fib_info_hash_size, fib_info_hash_bits, fib_info_cnt. However, fib_info_laddrhash and fib_info_hash_size can be easily calculated from fib_info_hash and fib_info_hash_bits. Let's remove the fib_info_laddrhash pointer and instead use fib_info_hash + (1 << fib_info_hash_bits). While at it, fib_info_laddrhash_bucket() is moved near other hash-table-specific functions. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250228042328.96624-6-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_semantics.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 54749dd7afc8a..6cc518dd665f9 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -51,7 +51,6 @@ #include "fib_lookup.h" static struct hlist_head *fib_info_hash; -static struct hlist_head *fib_info_laddrhash; static unsigned int fib_info_hash_size; static unsigned int fib_info_hash_bits; static unsigned int fib_info_cnt; @@ -357,6 +356,15 @@ static struct hlist_head *fib_info_hash_bucket(struct fib_info *fi) return &fib_info_hash[fib_info_hashfn_result(fi->fib_net, val)]; } +static struct hlist_head *fib_info_laddrhash_bucket(const struct net *net, + __be32 val) +{ + u32 slot = hash_32(net_hash_mix(net) ^ (__force u32)val, + fib_info_hash_bits); + + return &fib_info_hash[(1 << fib_info_hash_bits) + slot]; +} + static struct hlist_head *fib_info_hash_alloc(unsigned int hash_bits) { /* The second half is used for prefsrc */ @@ -1247,26 +1255,15 @@ int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, return err; } -static struct hlist_head * -fib_info_laddrhash_bucket(const struct net *net, __be32 val) -{ - u32 slot = hash_32(net_hash_mix(net) ^ (__force u32)val, - fib_info_hash_bits); - - return &fib_info_laddrhash[slot]; -} - static void fib_info_hash_move(struct hlist_head *new_info_hash, unsigned int new_size) { - struct hlist_head *new_laddrhash = new_info_hash + new_size; - struct hlist_head *old_info_hash, *old_laddrhash; unsigned int old_size = fib_info_hash_size; + struct hlist_head *old_info_hash; unsigned int i; ASSERT_RTNL(); old_info_hash = fib_info_hash; - old_laddrhash = fib_info_laddrhash; fib_info_hash_size = new_size; fib_info_hash_bits = ilog2(new_size); fib_info_hash = new_info_hash; @@ -1280,9 +1277,8 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi)); } - fib_info_laddrhash = new_laddrhash; for (i = 0; i < old_size; i++) { - struct hlist_head *lhead = &old_laddrhash[i]; + struct hlist_head *lhead = &old_info_hash[old_size + i]; struct hlist_node *n; struct fib_info *fi; @@ -2261,7 +2257,6 @@ int __net_init fib4_semantics_init(struct net *net) fib_info_hash_bits = hash_bits; fib_info_hash_size = 1 << hash_bits; - fib_info_laddrhash = fib_info_hash + fib_info_hash_size; return 0; } From d6306b9d98850a1a7247eb10fe5bb78506ee343b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 27 Feb 2025 20:23:22 -0800 Subject: [PATCH 06/12] ipv4: fib: Remove fib_info_hash_size. We will allocate the fib_info hash tables per netns. There are 5 global variables for fib_info hash tables: fib_info_hash, fib_info_laddrhash, fib_info_hash_size, fib_info_hash_bits, fib_info_cnt. However, fib_info_laddrhash and fib_info_hash_size can be easily calculated from fib_info_hash and fib_info_hash_bits. Let's remove fib_info_hash_size and use (1 << fib_info_hash_bits) instead. Now we need not pass the new hash table size to fib_info_hash_move(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250228042328.96624-7-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_semantics.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 6cc518dd665f9..9dc09e80b92be 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -51,7 +51,6 @@ #include "fib_lookup.h" static struct hlist_head *fib_info_hash; -static unsigned int fib_info_hash_size; static unsigned int fib_info_hash_bits; static unsigned int fib_info_cnt; @@ -1255,17 +1254,15 @@ int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, return err; } -static void fib_info_hash_move(struct hlist_head *new_info_hash, - unsigned int new_size) +static void fib_info_hash_move(struct hlist_head *new_info_hash) { - unsigned int old_size = fib_info_hash_size; + unsigned int old_size = 1 << fib_info_hash_bits; struct hlist_head *old_info_hash; unsigned int i; ASSERT_RTNL(); old_info_hash = fib_info_hash; - fib_info_hash_size = new_size; - fib_info_hash_bits = ilog2(new_size); + fib_info_hash_bits += 1; fib_info_hash = new_info_hash; for (i = 0; i < old_size; i++) { @@ -1406,13 +1403,12 @@ struct fib_info *fib_create_info(struct fib_config *cfg, } #endif - if (fib_info_cnt >= fib_info_hash_size) { - unsigned int new_hash_bits = fib_info_hash_bits + 1; + if (fib_info_cnt >= (1 << fib_info_hash_bits)) { struct hlist_head *new_info_hash; - new_info_hash = fib_info_hash_alloc(new_hash_bits); + new_info_hash = fib_info_hash_alloc(fib_info_hash_bits + 1); if (new_info_hash) - fib_info_hash_move(new_info_hash, 1 << new_hash_bits); + fib_info_hash_move(new_info_hash); } fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL); @@ -2256,7 +2252,6 @@ int __net_init fib4_semantics_init(struct net *net) return -ENOMEM; fib_info_hash_bits = hash_bits; - fib_info_hash_size = 1 << hash_bits; return 0; } From b79bcaf7d95261bb8c205fa6826100ca4ed961c1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 27 Feb 2025 20:23:23 -0800 Subject: [PATCH 07/12] ipv4: fib: Add fib_info_hash_grow(). When the number of struct fib_info exceeds the hash table size in fib_create_info(), we try to allocate a new hash table with the doubled size. The allocation is done in fib_create_info(), and if successful, each struct fib_info is moved to the new hash table by fib_info_hash_move(). Let's integrate the allocation and fib_info_hash_move() as fib_info_hash_grow() to make the following change cleaner. While at it, fib_info_hash_grow() is placed near other hash-table-specific functions. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250228042328.96624-8-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_semantics.c | 85 +++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 44 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 9dc09e80b92be..0cd40ff18d8be 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -376,6 +376,46 @@ static void fib_info_hash_free(struct hlist_head *head) kvfree(head); } +static void fib_info_hash_grow(void) +{ + struct hlist_head *new_info_hash, *old_info_hash; + unsigned int old_size = 1 << fib_info_hash_bits; + unsigned int i; + + if (fib_info_cnt < old_size) + return; + + new_info_hash = fib_info_hash_alloc(fib_info_hash_bits + 1); + if (!new_info_hash) + return; + + old_info_hash = fib_info_hash; + fib_info_hash = new_info_hash; + fib_info_hash_bits += 1; + + for (i = 0; i < old_size; i++) { + struct hlist_head *head = &old_info_hash[i]; + struct hlist_node *n; + struct fib_info *fi; + + hlist_for_each_entry_safe(fi, n, head, fib_hash) + hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi)); + } + + for (i = 0; i < old_size; i++) { + struct hlist_head *lhead = &old_info_hash[old_size + i]; + struct hlist_node *n; + struct fib_info *fi; + + hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) + hlist_add_head(&fi->fib_lhash, + fib_info_laddrhash_bucket(fi->fib_net, + fi->fib_prefsrc)); + } + + fib_info_hash_free(old_info_hash); +} + /* no metrics, only nexthop id */ static struct fib_info *fib_find_info_nh(struct net *net, const struct fib_config *cfg) @@ -1254,43 +1294,6 @@ int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, return err; } -static void fib_info_hash_move(struct hlist_head *new_info_hash) -{ - unsigned int old_size = 1 << fib_info_hash_bits; - struct hlist_head *old_info_hash; - unsigned int i; - - ASSERT_RTNL(); - old_info_hash = fib_info_hash; - fib_info_hash_bits += 1; - fib_info_hash = new_info_hash; - - for (i = 0; i < old_size; i++) { - struct hlist_head *head = &old_info_hash[i]; - struct hlist_node *n; - struct fib_info *fi; - - hlist_for_each_entry_safe(fi, n, head, fib_hash) - hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi)); - } - - for (i = 0; i < old_size; i++) { - struct hlist_head *lhead = &old_info_hash[old_size + i]; - struct hlist_node *n; - struct fib_info *fi; - - hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) { - struct hlist_head *ldest; - - ldest = fib_info_laddrhash_bucket(fi->fib_net, - fi->fib_prefsrc); - hlist_add_head(&fi->fib_lhash, ldest); - } - } - - fib_info_hash_free(old_info_hash); -} - __be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc, unsigned char scope) { @@ -1403,13 +1406,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, } #endif - if (fib_info_cnt >= (1 << fib_info_hash_bits)) { - struct hlist_head *new_info_hash; - - new_info_hash = fib_info_hash_alloc(fib_info_hash_bits + 1); - if (new_info_hash) - fib_info_hash_move(new_info_hash); - } + fib_info_hash_grow(); fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL); if (!fi) { From 9f7f3ebeba9388c30b0cba9dc5df0c43d0e07605 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 27 Feb 2025 20:23:24 -0800 Subject: [PATCH 08/12] ipv4: fib: Namespacify fib_info hash tables. We will convert RTM_NEWROUTE and RTM_DELROUTE to per-netns RTNL. Then, we need to have per-netns hash tables for struct fib_info. Let's allocate the hash tables per netns. fib_info_hash, fib_info_hash_bits, and fib_info_cnt are now moved to struct netns_ipv4 and accessed with net->ipv4.fib_XXX. Also, the netns checks are removed from fib_find_info_nh() and fib_find_info(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250228042328.96624-9-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 3 ++ net/ipv4/fib_semantics.c | 61 +++++++++++++++++----------------------- 2 files changed, 29 insertions(+), 35 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 45ac125e8aebb..650b2dc9199f4 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -111,6 +111,9 @@ struct netns_ipv4 { #endif struct hlist_head *fib_table_hash; struct sock *fibnl; + struct hlist_head *fib_info_hash; + unsigned int fib_info_hash_bits; + unsigned int fib_info_cnt; struct sock *mc_autojoin_sk; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 0cd40ff18d8be..f68bb9e34c34c 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -50,10 +50,6 @@ #include "fib_lookup.h" -static struct hlist_head *fib_info_hash; -static unsigned int fib_info_hash_bits; -static unsigned int fib_info_cnt; - /* for_nexthops and change_nexthops only used when nexthop object * is not set in a fib_info. The logic within can reference fib_nh. */ @@ -256,8 +252,7 @@ void fib_release_info(struct fib_info *fi) ASSERT_RTNL(); if (fi && refcount_dec_and_test(&fi->fib_treeref)) { hlist_del(&fi->fib_hash); - - fib_info_cnt--; + fi->fib_net->ipv4.fib_info_cnt--; if (fi->fib_prefsrc) hlist_del(&fi->fib_lhash); @@ -333,11 +328,12 @@ static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope, static unsigned int fib_info_hashfn_result(const struct net *net, unsigned int val) { - return hash_32(val ^ net_hash_mix(net), fib_info_hash_bits); + return hash_32(val ^ net_hash_mix(net), net->ipv4.fib_info_hash_bits); } static struct hlist_head *fib_info_hash_bucket(struct fib_info *fi) { + struct net *net = fi->fib_net; unsigned int val; val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol, @@ -352,16 +348,18 @@ static struct hlist_head *fib_info_hash_bucket(struct fib_info *fi) } endfor_nexthops(fi) } - return &fib_info_hash[fib_info_hashfn_result(fi->fib_net, val)]; + return &net->ipv4.fib_info_hash[fib_info_hashfn_result(net, val)]; } static struct hlist_head *fib_info_laddrhash_bucket(const struct net *net, __be32 val) { - u32 slot = hash_32(net_hash_mix(net) ^ (__force u32)val, - fib_info_hash_bits); + unsigned int hash_bits = net->ipv4.fib_info_hash_bits; + u32 slot; - return &fib_info_hash[(1 << fib_info_hash_bits) + slot]; + slot = hash_32(net_hash_mix(net) ^ (__force u32)val, hash_bits); + + return &net->ipv4.fib_info_hash[(1 << hash_bits) + slot]; } static struct hlist_head *fib_info_hash_alloc(unsigned int hash_bits) @@ -376,22 +374,22 @@ static void fib_info_hash_free(struct hlist_head *head) kvfree(head); } -static void fib_info_hash_grow(void) +static void fib_info_hash_grow(struct net *net) { + unsigned int old_size = 1 << net->ipv4.fib_info_hash_bits; struct hlist_head *new_info_hash, *old_info_hash; - unsigned int old_size = 1 << fib_info_hash_bits; unsigned int i; - if (fib_info_cnt < old_size) + if (net->ipv4.fib_info_cnt < old_size) return; - new_info_hash = fib_info_hash_alloc(fib_info_hash_bits + 1); + new_info_hash = fib_info_hash_alloc(net->ipv4.fib_info_hash_bits + 1); if (!new_info_hash) return; - old_info_hash = fib_info_hash; - fib_info_hash = new_info_hash; - fib_info_hash_bits += 1; + old_info_hash = net->ipv4.fib_info_hash; + net->ipv4.fib_info_hash = new_info_hash; + net->ipv4.fib_info_hash_bits += 1; for (i = 0; i < old_size; i++) { struct hlist_head *head = &old_info_hash[i]; @@ -429,13 +427,12 @@ static struct fib_info *fib_find_info_nh(struct net *net, (__force u32)cfg->fc_prefsrc, cfg->fc_priority); hash = fib_info_hashfn_result(net, hash); - head = &fib_info_hash[hash]; + head = &net->ipv4.fib_info_hash[hash]; hlist_for_each_entry(fi, head, fib_hash) { - if (!net_eq(fi->fib_net, net)) - continue; if (!fi->nh || fi->nh->id != cfg->fc_nh_id) continue; + if (cfg->fc_protocol == fi->fib_protocol && cfg->fc_scope == fi->fib_scope && cfg->fc_prefsrc == fi->fib_prefsrc && @@ -455,10 +452,9 @@ static struct fib_info *fib_find_info(struct fib_info *nfi) struct fib_info *fi; hlist_for_each_entry(fi, head, fib_hash) { - if (!net_eq(fi->fib_net, nfi->fib_net)) - continue; if (fi->fib_nhs != nfi->fib_nhs) continue; + if (nfi->fib_protocol == fi->fib_protocol && nfi->fib_scope == fi->fib_scope && nfi->fib_prefsrc == fi->fib_prefsrc && @@ -1406,7 +1402,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, } #endif - fib_info_hash_grow(); + fib_info_hash_grow(net); fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL); if (!fi) { @@ -1550,7 +1546,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, refcount_set(&fi->fib_treeref, 1); refcount_set(&fi->fib_clntref, 1); - fib_info_cnt++; + net->ipv4.fib_info_cnt++; hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi)); if (fi->fib_prefsrc) { @@ -2241,22 +2237,17 @@ int __net_init fib4_semantics_init(struct net *net) { unsigned int hash_bits = 4; - if (!net_eq(net, &init_net)) - return 0; - - fib_info_hash = fib_info_hash_alloc(hash_bits); - if (!fib_info_hash) + net->ipv4.fib_info_hash = fib_info_hash_alloc(hash_bits); + if (!net->ipv4.fib_info_hash) return -ENOMEM; - fib_info_hash_bits = hash_bits; + net->ipv4.fib_info_hash_bits = hash_bits; + net->ipv4.fib_info_cnt = 0; return 0; } void __net_exit fib4_semantics_exit(struct net *net) { - if (!net_eq(net, &init_net)) - return; - - fib_info_hash_free(fib_info_hash); + fib_info_hash_free(net->ipv4.fib_info_hash); } From af5cd2a8f07842d14be2cbc628ea511adfd6bba9 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 27 Feb 2025 20:23:25 -0800 Subject: [PATCH 09/12] ipv4: fib: Hold rtnl_net_lock() for ip_fib_net_exit(). ip_fib_net_exit() requires RTNL and is called from fib_net_init() and fib_net_exit_batch(). Let's hold rtnl_net_lock() before ip_fib_net_exit(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250228042328.96624-10-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_frontend.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 40c062f820f2d..c48ed369b1796 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1575,7 +1575,7 @@ static void ip_fib_net_exit(struct net *net) { int i; - ASSERT_RTNL(); + ASSERT_RTNL_NET(net); #ifdef CONFIG_IP_MULTIPLE_TABLES RCU_INIT_POINTER(net->ipv4.fib_main, NULL); RCU_INIT_POINTER(net->ipv4.fib_default, NULL); @@ -1635,9 +1635,9 @@ static int __net_init fib_net_init(struct net *net) out_nlfl: fib4_semantics_exit(net); out_semantics: - rtnl_lock(); + rtnl_net_lock(net); ip_fib_net_exit(net); - rtnl_unlock(); + rtnl_net_unlock(net); goto out; } @@ -1652,9 +1652,11 @@ static void __net_exit fib_net_exit_batch(struct list_head *net_list) struct net *net; rtnl_lock(); - list_for_each_entry(net, net_list, exit_list) + list_for_each_entry(net, net_list, exit_list) { + __rtnl_net_lock(net); ip_fib_net_exit(net); - + __rtnl_net_unlock(net); + } rtnl_unlock(); list_for_each_entry(net, net_list, exit_list) From c0ebe1cdc2cff0dee092a67f2c50377bb5fcf43d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 27 Feb 2025 20:23:26 -0800 Subject: [PATCH 10/12] ipv4: fib: Hold rtnl_net_lock() in ip_rt_ioctl(). ioctl(SIOCADDRT/SIOCDELRT) calls ip_rt_ioctl() to add/remove a route in the netns of the specified socket. Let's hold rtnl_net_lock() there. Note that rtentry_to_fib_config() can be called without rtnl_net_lock() if we convert rtentry.dev handling to RCU later. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250228042328.96624-11-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_frontend.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index c48ed369b1796..a76dacc3e577a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -553,18 +553,16 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, const struct in_ifaddr *ifa; struct in_device *in_dev; - in_dev = __in_dev_get_rtnl(dev); + in_dev = __in_dev_get_rtnl_net(dev); if (!in_dev) return -ENODEV; *colon = ':'; - rcu_read_lock(); - in_dev_for_each_ifa_rcu(ifa, in_dev) { + in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev) { if (strcmp(ifa->ifa_label, devname) == 0) break; } - rcu_read_unlock(); if (!ifa) return -ENODEV; @@ -635,7 +633,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt) if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; - rtnl_lock(); + rtnl_net_lock(net); err = rtentry_to_fib_config(net, cmd, rt, &cfg); if (err == 0) { struct fib_table *tb; @@ -659,7 +657,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt) /* allocated by rtentry_to_fib_config() */ kfree(cfg.fc_mx); } - rtnl_unlock(); + rtnl_net_unlock(net); return err; } return -EINVAL; From 254ba7e6032d3fc738050d500b0c1d8197af90ca Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 27 Feb 2025 20:23:27 -0800 Subject: [PATCH 11/12] ipv4: fib: Move fib_valid_key_len() to rtm_to_fib_config(). fib_valid_key_len() is called in the beginning of fib_table_insert() or fib_table_delete() to check if the prefix length is valid. fib_table_insert() and fib_table_delete() are called from 3 paths - ip_rt_ioctl() - inet_rtm_newroute() / inet_rtm_delroute() - fib_magic() In the first ioctl() path, rtentry_to_fib_config() checks the prefix length with bad_mask(). Also, fib_magic() always passes the correct prefix: 32 or ifa->ifa_prefixlen, which is already validated. Let's move fib_valid_key_len() to the rtnetlink path, rtm_to_fib_config(). While at it, 2 direct returns in rtm_to_fib_config() are changed to goto to match other places in the same function Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250228042328.96624-12-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_frontend.c | 18 ++++++++++++++++-- net/ipv4/fib_trie.c | 22 ---------------------- 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index a76dacc3e577a..a6372d934e459 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -835,19 +835,33 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, } } + if (cfg->fc_dst_len > 32) { + NL_SET_ERR_MSG(extack, "Invalid prefix length"); + err = -EINVAL; + goto errout; + } + + if (cfg->fc_dst_len < 32 && (ntohl(cfg->fc_dst) << cfg->fc_dst_len)) { + NL_SET_ERR_MSG(extack, "Invalid prefix for given prefix length"); + err = -EINVAL; + goto errout; + } + if (cfg->fc_nh_id) { if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_encap || cfg->fc_mp) { NL_SET_ERR_MSG(extack, "Nexthop specification and nexthop id are mutually exclusive"); - return -EINVAL; + err = -EINVAL; + goto errout; } } if (has_gw && has_via) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); - return -EINVAL; + err = -EINVAL; + goto errout; } if (!cfg->fc_table) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index d6411ac810961..59a6f0a9638f9 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1187,22 +1187,6 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp, return 0; } -static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack) -{ - if (plen > KEYLENGTH) { - NL_SET_ERR_MSG(extack, "Invalid prefix length"); - return false; - } - - if ((plen < KEYLENGTH) && (key << plen)) { - NL_SET_ERR_MSG(extack, - "Invalid prefix for given prefix length"); - return false; - } - - return true; -} - static void fib_remove_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *old); @@ -1223,9 +1207,6 @@ int fib_table_insert(struct net *net, struct fib_table *tb, key = ntohl(cfg->fc_dst); - if (!fib_valid_key_len(key, plen, extack)) - return -EINVAL; - pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); fi = fib_create_info(cfg, extack); @@ -1717,9 +1698,6 @@ int fib_table_delete(struct net *net, struct fib_table *tb, key = ntohl(cfg->fc_dst); - if (!fib_valid_key_len(key, plen, extack)) - return -EINVAL; - l = fib_find_node(t, &tp, key); if (!l) return -ESRCH; From 1dd2af7963e95df90590fe40425fe1bdf982ae8f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 27 Feb 2025 20:23:28 -0800 Subject: [PATCH 12/12] ipv4: fib: Convert RTM_NEWROUTE and RTM_DELROUTE to per-netns RTNL. We converted fib_info hash tables to per-netns one and now ready to convert RTM_NEWROUTE and RTM_DELROUTE to per-netns RTNL. Let's hold rtnl_net_lock() in inet_rtm_newroute() and inet_rtm_delroute(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250228042328.96624-13-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_frontend.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index a6372d934e459..6de77415b5b30 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -884,20 +884,24 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; + rtnl_net_lock(net); + if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); err = -EINVAL; - goto errout; + goto unlock; } tb = fib_get_table(net, cfg.fc_table); if (!tb) { NL_SET_ERR_MSG(extack, "FIB table does not exist"); err = -ESRCH; - goto errout; + goto unlock; } err = fib_table_delete(net, tb, &cfg, extack); +unlock: + rtnl_net_unlock(net); errout: return err; } @@ -914,15 +918,20 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; + rtnl_net_lock(net); + tb = fib_new_table(net, cfg.fc_table); if (!tb) { err = -ENOBUFS; - goto errout; + goto unlock; } err = fib_table_insert(net, tb, &cfg, extack); if (!err && cfg.fc_type == RTN_LOCAL) net->ipv4.fib_has_custom_local_routes = true; + +unlock: + rtnl_net_unlock(net); errout: return err; } @@ -1683,9 +1692,9 @@ static struct pernet_operations fib_net_ops = { static const struct rtnl_msg_handler fib_rtnl_msg_handlers[] __initconst = { {.protocol = PF_INET, .msgtype = RTM_NEWROUTE, - .doit = inet_rtm_newroute}, + .doit = inet_rtm_newroute, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_DELROUTE, - .doit = inet_rtm_delroute}, + .doit = inet_rtm_delroute, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_GETROUTE, .dumpit = inet_dump_fib, .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, };