From e4400bbf5b15750e1b59bf4722d18d99be60c69f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 11 Oct 2021 14:12:35 +0200 Subject: [PATCH 1/4] net, neigh: Fix NTF_EXT_LEARNED in combination with NTF_USE The NTF_EXT_LEARNED neigh flag is usually propagated back to user space upon dump of the neighbor table. However, when used in combination with NTF_USE flag this is not the case despite exempting the entry from the garbage collector. This results in inconsistent state since entries are typically marked in neigh->flags with NTF_EXT_LEARNED, but here they are not. Fix it by propagating the creation flag to ___neigh_create(). Before fix: # ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE [...] After fix: # ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn REACHABLE [...] Fixes: 9ce33e46531d ("neighbour: support for NTF_EXT_LEARNED flag") Signed-off-by: Daniel Borkmann Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/core/neighbour.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 2d5bc3a75faec..8457d5f97022b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -379,7 +379,7 @@ EXPORT_SYMBOL(neigh_ifdown); static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev, - bool exempt_from_gc) + u8 flags, bool exempt_from_gc) { struct neighbour *n = NULL; unsigned long now = jiffies; @@ -412,6 +412,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, n->updated = n->used = now; n->nud_state = NUD_NONE; n->output = neigh_blackhole; + n->flags = flags; seqlock_init(&n->hh.hh_lock); n->parms = neigh_parms_clone(&tbl->parms); timer_setup(&n->timer, neigh_timer_handler, 0); @@ -575,19 +576,18 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, } EXPORT_SYMBOL(neigh_lookup_nodev); -static struct neighbour *___neigh_create(struct neigh_table *tbl, - const void *pkey, - struct net_device *dev, - bool exempt_from_gc, bool want_ref) +static struct neighbour * +___neigh_create(struct neigh_table *tbl, const void *pkey, + struct net_device *dev, u8 flags, + bool exempt_from_gc, bool want_ref) { - struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev, exempt_from_gc); - u32 hash_val; - unsigned int key_len = tbl->key_len; - int error; + u32 hash_val, key_len = tbl->key_len; + struct neighbour *n1, *rc, *n; struct neigh_hash_table *nht; + int error; + n = neigh_alloc(tbl, dev, flags, exempt_from_gc); trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc); - if (!n) { rc = ERR_PTR(-ENOBUFS); goto out; @@ -674,7 +674,7 @@ static struct neighbour *___neigh_create(struct neigh_table *tbl, struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref) { - return ___neigh_create(tbl, pkey, dev, false, want_ref); + return ___neigh_create(tbl, pkey, dev, 0, false, want_ref); } EXPORT_SYMBOL(__neigh_create); @@ -1942,7 +1942,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, exempt_from_gc = ndm->ndm_state & NUD_PERMANENT || ndm->ndm_flags & NTF_EXT_LEARNED; - neigh = ___neigh_create(tbl, dst, dev, exempt_from_gc, true); + neigh = ___neigh_create(tbl, dst, dev, + ndm->ndm_flags & NTF_EXT_LEARNED, + exempt_from_gc, true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); goto out; From 3dc20f4762c62d3b3f0940644881ed818aa7b2f5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 11 Oct 2021 14:12:36 +0200 Subject: [PATCH 2/4] net, neigh: Enable state migration between NUD_PERMANENT and NTF_USE Currently, it is not possible to migrate a neighbor entry between NUD_PERMANENT state and NTF_USE flag with a dynamic NUD state from a user space control plane. Similarly, it is not possible to add/remove NTF_EXT_LEARNED flag from an existing neighbor entry in combination with NTF_USE flag. This is due to the latter directly calling into neigh_event_send() without any meta data updates as happening in __neigh_update(). Thus, to enable this use case, extend the latter with a NEIGH_UPDATE_F_USE flag where we break the NUD_PERMANENT state in particular so that a latter neigh_event_send() is able to re-resolve a neighbor entry. Before fix, NUD_PERMANENT -> NUD_* & NTF_USE: # ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT [...] # ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT [...] As can be seen, despite the admin-triggered replace, the entry remains in the NUD_PERMANENT state. After fix, NUD_PERMANENT -> NUD_* & NTF_USE: # ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT [...] # ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn REACHABLE [...] # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn STALE [...] # ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT [...] After the fix, the admin-triggered replace switches to a dynamic state from the NTF_USE flag which triggered a new neighbor resolution. Likewise, we can transition back from there, if needed, into NUD_PERMANENT. Similar before/after behavior can be observed for below transitions: Before fix, NTF_USE -> NTF_USE | NTF_EXT_LEARNED -> NTF_USE: # ./ip/ip n replace 192.168.178.30 dev enp5s0 use # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE [...] # ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE [...] After fix, NTF_USE -> NTF_USE | NTF_EXT_LEARNED -> NTF_USE: # ./ip/ip n replace 192.168.178.30 dev enp5s0 use # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE [...] # ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn REACHABLE [...] # ./ip/ip n replace 192.168.178.30 dev enp5s0 use # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE [..] Signed-off-by: Daniel Borkmann Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/neighbour.h | 1 + net/core/neighbour.c | 22 +++++++++++++--------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 22ced1381ede5..eb2a7c03a5b00 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -253,6 +253,7 @@ static inline void *neighbour_priv(const struct neighbour *n) #define NEIGH_UPDATE_F_OVERRIDE 0x00000001 #define NEIGH_UPDATE_F_WEAK_OVERRIDE 0x00000002 #define NEIGH_UPDATE_F_OVERRIDE_ISROUTER 0x00000004 +#define NEIGH_UPDATE_F_USE 0x10000000 #define NEIGH_UPDATE_F_EXT_LEARNED 0x20000000 #define NEIGH_UPDATE_F_ISROUTER 0x40000000 #define NEIGH_UPDATE_F_ADMIN 0x80000000 diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8457d5f97022b..3e58037a8ae6f 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1217,7 +1217,7 @@ static void neigh_update_hhs(struct neighbour *neigh) lladdr instead of overriding it if it is different. NEIGH_UPDATE_F_ADMIN means that the change is administrative. - + NEIGH_UPDATE_F_USE means that the entry is user triggered. NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing NTF_ROUTER flag. NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as @@ -1255,6 +1255,12 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, goto out; ext_learn_change = neigh_update_ext_learned(neigh, flags, ¬ify); + if (flags & NEIGH_UPDATE_F_USE) { + new = old & ~NUD_PERMANENT; + neigh->nud_state = new; + err = 0; + goto out; + } if (!(new & NUD_VALID)) { neigh_del_timer(neigh); @@ -1963,22 +1969,20 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (protocol) neigh->protocol = protocol; - if (ndm->ndm_flags & NTF_EXT_LEARNED) flags |= NEIGH_UPDATE_F_EXT_LEARNED; - if (ndm->ndm_flags & NTF_ROUTER) flags |= NEIGH_UPDATE_F_ISROUTER; + if (ndm->ndm_flags & NTF_USE) + flags |= NEIGH_UPDATE_F_USE; - if (ndm->ndm_flags & NTF_USE) { + err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, + NETLINK_CB(skb).portid, extack); + if (!err && ndm->ndm_flags & NTF_USE) { neigh_event_send(neigh, NULL); err = 0; - } else - err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, - NETLINK_CB(skb).portid, extack); - + } neigh_release(neigh); - out: return err; } From 2c611ad97a82b51221bb0920cc6cac0b1d4c0e52 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Mon, 11 Oct 2021 14:12:37 +0200 Subject: [PATCH 3/4] net, neigh: Extend neigh->flags to 32 bit to allow for extensions Currently, all bits in struct ndmsg's ndm_flags are used up with the most recent addition of 435f2e7cc0b7 ("net: bridge: add support for sticky fdb entries"). This makes it impossible to extend the neighboring subsystem with new NTF_* flags: struct ndmsg { __u8 ndm_family; __u8 ndm_pad1; __u16 ndm_pad2; __s32 ndm_ifindex; __u16 ndm_state; __u8 ndm_flags; __u8 ndm_type; }; There are ndm_pad{1,2} attributes which are not used. However, due to uncareful design, the kernel does not enforce them to be zero upon new neighbor entry addition, and given they've been around forever, it is not possible to reuse them today due to risk of breakage. One option to overcome this limitation is to add a new NDA_FLAGS_EXT attribute for extended flags. In struct neighbour, there is a 3 byte hole between protocol and ha_lock, which allows neigh->flags to be extended from 8 to 32 bits while still being on the same cacheline as before. This also allows for all future NTF_* flags being in neigh->flags rather than yet another flags field. Unknown flags in NDA_FLAGS_EXT will be rejected by the kernel. Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/neighbour.h | 14 +++++---- include/uapi/linux/neighbour.h | 1 + net/core/neighbour.c | 55 ++++++++++++++++++++++++---------- 3 files changed, 50 insertions(+), 20 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index eb2a7c03a5b00..26d4ada0aea93 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -144,11 +144,11 @@ struct neighbour { struct timer_list timer; unsigned long used; atomic_t probes; - __u8 flags; - __u8 nud_state; - __u8 type; - __u8 dead; + u8 nud_state; + u8 type; + u8 dead; u8 protocol; + u32 flags; seqlock_t ha_lock; unsigned char ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))] __aligned(8); struct hh_cache hh; @@ -172,7 +172,7 @@ struct pneigh_entry { struct pneigh_entry *next; possible_net_t net; struct net_device *dev; - u8 flags; + u32 flags; u8 protocol; u8 key[]; }; @@ -258,6 +258,10 @@ static inline void *neighbour_priv(const struct neighbour *n) #define NEIGH_UPDATE_F_ISROUTER 0x40000000 #define NEIGH_UPDATE_F_ADMIN 0x80000000 +/* In-kernel representation for NDA_FLAGS_EXT flags: */ +#define NTF_OLD_MASK 0xff +#define NTF_EXT_SHIFT 8 + extern const struct nla_policy nda_policy[]; static inline bool neigh_key_eq16(const struct neighbour *n, const void *pkey) diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index 00a60695fa538..a80cca141855f 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -31,6 +31,7 @@ enum { NDA_PROTOCOL, /* Originator of entry */ NDA_NH_ID, NDA_FDB_EXT_ATTRS, + NDA_FLAGS_EXT, __NDA_MAX }; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 3e58037a8ae6f..5245e888c981c 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -159,7 +159,7 @@ static bool neigh_update_ext_learned(struct neighbour *neigh, u32 flags, int *notify) { bool rc = false; - u8 ndm_flags; + u32 ndm_flags; if (!(flags & NEIGH_UPDATE_F_ADMIN)) return rc; @@ -379,7 +379,7 @@ EXPORT_SYMBOL(neigh_ifdown); static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev, - u8 flags, bool exempt_from_gc) + u32 flags, bool exempt_from_gc) { struct neighbour *n = NULL; unsigned long now = jiffies; @@ -578,7 +578,7 @@ EXPORT_SYMBOL(neigh_lookup_nodev); static struct neighbour * ___neigh_create(struct neigh_table *tbl, const void *pkey, - struct net_device *dev, u8 flags, + struct net_device *dev, u32 flags, bool exempt_from_gc, bool want_ref) { u32 hash_val, key_len = tbl->key_len; @@ -1789,6 +1789,7 @@ const struct nla_policy nda_policy[NDA_MAX+1] = { [NDA_MASTER] = { .type = NLA_U32 }, [NDA_PROTOCOL] = { .type = NLA_U8 }, [NDA_NH_ID] = { .type = NLA_U32 }, + [NDA_FLAGS_EXT] = { .type = NLA_U32 }, [NDA_FDB_EXT_ATTRS] = { .type = NLA_NESTED }, }; @@ -1861,7 +1862,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE | - NEIGH_UPDATE_F_OVERRIDE_ISROUTER; + NEIGH_UPDATE_F_OVERRIDE_ISROUTER; struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; @@ -1870,6 +1871,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct neighbour *neigh; void *dst, *lladdr; u8 protocol = 0; + u32 ndm_flags; int err; ASSERT_RTNL(); @@ -1885,6 +1887,16 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, } ndm = nlmsg_data(nlh); + ndm_flags = ndm->ndm_flags; + if (tb[NDA_FLAGS_EXT]) { + u32 ext = nla_get_u32(tb[NDA_FLAGS_EXT]); + + if (ext & ~0) { + NL_SET_ERR_MSG(extack, "Invalid extended flags"); + goto out; + } + ndm_flags |= (ext << NTF_EXT_SHIFT); + } if (ndm->ndm_ifindex) { dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { @@ -1912,14 +1924,13 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[NDA_PROTOCOL]) protocol = nla_get_u8(tb[NDA_PROTOCOL]); - - if (ndm->ndm_flags & NTF_PROXY) { + if (ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; err = -ENOBUFS; pn = pneigh_lookup(tbl, net, dst, dev, 1); if (pn) { - pn->flags = ndm->ndm_flags; + pn->flags = ndm_flags; if (protocol) pn->protocol = protocol; err = 0; @@ -1947,9 +1958,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, } exempt_from_gc = ndm->ndm_state & NUD_PERMANENT || - ndm->ndm_flags & NTF_EXT_LEARNED; + ndm_flags & NTF_EXT_LEARNED; neigh = ___neigh_create(tbl, dst, dev, - ndm->ndm_flags & NTF_EXT_LEARNED, + ndm_flags & NTF_EXT_LEARNED, exempt_from_gc, true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); @@ -1969,16 +1980,16 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (protocol) neigh->protocol = protocol; - if (ndm->ndm_flags & NTF_EXT_LEARNED) + if (ndm_flags & NTF_EXT_LEARNED) flags |= NEIGH_UPDATE_F_EXT_LEARNED; - if (ndm->ndm_flags & NTF_ROUTER) + if (ndm_flags & NTF_ROUTER) flags |= NEIGH_UPDATE_F_ISROUTER; - if (ndm->ndm_flags & NTF_USE) + if (ndm_flags & NTF_USE) flags |= NEIGH_UPDATE_F_USE; err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, NETLINK_CB(skb).portid, extack); - if (!err && ndm->ndm_flags & NTF_USE) { + if (!err && ndm_flags & NTF_USE) { neigh_event_send(neigh, NULL); err = 0; } @@ -2433,6 +2444,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, u32 pid, u32 seq, int type, unsigned int flags) { + u32 neigh_flags, neigh_flags_ext; unsigned long now = jiffies; struct nda_cacheinfo ci; struct nlmsghdr *nlh; @@ -2442,11 +2454,14 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, if (nlh == NULL) return -EMSGSIZE; + neigh_flags_ext = neigh->flags >> NTF_EXT_SHIFT; + neigh_flags = neigh->flags & NTF_OLD_MASK; + ndm = nlmsg_data(nlh); ndm->ndm_family = neigh->ops->family; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; - ndm->ndm_flags = neigh->flags; + ndm->ndm_flags = neigh_flags; ndm->ndm_type = neigh->type; ndm->ndm_ifindex = neigh->dev->ifindex; @@ -2477,6 +2492,8 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, if (neigh->protocol && nla_put_u8(skb, NDA_PROTOCOL, neigh->protocol)) goto nla_put_failure; + if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) + goto nla_put_failure; nlmsg_end(skb, nlh); return 0; @@ -2490,6 +2507,7 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, u32 pid, u32 seq, int type, unsigned int flags, struct neigh_table *tbl) { + u32 neigh_flags, neigh_flags_ext; struct nlmsghdr *nlh; struct ndmsg *ndm; @@ -2497,11 +2515,14 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, if (nlh == NULL) return -EMSGSIZE; + neigh_flags_ext = pn->flags >> NTF_EXT_SHIFT; + neigh_flags = pn->flags & NTF_OLD_MASK; + ndm = nlmsg_data(nlh); ndm->ndm_family = tbl->family; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; - ndm->ndm_flags = pn->flags | NTF_PROXY; + ndm->ndm_flags = neigh_flags | NTF_PROXY; ndm->ndm_type = RTN_UNICAST; ndm->ndm_ifindex = pn->dev ? pn->dev->ifindex : 0; ndm->ndm_state = NUD_NONE; @@ -2511,6 +2532,8 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, if (pn->protocol && nla_put_u8(skb, NDA_PROTOCOL, pn->protocol)) goto nla_put_failure; + if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) + goto nla_put_failure; nlmsg_end(skb, nlh); return 0; @@ -2826,6 +2849,7 @@ static inline size_t neigh_nlmsg_size(void) + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ + nla_total_size(sizeof(struct nda_cacheinfo)) + nla_total_size(4) /* NDA_PROBES */ + + nla_total_size(4) /* NDA_FLAGS_EXT */ + nla_total_size(1); /* NDA_PROTOCOL */ } @@ -2854,6 +2878,7 @@ static inline size_t pneigh_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + + nla_total_size(4) /* NDA_FLAGS_EXT */ + nla_total_size(1); /* NDA_PROTOCOL */ } From 7482e3841d520a368426ac196720601687e2dc47 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 11 Oct 2021 14:12:38 +0200 Subject: [PATCH 4/4] net, neigh: Add NTF_MANAGED flag for managed neighbor entries Allow a user space control plane to insert entries with a new NTF_EXT_MANAGED flag. The flag then indicates to the kernel that the neighbor entry should be periodically probed for keeping the entry in NUD_REACHABLE state iff possible. The use case for this is targeting XDP or tc BPF load-balancers which use the bpf_fib_lookup() BPF helper in order to piggyback on neighbor resolution for their backends. Given they cannot be resolved in fast-path, a control plane inserts the L3 (without L2) entries manually into the neighbor table and lets the kernel do the neighbor resolution either on the gateway or on the backend directly in case the latter resides in the same L2. This avoids to deal with L2 in the control plane and to rebuild what the kernel already does best anyway. NTF_EXT_MANAGED can be combined with NTF_EXT_LEARNED in order to avoid GC eviction. The kernel then adds NTF_MANAGED flagged entries to a per-neighbor table which gets triggered by the system work queue to periodically call neigh_event_send() for performing the resolution. The implementation allows migration from/to NTF_MANAGED neighbor entries, so that already existing entries can be converted by the control plane if needed. Potentially, we could make the interval for periodically calling neigh_event_send() configurable; right now it's set to DELAY_PROBE_TIME which is also in line with mlxsw which has similar driver-internal infrastructure c723c735fa6b ("mlxsw: spectrum_router: Periodically update the kernel's neigh table"). In future, the latter could possibly reuse the NTF_MANAGED neighbors as well. Example: # ./ip/ip n replace 192.168.178.30 dev enp5s0 managed extern_learn # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a managed extern_learn REACHABLE [...] Signed-off-by: Daniel Borkmann Acked-by: Roopa Prabhu Link: https://linuxplumbersconf.org/event/11/contributions/953/ Signed-off-by: David S. Miller --- include/net/neighbour.h | 21 ++++-- include/uapi/linux/neighbour.h | 34 ++++++---- net/core/neighbour.c | 113 ++++++++++++++++++++++++--------- 3 files changed, 120 insertions(+), 48 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 26d4ada0aea93..e8e48be667552 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -155,6 +155,7 @@ struct neighbour { int (*output)(struct neighbour *, struct sk_buff *); const struct neigh_ops *ops; struct list_head gc_list; + struct list_head managed_list; struct rcu_head rcu; struct net_device *dev; u8 primary_key[0]; @@ -216,11 +217,13 @@ struct neigh_table { int gc_thresh3; unsigned long last_flush; struct delayed_work gc_work; + struct delayed_work managed_work; struct timer_list proxy_timer; struct sk_buff_head proxy_queue; atomic_t entries; atomic_t gc_entries; struct list_head gc_list; + struct list_head managed_list; rwlock_t lock; unsigned long last_rand; struct neigh_statistics __percpu *stats; @@ -250,17 +253,21 @@ static inline void *neighbour_priv(const struct neighbour *n) } /* flags for neigh_update() */ -#define NEIGH_UPDATE_F_OVERRIDE 0x00000001 -#define NEIGH_UPDATE_F_WEAK_OVERRIDE 0x00000002 -#define NEIGH_UPDATE_F_OVERRIDE_ISROUTER 0x00000004 -#define NEIGH_UPDATE_F_USE 0x10000000 -#define NEIGH_UPDATE_F_EXT_LEARNED 0x20000000 -#define NEIGH_UPDATE_F_ISROUTER 0x40000000 -#define NEIGH_UPDATE_F_ADMIN 0x80000000 +#define NEIGH_UPDATE_F_OVERRIDE BIT(0) +#define NEIGH_UPDATE_F_WEAK_OVERRIDE BIT(1) +#define NEIGH_UPDATE_F_OVERRIDE_ISROUTER BIT(2) +#define NEIGH_UPDATE_F_USE BIT(3) +#define NEIGH_UPDATE_F_MANAGED BIT(4) +#define NEIGH_UPDATE_F_EXT_LEARNED BIT(5) +#define NEIGH_UPDATE_F_ISROUTER BIT(6) +#define NEIGH_UPDATE_F_ADMIN BIT(7) /* In-kernel representation for NDA_FLAGS_EXT flags: */ #define NTF_OLD_MASK 0xff #define NTF_EXT_SHIFT 8 +#define NTF_EXT_MASK (NTF_EXT_MANAGED) + +#define NTF_MANAGED (NTF_EXT_MANAGED << NTF_EXT_SHIFT) extern const struct nla_policy nda_policy[]; diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index a80cca141855f..db05fb55055e9 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -41,14 +41,16 @@ enum { * Neighbor Cache Entry Flags */ -#define NTF_USE 0x01 -#define NTF_SELF 0x02 -#define NTF_MASTER 0x04 -#define NTF_PROXY 0x08 /* == ATF_PUBL */ -#define NTF_EXT_LEARNED 0x10 -#define NTF_OFFLOADED 0x20 -#define NTF_STICKY 0x40 -#define NTF_ROUTER 0x80 +#define NTF_USE (1 << 0) +#define NTF_SELF (1 << 1) +#define NTF_MASTER (1 << 2) +#define NTF_PROXY (1 << 3) /* == ATF_PUBL */ +#define NTF_EXT_LEARNED (1 << 4) +#define NTF_OFFLOADED (1 << 5) +#define NTF_STICKY (1 << 6) +#define NTF_ROUTER (1 << 7) +/* Extended flags under NDA_FLAGS_EXT: */ +#define NTF_EXT_MANAGED (1 << 0) /* * Neighbor Cache Entry States. @@ -66,12 +68,22 @@ enum { #define NUD_PERMANENT 0x80 #define NUD_NONE 0x00 -/* NUD_NOARP & NUD_PERMANENT are pseudostates, they never change - * and make no address resolution or NUD. - * NUD_PERMANENT also cannot be deleted by garbage collectors. +/* NUD_NOARP & NUD_PERMANENT are pseudostates, they never change and make no + * address resolution or NUD. + * + * NUD_PERMANENT also cannot be deleted by garbage collectors. This holds true + * for dynamic entries with NTF_EXT_LEARNED flag as well. However, upon carrier + * down event, NUD_PERMANENT entries are not flushed whereas NTF_EXT_LEARNED + * flagged entries explicitly are (which is also consistent with the routing + * subsystem). + * * When NTF_EXT_LEARNED is set for a bridge fdb entry the different cache entry * states don't make sense and thus are ignored. Such entries don't age and * can roam. + * + * NTF_EXT_MANAGED flagged neigbor entries are managed by the kernel on behalf + * of a user space control plane, and automatically refreshed so that (if + * possible) they remain in NUD_REACHABLE state. */ struct nda_cacheinfo { diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 5245e888c981c..eae73efa9245f 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -122,6 +122,8 @@ static void neigh_mark_dead(struct neighbour *n) list_del_init(&n->gc_list); atomic_dec(&n->tbl->gc_entries); } + if (!list_empty(&n->managed_list)) + list_del_init(&n->managed_list); } static void neigh_update_gc_list(struct neighbour *n) @@ -130,7 +132,6 @@ static void neigh_update_gc_list(struct neighbour *n) write_lock_bh(&n->tbl->lock); write_lock(&n->lock); - if (n->dead) goto out; @@ -149,32 +150,59 @@ static void neigh_update_gc_list(struct neighbour *n) list_add_tail(&n->gc_list, &n->tbl->gc_list); atomic_inc(&n->tbl->gc_entries); } +out: + write_unlock(&n->lock); + write_unlock_bh(&n->tbl->lock); +} + +static void neigh_update_managed_list(struct neighbour *n) +{ + bool on_managed_list, add_to_managed; + + write_lock_bh(&n->tbl->lock); + write_lock(&n->lock); + if (n->dead) + goto out; + + add_to_managed = n->flags & NTF_MANAGED; + on_managed_list = !list_empty(&n->managed_list); + if (!add_to_managed && on_managed_list) + list_del_init(&n->managed_list); + else if (add_to_managed && !on_managed_list) + list_add_tail(&n->managed_list, &n->tbl->managed_list); out: write_unlock(&n->lock); write_unlock_bh(&n->tbl->lock); } -static bool neigh_update_ext_learned(struct neighbour *neigh, u32 flags, - int *notify) +static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, + bool *gc_update, bool *managed_update) { - bool rc = false; - u32 ndm_flags; + u32 ndm_flags, old_flags = neigh->flags; if (!(flags & NEIGH_UPDATE_F_ADMIN)) - return rc; + return; + + ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; + ndm_flags |= (flags & NEIGH_UPDATE_F_MANAGED) ? NTF_MANAGED : 0; - ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; - if ((neigh->flags ^ ndm_flags) & NTF_EXT_LEARNED) { + if ((old_flags ^ ndm_flags) & NTF_EXT_LEARNED) { if (ndm_flags & NTF_EXT_LEARNED) neigh->flags |= NTF_EXT_LEARNED; else neigh->flags &= ~NTF_EXT_LEARNED; - rc = true; *notify = 1; + *gc_update = true; + } + if ((old_flags ^ ndm_flags) & NTF_MANAGED) { + if (ndm_flags & NTF_MANAGED) + neigh->flags |= NTF_MANAGED; + else + neigh->flags &= ~NTF_MANAGED; + *notify = 1; + *managed_update = true; } - - return rc; } static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np, @@ -422,6 +450,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, refcount_set(&n->refcnt, 1); n->dead = 1; INIT_LIST_HEAD(&n->gc_list); + INIT_LIST_HEAD(&n->managed_list); atomic_inc(&tbl->entries); out: @@ -650,7 +679,8 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, n->dead = 0; if (!exempt_from_gc) list_add_tail(&n->gc_list, &n->tbl->gc_list); - + if (n->flags & NTF_MANAGED) + list_add_tail(&n->managed_list, &n->tbl->managed_list); if (want_ref) neigh_hold(n); rcu_assign_pointer(n->next, @@ -1205,8 +1235,6 @@ static void neigh_update_hhs(struct neighbour *neigh) } } - - /* Generic update routine. -- lladdr is new lladdr or NULL, if it is not supplied. -- new is new state. @@ -1218,6 +1246,7 @@ static void neigh_update_hhs(struct neighbour *neigh) if it is different. NEIGH_UPDATE_F_ADMIN means that the change is administrative. NEIGH_UPDATE_F_USE means that the entry is user triggered. + NEIGH_UPDATE_F_MANAGED means that the entry will be auto-refreshed. NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing NTF_ROUTER flag. NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as @@ -1225,17 +1254,15 @@ static void neigh_update_hhs(struct neighbour *neigh) Caller MUST hold reference count on the entry. */ - static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid, struct netlink_ext_ack *extack) { - bool ext_learn_change = false; - u8 old; - int err; - int notify = 0; - struct net_device *dev; + bool gc_update = false, managed_update = false; int update_isrouter = 0; + struct net_device *dev; + int err, notify = 0; + u8 old; trace_neigh_update(neigh, lladdr, new, flags, nlmsg_pid); @@ -1254,8 +1281,8 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, (old & (NUD_NOARP | NUD_PERMANENT))) goto out; - ext_learn_change = neigh_update_ext_learned(neigh, flags, ¬ify); - if (flags & NEIGH_UPDATE_F_USE) { + neigh_update_flags(neigh, flags, ¬ify, &gc_update, &managed_update); + if (flags & (NEIGH_UPDATE_F_USE | NEIGH_UPDATE_F_MANAGED)) { new = old & ~NUD_PERMANENT; neigh->nud_state = new; err = 0; @@ -1405,15 +1432,13 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, if (update_isrouter) neigh_update_is_router(neigh, flags, ¬ify); write_unlock_bh(&neigh->lock); - - if (((new ^ old) & NUD_PERMANENT) || ext_learn_change) + if (((new ^ old) & NUD_PERMANENT) || gc_update) neigh_update_gc_list(neigh); - + if (managed_update) + neigh_update_managed_list(neigh); if (notify) neigh_update_notify(neigh, nlmsg_pid); - trace_neigh_update_done(neigh, err); - return err; } @@ -1539,6 +1564,20 @@ int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb) } EXPORT_SYMBOL(neigh_direct_output); +static void neigh_managed_work(struct work_struct *work) +{ + struct neigh_table *tbl = container_of(work, struct neigh_table, + managed_work.work); + struct neighbour *neigh; + + write_lock_bh(&tbl->lock); + list_for_each_entry(neigh, &tbl->managed_list, managed_list) + neigh_event_send(neigh, NULL); + queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, + NEIGH_VAR(&tbl->parms, DELAY_PROBE_TIME)); + write_unlock_bh(&tbl->lock); +} + static void neigh_proxy_process(struct timer_list *t) { struct neigh_table *tbl = from_timer(tbl, t, proxy_timer); @@ -1685,6 +1724,8 @@ void neigh_table_init(int index, struct neigh_table *tbl) INIT_LIST_HEAD(&tbl->parms_list); INIT_LIST_HEAD(&tbl->gc_list); + INIT_LIST_HEAD(&tbl->managed_list); + list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); refcount_set(&tbl->parms.refcnt, 1); @@ -1716,9 +1757,13 @@ void neigh_table_init(int index, struct neigh_table *tbl) WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN); rwlock_init(&tbl->lock); + INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, tbl->parms.reachable_time); + INIT_DEFERRABLE_WORK(&tbl->managed_work, neigh_managed_work); + queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, 0); + timer_setup(&tbl->proxy_timer, neigh_proxy_process, 0); skb_queue_head_init_class(&tbl->proxy_queue, &neigh_table_proxy_queue_class); @@ -1891,7 +1936,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[NDA_FLAGS_EXT]) { u32 ext = nla_get_u32(tb[NDA_FLAGS_EXT]); - if (ext & ~0) { + if (ext & ~NTF_EXT_MASK) { NL_SET_ERR_MSG(extack, "Invalid extended flags"); goto out; } @@ -1927,6 +1972,11 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; + if (ndm_flags & NTF_MANAGED) { + NL_SET_ERR_MSG(extack, "Invalid NTF_* flag combination"); + goto out; + } + err = -ENOBUFS; pn = pneigh_lookup(tbl, net, dst, dev, 1); if (pn) { @@ -1960,7 +2010,8 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, exempt_from_gc = ndm->ndm_state & NUD_PERMANENT || ndm_flags & NTF_EXT_LEARNED; neigh = ___neigh_create(tbl, dst, dev, - ndm_flags & NTF_EXT_LEARNED, + ndm_flags & + (NTF_EXT_LEARNED | NTF_MANAGED), exempt_from_gc, true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); @@ -1984,12 +2035,14 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, flags |= NEIGH_UPDATE_F_EXT_LEARNED; if (ndm_flags & NTF_ROUTER) flags |= NEIGH_UPDATE_F_ISROUTER; + if (ndm_flags & NTF_MANAGED) + flags |= NEIGH_UPDATE_F_MANAGED; if (ndm_flags & NTF_USE) flags |= NEIGH_UPDATE_F_USE; err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, NETLINK_CB(skb).portid, extack); - if (!err && ndm_flags & NTF_USE) { + if (!err && ndm_flags & (NTF_USE | NTF_MANAGED)) { neigh_event_send(neigh, NULL); err = 0; }