From 7462fe22cc74321eb663768848976d42eba3ddbb Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 7 Mar 2025 12:21:45 +0100 Subject: [PATCH 01/15] mptcp: pm: use addr entry for get_local_id The following code in mptcp_userspace_pm_get_local_id() that assigns "skc" to "new_entry" is not allowed in BPF if we use the same code to implement the get_local_id() interface of a BFP path manager: memset(&new_entry, 0, sizeof(struct mptcp_pm_addr_entry)); new_entry.addr = *skc; new_entry.addr.id = 0; new_entry.flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; To solve the issue, this patch moves this assignment to "new_entry" forward to mptcp_pm_get_local_id(), and then passing "new_entry" as a parameter to both mptcp_pm_nl_get_local_id() and mptcp_userspace_pm_get_local_id(). No behavioural changes intended. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250307-net-next-mptcp-pm-reorg-v1-1-abef20ada03b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 9 ++++++--- net/mptcp/pm_netlink.c | 11 ++++------- net/mptcp/pm_userspace.c | 17 ++++++----------- net/mptcp/protocol.h | 6 ++++-- 4 files changed, 20 insertions(+), 23 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 6c8cadf84f31..f6030ce04efd 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -406,7 +406,7 @@ bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) { - struct mptcp_addr_info skc_local; + struct mptcp_pm_addr_entry skc_local = { 0 }; struct mptcp_addr_info msk_local; if (WARN_ON_ONCE(!msk)) @@ -416,10 +416,13 @@ int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) * addr */ mptcp_local_address((struct sock_common *)msk, &msk_local); - mptcp_local_address((struct sock_common *)skc, &skc_local); - if (mptcp_addresses_equal(&msk_local, &skc_local, false)) + mptcp_local_address((struct sock_common *)skc, &skc_local.addr); + if (mptcp_addresses_equal(&msk_local, &skc_local.addr, false)) return 0; + skc_local.addr.id = 0; + skc_local.flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; + if (mptcp_pm_is_userspace(msk)) return mptcp_userspace_pm_get_local_id(msk, &skc_local); return mptcp_pm_nl_get_local_id(msk, &skc_local); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index a6387bcf848b..23c28e37ab8f 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1150,7 +1150,8 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, return err; } -int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc) +int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *skc) { struct mptcp_pm_addr_entry *entry; struct pm_nl_pernet *pernet; @@ -1159,7 +1160,7 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc pernet = pm_nl_get_pernet_from_msk(msk); rcu_read_lock(); - entry = __lookup_addr(pernet, skc); + entry = __lookup_addr(pernet, &skc->addr); ret = entry ? entry->addr.id : -1; rcu_read_unlock(); if (ret >= 0) @@ -1170,12 +1171,8 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc if (!entry) return -ENOMEM; - entry->addr = *skc; - entry->addr.id = 0; + *entry = *skc; entry->addr.port = 0; - entry->ifindex = 0; - entry->flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; - entry->lsk = NULL; ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, true, false); if (ret < 0) kfree(entry); diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 7e7d01bef5d4..8c45eebe9bbc 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -130,27 +130,22 @@ mptcp_userspace_pm_lookup_addr_by_id(struct mptcp_sock *msk, unsigned int id) } int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, - struct mptcp_addr_info *skc) + struct mptcp_pm_addr_entry *skc) { - struct mptcp_pm_addr_entry *entry = NULL, new_entry; __be16 msk_sport = ((struct inet_sock *) inet_sk((struct sock *)msk))->inet_sport; + struct mptcp_pm_addr_entry *entry; spin_lock_bh(&msk->pm.lock); - entry = mptcp_userspace_pm_lookup_addr(msk, skc); + entry = mptcp_userspace_pm_lookup_addr(msk, &skc->addr); spin_unlock_bh(&msk->pm.lock); if (entry) return entry->addr.id; - memset(&new_entry, 0, sizeof(struct mptcp_pm_addr_entry)); - new_entry.addr = *skc; - new_entry.addr.id = 0; - new_entry.flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; - - if (new_entry.addr.port == msk_sport) - new_entry.addr.port = 0; + if (skc->addr.port == msk_sport) + skc->addr.port = 0; - return mptcp_userspace_pm_append_new_local_addr(msk, &new_entry, true); + return mptcp_userspace_pm_append_new_local_addr(msk, skc, true); } bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 7b74dedc7936..333d20a018b4 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1121,8 +1121,10 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_rm_list *rm_list); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); -int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); -int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); +int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *skc); +int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *skc); bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc); bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc); bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc); From fac7a6ddc75740ee3d24c7fa054921f9c495d8c2 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 7 Mar 2025 12:21:46 +0100 Subject: [PATCH 02/15] mptcp: pm: remove '_nl' from mptcp_pm_nl_addr_send_ack Currently, in-kernel PM specific helpers are prefixed with 'mptcp_pm_nl_'. But here 'mptcp_pm_nl_addr_send_ack()' is not specific to this PM: it is used by both the in-kernel and userspace PMs. To avoid confusions, the '_nl' bit has been removed from the name. No behavioural changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250307-net-next-mptcp-pm-reorg-v1-2-abef20ada03b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 2 +- net/mptcp/pm_netlink.c | 8 ++++---- net/mptcp/pm_userspace.c | 2 +- net/mptcp/protocol.h | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index f6030ce04efd..ece706e8ed22 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -57,7 +57,7 @@ int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_ msk->pm.rm_list_tx = *rm_list; rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL); WRITE_ONCE(msk->pm.addr_signal, rm_addr); - mptcp_pm_nl_addr_send_ack(msk); + mptcp_pm_addr_send_ack(msk); return 0; } diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 23c28e37ab8f..a70a688eae84 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -606,7 +606,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) local.addr.id = 0; mptcp_pm_announce_addr(msk, &local.addr, false); - mptcp_pm_nl_addr_send_ack(msk); + mptcp_pm_addr_send_ack(msk); if (local.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) signal_and_subflow = true; @@ -740,7 +740,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) remote = msk->pm.remote; mptcp_pm_announce_addr(msk, &remote, true); - mptcp_pm_nl_addr_send_ack(msk); + mptcp_pm_addr_send_ack(msk); if (lookup_subflow_by_daddr(&msk->conn_list, &remote)) return; @@ -781,7 +781,7 @@ bool mptcp_pm_nl_is_init_remote_addr(struct mptcp_sock *msk, return mptcp_addresses_equal(&mpc_remote, remote, remote->port); } -void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) +void mptcp_pm_addr_send_ack(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow, *alt = NULL; @@ -942,7 +942,7 @@ void mptcp_pm_nl_work(struct mptcp_sock *msk) } if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) { pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK); - mptcp_pm_nl_addr_send_ack(msk); + mptcp_pm_addr_send_ack(msk); } if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) { pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED); diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 8c45eebe9bbc..b41e1aaa1d1c 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -234,7 +234,7 @@ int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info) if (mptcp_pm_alloc_anno_list(msk, &addr_val.addr)) { msk->pm.add_addr_signaled++; mptcp_pm_announce_addr(msk, &addr_val.addr, false); - mptcp_pm_nl_addr_send_ack(msk); + mptcp_pm_addr_send_ack(msk); } spin_unlock_bh(&msk->pm.lock); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 333d20a018b4..2a3eb2392b3b 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1008,7 +1008,7 @@ void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk); bool mptcp_pm_nl_is_init_remote_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *remote); -void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk); +void mptcp_pm_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); From d1734987992c977f1515a5208688aced653c1869 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 7 Mar 2025 12:21:47 +0100 Subject: [PATCH 03/15] mptcp: pm: remove '_nl' from mptcp_pm_nl_mp_prio_send_ack Currently, in-kernel PM specific helpers are prefixed with 'mptcp_pm_nl_'. But here 'mptcp_pm_nl_mp_prio_send_ack()' is not specific to this PM: it is used by both the in-kernel and userspace PMs. To avoid confusions, the '_nl' bit has been removed from the name. No behavioural changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250307-net-next-mptcp-pm-reorg-v1-3-abef20ada03b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 10 +++++----- net/mptcp/pm_userspace.c | 4 ++-- net/mptcp/protocol.h | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index a70a688eae84..5494b5b409dc 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -808,10 +808,10 @@ void mptcp_pm_addr_send_ack(struct mptcp_sock *msk) mptcp_pm_send_ack(msk, alt, false, false); } -int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, - struct mptcp_addr_info *addr, - struct mptcp_addr_info *rem, - u8 bkup) +int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, + struct mptcp_addr_info *addr, + struct mptcp_addr_info *rem, + u8 bkup) { struct mptcp_subflow_context *subflow; @@ -1936,7 +1936,7 @@ static void mptcp_nl_set_flags(struct net *net, lock_sock(sk); if (changed & MPTCP_PM_ADDR_FLAG_BACKUP) - mptcp_pm_nl_mp_prio_send_ack(msk, &local->addr, NULL, bkup); + mptcp_pm_mp_prio_send_ack(msk, &local->addr, NULL, bkup); /* Subflows will only be recreated if the SUBFLOW flag is set */ if (is_subflow && (changed & MPTCP_PM_ADDR_FLAG_FULLMESH)) mptcp_pm_nl_fullmesh(msk, &local->addr); diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index b41e1aaa1d1c..2626b2b092d4 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -605,10 +605,10 @@ int mptcp_userspace_pm_set_flags(struct mptcp_pm_addr_entry *local, spin_unlock_bh(&msk->pm.lock); lock_sock(sk); - ret = mptcp_pm_nl_mp_prio_send_ack(msk, &local->addr, &rem, bkup); + ret = mptcp_pm_mp_prio_send_ack(msk, &local->addr, &rem, bkup); release_sock(sk); - /* mptcp_pm_nl_mp_prio_send_ack() only fails in one case */ + /* mptcp_pm_mp_prio_send_ack() only fails in one case */ if (ret < 0) GENL_SET_ERR_MSG(info, "subflow not found"); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 2a3eb2392b3b..5508343f2c69 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1013,10 +1013,10 @@ void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq); -int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, - struct mptcp_addr_info *addr, - struct mptcp_addr_info *rem, - u8 bkup); +int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, + struct mptcp_addr_info *addr, + struct mptcp_addr_info *rem, + u8 bkup); bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); void mptcp_pm_free_anno_list(struct mptcp_sock *msk); From 551a9ad7879df1c7d604b27272fe84032a040074 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 7 Mar 2025 12:21:48 +0100 Subject: [PATCH 04/15] mptcp: pm: remove '_nl' from mptcp_pm_nl_work Currently, in-kernel PM specific helpers are prefixed with 'mptcp_pm_nl_'. But here 'mptcp_pm_nl_work' is not specific to this PM: it is called from the core to call helpers, some of them needed by both the in-kernel and userspace PMs. To avoid confusions, the '_nl' bit has been removed from the name. Also used 'worker' instead of 'work', similar to protocol.c's worker. No behavioural changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250307-net-next-mptcp-pm-reorg-v1-4-abef20ada03b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 2 +- net/mptcp/protocol.c | 2 +- net/mptcp/protocol.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 5494b5b409dc..f6f7ea25640b 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -924,7 +924,7 @@ static void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, mptcp_pm_nl_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW); } -void mptcp_pm_nl_work(struct mptcp_sock *msk) +void mptcp_pm_worker(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index ec23e65ef0f1..ac946263ec64 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2681,7 +2681,7 @@ static void mptcp_worker(struct work_struct *work) mptcp_check_fastclose(msk); - mptcp_pm_nl_work(msk); + mptcp_pm_worker(msk); mptcp_check_send_data_fin(sk); mptcp_check_data_fin_ack(sk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 5508343f2c69..f29f4dd28fc5 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1147,7 +1147,7 @@ static inline u8 subflow_get_local_id(const struct mptcp_subflow_context *subflo } void __init mptcp_pm_nl_init(void); -void mptcp_pm_nl_work(struct mptcp_sock *msk); +void mptcp_pm_worker(struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk); From 63611391850850bf27f81afb0d0b6d1237a34006 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 7 Mar 2025 12:21:49 +0100 Subject: [PATCH 05/15] mptcp: pm: remove '_nl' from mptcp_pm_nl_rm_addr_received Currently, in-kernel PM specific helpers are prefixed with 'mptcp_pm_nl_'. But here 'mptcp_pm_nl_rm_addr_received' is not specific to this PM: it is called from the PM worker, and used by both the in-kernel and userspace PMs. The helper has been renamed to 'mptcp_pm_rm_addr_recv' instead of '_received' to avoid confusions with the one from pm.c. mptcp_pm_nl_rm_addr_or_subflow', and 'mptcp_pm_nl_rm_subflow_received' have been updated too for the same reason. To avoid confusions, the '_nl' bit has been removed from the name. While at it, the in-kernel PM specific code has been move from mptcp_pm_rm_addr_or_subflow to a new dedicated helper, clearer. No behavioural changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250307-net-next-mptcp-pm-reorg-v1-5-abef20ada03b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 55 +++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index f6f7ea25640b..09ef3aa025e7 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -838,9 +838,20 @@ int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, return -EINVAL; } -static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, - const struct mptcp_rm_list *rm_list, - enum linux_mptcp_mib_field rm_type) +static void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id) +{ + if (rm_id && WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { + /* Note: if the subflow has been closed before, this + * add_addr_accepted counter will not be decremented. + */ + if (--msk->pm.add_addr_accepted < mptcp_pm_get_add_addr_accept_max(msk)) + WRITE_ONCE(msk->pm.accept_addr, true); + } +} + +static void mptcp_pm_rm_addr_or_subflow(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list, + enum linux_mptcp_mib_field rm_type) { struct mptcp_subflow_context *subflow, *tmp; struct sock *sk = (struct sock *)msk; @@ -893,35 +904,23 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, __MPTCP_INC_STATS(sock_net(sk), rm_type); } - if (rm_type == MPTCP_MIB_RMADDR) + if (rm_type == MPTCP_MIB_RMADDR) { __MPTCP_INC_STATS(sock_net(sk), rm_type); - - if (!removed) - continue; - - if (!mptcp_pm_is_kernel(msk)) - continue; - - if (rm_type == MPTCP_MIB_RMADDR && rm_id && - !WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { - /* Note: if the subflow has been closed before, this - * add_addr_accepted counter will not be decremented. - */ - if (--msk->pm.add_addr_accepted < mptcp_pm_get_add_addr_accept_max(msk)) - WRITE_ONCE(msk->pm.accept_addr, true); + if (removed && mptcp_pm_is_kernel(msk)) + mptcp_pm_nl_rm_addr(msk, rm_id); } } } -static void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk) +static void mptcp_pm_rm_addr_recv(struct mptcp_sock *msk) { - mptcp_pm_nl_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR); + mptcp_pm_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR); } -static void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, - const struct mptcp_rm_list *rm_list) +static void mptcp_pm_rm_subflow(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list) { - mptcp_pm_nl_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW); + mptcp_pm_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW); } void mptcp_pm_worker(struct mptcp_sock *msk) @@ -946,7 +945,7 @@ void mptcp_pm_worker(struct mptcp_sock *msk) } if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) { pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED); - mptcp_pm_nl_rm_addr_received(msk); + mptcp_pm_rm_addr_recv(msk); } if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); @@ -1538,7 +1537,7 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, list.ids[0] = mptcp_endp_get_local_id(msk, addr); if (remove_subflow) { spin_lock_bh(&msk->pm.lock); - mptcp_pm_nl_rm_subflow_received(msk, &list); + mptcp_pm_rm_subflow(msk, &list); spin_unlock_bh(&msk->pm.lock); } @@ -1583,7 +1582,7 @@ static int mptcp_nl_remove_id_zero_address(struct net *net, lock_sock(sk); spin_lock_bh(&msk->pm.lock); mptcp_pm_remove_addr(msk, &list); - mptcp_pm_nl_rm_subflow_received(msk, &list); + mptcp_pm_rm_subflow(msk, &list); __mark_subflow_endp_available(msk, 0); spin_unlock_bh(&msk->pm.lock); release_sock(sk); @@ -1670,7 +1669,7 @@ static void mptcp_pm_flush_addrs_and_subflows(struct mptcp_sock *msk, mptcp_pm_remove_addr(msk, &alist); } if (slist.nr) - mptcp_pm_nl_rm_subflow_received(msk, &slist); + mptcp_pm_rm_subflow(msk, &slist); /* Reset counters: maybe some subflows have been removed before */ bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); msk->pm.local_addr_used = 0; @@ -1910,7 +1909,7 @@ static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk, list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr); spin_lock_bh(&msk->pm.lock); - mptcp_pm_nl_rm_subflow_received(msk, &list); + mptcp_pm_rm_subflow(msk, &list); __mark_subflow_endp_available(msk, list.ids[0]); mptcp_pm_create_subflow_or_signal_addr(msk); spin_unlock_bh(&msk->pm.lock); From 550c50bbc2b7950dd4ea1082c016f84469509eab Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 7 Mar 2025 12:21:50 +0100 Subject: [PATCH 06/15] mptcp: pm: remove '_nl' from mptcp_pm_nl_subflow_chk_stale() Currently, in-kernel PM specific helpers are prefixed with 'mptcp_pm_nl_'. But here 'mptcp_pm_nl_subflow_chk_stale' is not specific to this PM: it is called from pm.c for both the in-kernel and userspace PMs. To avoid confusions, the '_nl' bit has been removed from the name. No behavioural changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250307-net-next-mptcp-pm-reorg-v1-6-abef20ada03b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 2 +- net/mptcp/pm_netlink.c | 2 +- net/mptcp/protocol.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index ece706e8ed22..14c7ff5c606c 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -567,7 +567,7 @@ void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) } else if (subflow->stale_rcv_tstamp == rcv_tstamp) { if (subflow->stale_count < U8_MAX) subflow->stale_count++; - mptcp_pm_nl_subflow_chk_stale(msk, ssk); + mptcp_pm_subflows_chk_stale(msk, ssk); } else { subflow->stale_count = 0; mptcp_subflow_set_active(subflow); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 09ef3aa025e7..43667ad4c4ae 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1203,7 +1203,7 @@ static const struct genl_multicast_group mptcp_pm_mcgrps[] = { }, }; -void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) +void mptcp_pm_subflows_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) { struct mptcp_subflow_context *iter, *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index f29f4dd28fc5..a5db1a297fbc 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -992,7 +992,7 @@ bool mptcp_pm_addr_families_match(const struct sock *sk, const struct mptcp_addr_info *loc, const struct mptcp_addr_info *rem); void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); -void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); +void mptcp_pm_subflows_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side); void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk); bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); From 498d7d8b75f16a5b6e4f6499f1e72e9296f2d0cd Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 7 Mar 2025 12:21:51 +0100 Subject: [PATCH 07/15] mptcp: pm: remove '_nl' from mptcp_pm_nl_is_init_remote_addr Currently, in-kernel PM specific helpers are prefixed with 'mptcp_pm_nl_'. But here 'mptcp_pm_nl_is_init_remote_addr' is not specific to this PM: it is called from pm.c for both the in-kernel and userspace PMs. To avoid confusions, the '_nl' bit has been removed from the name. No behavioural changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250307-net-next-mptcp-pm-reorg-v1-7-abef20ada03b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 2 +- net/mptcp/pm_netlink.c | 4 ++-- net/mptcp/protocol.h | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 14c7ff5c606c..ab443b9f9c5f 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -231,7 +231,7 @@ void mptcp_pm_add_addr_received(const struct sock *ssk, __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP); } /* id0 should not have a different address */ - } else if ((addr->id == 0 && !mptcp_pm_nl_is_init_remote_addr(msk, addr)) || + } else if ((addr->id == 0 && !mptcp_pm_is_init_remote_addr(msk, addr)) || (addr->id > 0 && !READ_ONCE(pm->accept_addr))) { mptcp_pm_announce_addr(msk, addr, true); mptcp_pm_add_addr_send_ack(msk); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 43667ad4c4ae..029a74162b0b 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -772,8 +772,8 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) } } -bool mptcp_pm_nl_is_init_remote_addr(struct mptcp_sock *msk, - const struct mptcp_addr_info *remote) +bool mptcp_pm_is_init_remote_addr(struct mptcp_sock *msk, + const struct mptcp_addr_info *remote) { struct mptcp_addr_info mpc_remote; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index a5db1a297fbc..39bcad1def6b 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1006,8 +1006,8 @@ void mptcp_pm_add_addr_received(const struct sock *ssk, void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk); -bool mptcp_pm_nl_is_init_remote_addr(struct mptcp_sock *msk, - const struct mptcp_addr_info *remote); +bool mptcp_pm_is_init_remote_addr(struct mptcp_sock *msk, + const struct mptcp_addr_info *remote); void mptcp_pm_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); From 40aa7409d30df2b610f8e590cbb183faa2ac6f1f Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 7 Mar 2025 12:21:52 +0100 Subject: [PATCH 08/15] mptcp: pm: kernel: add '_pm' to mptcp_nl_set_flags Currently, in-kernel PM specific helpers are prefixed with 'mptcp_pm_nl_'. Here, '_pm' was missing from 'mptcp_nl_set_flags'. Add '_pm' to be similar to others, and add '_all' to avoid confusions witih the global 'mptcp_pm_nl_set_flags'. No behavioural changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250307-net-next-mptcp-pm-reorg-v1-8-abef20ada03b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 029a74162b0b..781831c50691 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1915,9 +1915,9 @@ static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk, spin_unlock_bh(&msk->pm.lock); } -static void mptcp_nl_set_flags(struct net *net, - struct mptcp_pm_addr_entry *local, - u8 changed) +static void mptcp_pm_nl_set_flags_all(struct net *net, + struct mptcp_pm_addr_entry *local, + u8 changed) { u8 is_subflow = !!(local->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW); u8 bkup = !!(local->flags & MPTCP_PM_ADDR_FLAG_BACKUP); @@ -1992,7 +1992,7 @@ int mptcp_pm_nl_set_flags(struct mptcp_pm_addr_entry *local, *local = *entry; spin_unlock_bh(&pernet->lock); - mptcp_nl_set_flags(net, local, changed); + mptcp_pm_nl_set_flags_all(net, local, changed); return 0; } From a17336b2b2e0339b8107b9d4d0fd6b7cf51172e4 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 7 Mar 2025 12:21:53 +0100 Subject: [PATCH 09/15] mptcp: pm: avoid calling PM specific code from core When destroying an MPTCP socket, some userspace PM specific code was called from mptcp_destroy_common() in protocol.c. That feels wrong, and it is the only case. Instead, the core now calls mptcp_pm_destroy() from pm.c which is now in charge of cleaning the announced addresses list, and ask the different PMs to do extra cleaning if needed, e.g. the userspace PM, if used, will clean the local addresses list. While at it, the userspace PM specific helper has been prefixed with 'mptcp_userspace_pm_' like the other ones. No behavioural changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250307-net-next-mptcp-pm-reorg-v1-9-abef20ada03b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 8 ++++++++ net/mptcp/pm_userspace.c | 5 +---- net/mptcp/protocol.c | 3 +-- net/mptcp/protocol.h | 3 ++- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index ab443b9f9c5f..17f99924dfa0 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -599,6 +599,14 @@ bool mptcp_pm_addr_families_match(const struct sock *sk, #endif } +void mptcp_pm_destroy(struct mptcp_sock *msk) +{ + mptcp_pm_free_anno_list(msk); + + if (mptcp_pm_is_userspace(msk)) + mptcp_userspace_pm_free_local_addr_list(msk); +} + void mptcp_pm_data_reset(struct mptcp_sock *msk) { u8 pm_type = mptcp_get_pm_type(sock_net((struct sock *)msk)); diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 2626b2b092d4..13856df22673 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -12,15 +12,12 @@ list_for_each_entry(__entry, \ &((__msk)->pm.userspace_pm_local_addr_list), list) -void mptcp_free_local_addr_list(struct mptcp_sock *msk) +void mptcp_userspace_pm_free_local_addr_list(struct mptcp_sock *msk) { struct mptcp_pm_addr_entry *entry, *tmp; struct sock *sk = (struct sock *)msk; LIST_HEAD(free_list); - if (!mptcp_pm_is_userspace(msk)) - return; - spin_lock_bh(&msk->pm.lock); list_splice_init(&msk->pm.userspace_pm_local_addr_list, &free_list); spin_unlock_bh(&msk->pm.lock); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index ac946263ec64..ad780ae1d30d 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3302,8 +3302,7 @@ void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) * inet_sock_destruct() will dispose it */ mptcp_token_destroy(msk); - mptcp_pm_free_anno_list(msk); - mptcp_free_local_addr_list(msk); + mptcp_pm_destroy(msk); } static void mptcp_destroy(struct sock *sk) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 39bcad1def6b..0013b68a2731 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -983,6 +983,7 @@ __sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum su void __init mptcp_pm_init(void); void mptcp_pm_data_init(struct mptcp_sock *msk); void mptcp_pm_data_reset(struct mptcp_sock *msk); +void mptcp_pm_destroy(struct mptcp_sock *msk); int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, struct mptcp_addr_info *addr); int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, @@ -1042,7 +1043,7 @@ int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_ void mptcp_pm_remove_addr_entry(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *entry); -void mptcp_free_local_addr_list(struct mptcp_sock *msk); +void mptcp_userspace_pm_free_local_addr_list(struct mptcp_sock *msk); void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); From a49eb8ae95b8ce24f30a027f286baf8cac08a0ae Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 7 Mar 2025 12:21:54 +0100 Subject: [PATCH 10/15] mptcp: pm: worker: split in-kernel and common tasks To make it clear what actions are in-kernel PM specific and which ones are not and done for all PMs, e.g. sending ADD_ADDR and close associated subflows when a RM_ADDR is received. The behavioural is changed a bit: MPTCP_PM_ADD_ADDR_RECEIVED is now treated after MPTCP_PM_ADD_ADDR_SEND_ACK and MPTCP_PM_RM_ADDR_RECEIVED, but that should not change anything in practice. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250307-net-next-mptcp-pm-reorg-v1-10-abef20ada03b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 25 +++++++++++++++++++++++++ net/mptcp/pm_netlink.c | 23 +++-------------------- net/mptcp/protocol.h | 2 ++ 3 files changed, 30 insertions(+), 20 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 17f99924dfa0..ddf9d0dc6274 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -599,6 +599,31 @@ bool mptcp_pm_addr_families_match(const struct sock *sk, #endif } +void mptcp_pm_worker(struct mptcp_sock *msk) +{ + struct mptcp_pm_data *pm = &msk->pm; + + msk_owned_by_me(msk); + + if (!(pm->status & MPTCP_PM_WORK_MASK)) + return; + + spin_lock_bh(&msk->pm.lock); + + pr_debug("msk=%p status=%x\n", msk, pm->status); + if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) { + pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK); + mptcp_pm_addr_send_ack(msk); + } + if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) { + pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED); + mptcp_pm_rm_addr_recv(msk); + } + __mptcp_pm_kernel_worker(msk); + + spin_unlock_bh(&msk->pm.lock); +} + void mptcp_pm_destroy(struct mptcp_sock *msk) { mptcp_pm_free_anno_list(msk); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 781831c50691..37986208b9c0 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -912,7 +912,7 @@ static void mptcp_pm_rm_addr_or_subflow(struct mptcp_sock *msk, } } -static void mptcp_pm_rm_addr_recv(struct mptcp_sock *msk) +void mptcp_pm_rm_addr_recv(struct mptcp_sock *msk) { mptcp_pm_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR); } @@ -923,30 +923,15 @@ static void mptcp_pm_rm_subflow(struct mptcp_sock *msk, mptcp_pm_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW); } -void mptcp_pm_worker(struct mptcp_sock *msk) +/* Called under PM lock */ +void __mptcp_pm_kernel_worker(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; - msk_owned_by_me(msk); - - if (!(pm->status & MPTCP_PM_WORK_MASK)) - return; - - spin_lock_bh(&msk->pm.lock); - - pr_debug("msk=%p status=%x\n", msk, pm->status); if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); mptcp_pm_nl_add_addr_received(msk); } - if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) { - pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK); - mptcp_pm_addr_send_ack(msk); - } - if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) { - pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED); - mptcp_pm_rm_addr_recv(msk); - } if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); mptcp_pm_nl_fully_established(msk); @@ -955,8 +940,6 @@ void mptcp_pm_worker(struct mptcp_sock *msk) pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); mptcp_pm_nl_subflow_established(msk); } - - spin_unlock_bh(&msk->pm.lock); } static bool address_use_port(struct mptcp_pm_addr_entry *entry) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 0013b68a2731..d4725b32aa56 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1010,6 +1010,7 @@ void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk); bool mptcp_pm_is_init_remote_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *remote); void mptcp_pm_addr_send_ack(struct mptcp_sock *msk); +void mptcp_pm_rm_addr_recv(struct mptcp_sock *msk); void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); @@ -1149,6 +1150,7 @@ static inline u8 subflow_get_local_id(const struct mptcp_subflow_context *subflo void __init mptcp_pm_nl_init(void); void mptcp_pm_worker(struct mptcp_sock *msk); +void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk); From a14673127236c131cb3a5dbefe41d5c4171bd8a7 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 7 Mar 2025 12:21:55 +0100 Subject: [PATCH 11/15] mptcp: pm: export mptcp_remote_address In a following commit, the 'remote_address' helper will need to be used from different files. It is then exported, and prefixed with 'mptcp_', similar to 'mptcp_local_address'. No behavioural changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250307-net-next-mptcp-pm-reorg-v1-11-abef20ada03b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 14 +++++++------- net/mptcp/protocol.h | 5 ++++- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 37986208b9c0..27b8daf3bc3f 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -94,8 +94,8 @@ void mptcp_local_address(const struct sock_common *skc, struct mptcp_addr_info * #endif } -static void remote_address(const struct sock_common *skc, - struct mptcp_addr_info *addr) +void mptcp_remote_address(const struct sock_common *skc, + struct mptcp_addr_info *addr) { addr->family = skc->skc_family; addr->port = skc->skc_dport; @@ -138,7 +138,7 @@ static bool lookup_subflow_by_daddr(const struct list_head *list, (TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_SYN_RECV))) continue; - remote_address((struct sock_common *)ssk, &cur); + mptcp_remote_address((struct sock_common *)ssk, &cur); if (mptcp_addresses_equal(&cur, daddr, daddr->port)) return true; } @@ -428,7 +428,7 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, int i = 0; subflows_max = mptcp_pm_get_subflows_max(msk); - remote_address((struct sock_common *)sk, &remote); + mptcp_remote_address((struct sock_common *)sk, &remote); /* Non-fullmesh endpoint, fill in the single entry * corresponding to the primary MPC subflow remote address @@ -455,7 +455,7 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, mptcp_for_each_subflow(msk, subflow) { ssk = mptcp_subflow_tcp_sock(subflow); - remote_address((struct sock_common *)ssk, &addrs[i]); + mptcp_remote_address((struct sock_common *)ssk, &addrs[i]); addrs[i].id = READ_ONCE(subflow->remote_id); if (deny_id0 && !addrs[i].id) continue; @@ -777,7 +777,7 @@ bool mptcp_pm_is_init_remote_addr(struct mptcp_sock *msk, { struct mptcp_addr_info mpc_remote; - remote_address((struct sock_common *)msk, &mpc_remote); + mptcp_remote_address((struct sock_common *)msk, &mpc_remote); return mptcp_addresses_equal(&mpc_remote, remote, remote->port); } @@ -826,7 +826,7 @@ int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, continue; if (rem && rem->family != AF_UNSPEC) { - remote_address((struct sock_common *)ssk, &remote); + mptcp_remote_address((struct sock_common *)ssk, &remote); if (!mptcp_addresses_equal(&remote, rem, rem->port)) continue; } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index d4725b32aa56..f66c3d28333f 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -724,7 +724,10 @@ void mptcp_set_state(struct sock *sk, int state); bool mptcp_addresses_equal(const struct mptcp_addr_info *a, const struct mptcp_addr_info *b, bool use_port); -void mptcp_local_address(const struct sock_common *skc, struct mptcp_addr_info *addr); +void mptcp_local_address(const struct sock_common *skc, + struct mptcp_addr_info *addr); +void mptcp_remote_address(const struct sock_common *skc, + struct mptcp_addr_info *addr); /* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local, From bcc32640ada037f834e61f358e2ee384fdb4039a Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 7 Mar 2025 12:21:56 +0100 Subject: [PATCH 12/15] mptcp: pm: move generic helper at the top In prevision to another change importing all generic PM helpers from pm_netlink.c to there. No behavioural changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250307-net-next-mptcp-pm-reorg-v1-12-abef20ada03b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 54 +++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index ddf9d0dc6274..cd50c5a0c78e 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -12,6 +12,33 @@ #include "mib.h" #include "mptcp_pm_gen.h" +/* path manager helpers */ + +/* if sk is ipv4 or ipv6_only allows only same-family local and remote addresses, + * otherwise allow any matching local/remote pair + */ +bool mptcp_pm_addr_families_match(const struct sock *sk, + const struct mptcp_addr_info *loc, + const struct mptcp_addr_info *rem) +{ + bool mptcp_is_v4 = sk->sk_family == AF_INET; + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + bool loc_is_v4 = loc->family == AF_INET || ipv6_addr_v4mapped(&loc->addr6); + bool rem_is_v4 = rem->family == AF_INET || ipv6_addr_v4mapped(&rem->addr6); + + if (mptcp_is_v4) + return loc_is_v4 && rem_is_v4; + + if (ipv6_only_sock(sk)) + return !loc_is_v4 && !rem_is_v4; + + return loc_is_v4 == rem_is_v4; +#else + return mptcp_is_v4 && loc->family == AF_INET && rem->family == AF_INET; +#endif +} + /* path manager command handlers */ int mptcp_pm_announce_addr(struct mptcp_sock *msk, @@ -325,8 +352,6 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq) } } -/* path manager helpers */ - bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, unsigned int opt_size, unsigned int remaining, struct mptcp_addr_info *addr, bool *echo, @@ -574,31 +599,6 @@ void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) } } -/* if sk is ipv4 or ipv6_only allows only same-family local and remote addresses, - * otherwise allow any matching local/remote pair - */ -bool mptcp_pm_addr_families_match(const struct sock *sk, - const struct mptcp_addr_info *loc, - const struct mptcp_addr_info *rem) -{ - bool mptcp_is_v4 = sk->sk_family == AF_INET; - -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - bool loc_is_v4 = loc->family == AF_INET || ipv6_addr_v4mapped(&loc->addr6); - bool rem_is_v4 = rem->family == AF_INET || ipv6_addr_v4mapped(&rem->addr6); - - if (mptcp_is_v4) - return loc_is_v4 && rem_is_v4; - - if (ipv6_only_sock(sk)) - return !loc_is_v4 && !rem_is_v4; - - return loc_is_v4 == rem_is_v4; -#else - return mptcp_is_v4 && loc->family == AF_INET && rem->family == AF_INET; -#endif -} - void mptcp_pm_worker(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; From e4c28e3d5c09097b5ca7bc5690e01f48adb33811 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 7 Mar 2025 12:21:57 +0100 Subject: [PATCH 13/15] mptcp: pm: move generic PM helpers to pm.c Before this patch, the PM code was dispersed in different places: - pm.c had common code for all PMs - pm_netlink.c was supposed to be about the in-kernel PM, but also had exported common helpers, callbacks used by the different PMs, NL events for PM userspace daemon, etc. quite confusing. - pm_userspace.c had userspace PM only code, but using specific in-kernel PM helpers To clarify the code, a reorganisation is suggested here, only by moving code around, and (un)exporting functions: - helpers used from both PMs and not linked to Netlink - callbacks used by different PMs, e.g. ADD_ADDR management - some helpers have been marked as 'static' - protocol.h has been updated accordingly - (while at it, a needless if before a kfree(), spot by checkpatch in mptcp_remove_anno_list_by_saddr(), has been removed) The code around the PM is now less confusing, which should help for the maintenance in the long term. This will certainly impact future backports, but because other cleanups have already done recently, and more are coming to ease the addition of a new path-manager controlled with BPF (struct_ops), doing that now seems to be a good time. Also, many issues around the PM have been fixed a few months ago while increasing the code coverage in the selftests, so such big reorganisation can be done with more confidence now. No behavioural changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250307-net-next-mptcp-pm-reorg-v1-13-abef20ada03b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 460 ++++++++++++++++++++++++++++++++++++++++ net/mptcp/pm_netlink.c | 461 +---------------------------------------- net/mptcp/protocol.h | 14 +- 3 files changed, 467 insertions(+), 468 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index cd50c5a0c78e..d02a0b3adfc4 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -12,6 +12,16 @@ #include "mib.h" #include "mptcp_pm_gen.h" +#define ADD_ADDR_RETRANS_MAX 3 + +struct mptcp_pm_add_entry { + struct list_head list; + struct mptcp_addr_info addr; + u8 retrans_times; + struct timer_list add_timer; + struct mptcp_sock *sock; +}; + /* path manager helpers */ /* if sk is ipv4 or ipv6_only allows only same-family local and remote addresses, @@ -39,6 +49,345 @@ bool mptcp_pm_addr_families_match(const struct sock *sk, #endif } +bool mptcp_addresses_equal(const struct mptcp_addr_info *a, + const struct mptcp_addr_info *b, bool use_port) +{ + bool addr_equals = false; + + if (a->family == b->family) { + if (a->family == AF_INET) + addr_equals = a->addr.s_addr == b->addr.s_addr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else + addr_equals = ipv6_addr_equal(&a->addr6, &b->addr6); + } else if (a->family == AF_INET) { + if (ipv6_addr_v4mapped(&b->addr6)) + addr_equals = a->addr.s_addr == b->addr6.s6_addr32[3]; + } else if (b->family == AF_INET) { + if (ipv6_addr_v4mapped(&a->addr6)) + addr_equals = a->addr6.s6_addr32[3] == b->addr.s_addr; +#endif + } + + if (!addr_equals) + return false; + if (!use_port) + return true; + + return a->port == b->port; +} + +void mptcp_local_address(const struct sock_common *skc, + struct mptcp_addr_info *addr) +{ + addr->family = skc->skc_family; + addr->port = htons(skc->skc_num); + if (addr->family == AF_INET) + addr->addr.s_addr = skc->skc_rcv_saddr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (addr->family == AF_INET6) + addr->addr6 = skc->skc_v6_rcv_saddr; +#endif +} + +void mptcp_remote_address(const struct sock_common *skc, + struct mptcp_addr_info *addr) +{ + addr->family = skc->skc_family; + addr->port = skc->skc_dport; + if (addr->family == AF_INET) + addr->addr.s_addr = skc->skc_daddr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (addr->family == AF_INET6) + addr->addr6 = skc->skc_v6_daddr; +#endif +} + +static bool mptcp_pm_is_init_remote_addr(struct mptcp_sock *msk, + const struct mptcp_addr_info *remote) +{ + struct mptcp_addr_info mpc_remote; + + mptcp_remote_address((struct sock_common *)msk, &mpc_remote); + return mptcp_addresses_equal(&mpc_remote, remote, remote->port); +} + +bool mptcp_lookup_subflow_by_saddr(const struct list_head *list, + const struct mptcp_addr_info *saddr) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_addr_info cur; + struct sock_common *skc; + + list_for_each_entry(subflow, list, node) { + skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow); + + mptcp_local_address(skc, &cur); + if (mptcp_addresses_equal(&cur, saddr, saddr->port)) + return true; + } + + return false; +} + +static struct mptcp_pm_add_entry * +mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) +{ + struct mptcp_pm_add_entry *entry; + + lockdep_assert_held(&msk->pm.lock); + + list_for_each_entry(entry, &msk->pm.anno_list, list) { + if (mptcp_addresses_equal(&entry->addr, addr, true)) + return entry; + } + + return NULL; +} + +bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) +{ + struct mptcp_pm_add_entry *entry; + + entry = mptcp_pm_del_add_timer(msk, addr, false); + kfree(entry); + return entry; +} + +bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk) +{ + struct mptcp_pm_add_entry *entry; + struct mptcp_addr_info saddr; + bool ret = false; + + mptcp_local_address((struct sock_common *)sk, &saddr); + + spin_lock_bh(&msk->pm.lock); + list_for_each_entry(entry, &msk->pm.anno_list, list) { + if (mptcp_addresses_equal(&entry->addr, &saddr, true)) { + ret = true; + goto out; + } + } + +out: + spin_unlock_bh(&msk->pm.lock); + return ret; +} + +static void __mptcp_pm_send_ack(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow, + bool prio, bool backup) +{ + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow; + + pr_debug("send ack for %s\n", + prio ? "mp_prio" : + (mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr")); + + slow = lock_sock_fast(ssk); + if (prio) { + subflow->send_mp_prio = 1; + subflow->request_bkup = backup; + } + + __mptcp_subflow_send_ack(ssk); + unlock_sock_fast(ssk, slow); +} + +void mptcp_pm_send_ack(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow, + bool prio, bool backup) +{ + spin_unlock_bh(&msk->pm.lock); + __mptcp_pm_send_ack(msk, subflow, prio, backup); + spin_lock_bh(&msk->pm.lock); +} + +void mptcp_pm_addr_send_ack(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow, *alt = NULL; + + msk_owned_by_me(msk); + lockdep_assert_held(&msk->pm.lock); + + if (!mptcp_pm_should_add_signal(msk) && + !mptcp_pm_should_rm_signal(msk)) + return; + + mptcp_for_each_subflow(msk, subflow) { + if (__mptcp_subflow_active(subflow)) { + if (!subflow->stale) { + mptcp_pm_send_ack(msk, subflow, false, false); + return; + } + + if (!alt) + alt = subflow; + } + } + + if (alt) + mptcp_pm_send_ack(msk, alt, false, false); +} + +int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, + struct mptcp_addr_info *addr, + struct mptcp_addr_info *rem, + u8 bkup) +{ + struct mptcp_subflow_context *subflow; + + pr_debug("bkup=%d\n", bkup); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + struct mptcp_addr_info local, remote; + + mptcp_local_address((struct sock_common *)ssk, &local); + if (!mptcp_addresses_equal(&local, addr, addr->port)) + continue; + + if (rem && rem->family != AF_UNSPEC) { + mptcp_remote_address((struct sock_common *)ssk, &remote); + if (!mptcp_addresses_equal(&remote, rem, rem->port)) + continue; + } + + __mptcp_pm_send_ack(msk, subflow, true, bkup); + return 0; + } + + return -EINVAL; +} + +static void mptcp_pm_add_timer(struct timer_list *timer) +{ + struct mptcp_pm_add_entry *entry = from_timer(entry, timer, add_timer); + struct mptcp_sock *msk = entry->sock; + struct sock *sk = (struct sock *)msk; + + pr_debug("msk=%p\n", msk); + + if (!msk) + return; + + if (inet_sk_state_load(sk) == TCP_CLOSE) + return; + + if (!entry->addr.id) + return; + + if (mptcp_pm_should_add_signal_addr(msk)) { + sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8); + goto out; + } + + spin_lock_bh(&msk->pm.lock); + + if (!mptcp_pm_should_add_signal_addr(msk)) { + pr_debug("retransmit ADD_ADDR id=%d\n", entry->addr.id); + mptcp_pm_announce_addr(msk, &entry->addr, false); + mptcp_pm_add_addr_send_ack(msk); + entry->retrans_times++; + } + + if (entry->retrans_times < ADD_ADDR_RETRANS_MAX) + sk_reset_timer(sk, timer, + jiffies + mptcp_get_add_addr_timeout(sock_net(sk))); + + spin_unlock_bh(&msk->pm.lock); + + if (entry->retrans_times == ADD_ADDR_RETRANS_MAX) + mptcp_pm_subflow_established(msk); + +out: + __sock_put(sk); +} + +struct mptcp_pm_add_entry * +mptcp_pm_del_add_timer(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr, bool check_id) +{ + struct mptcp_pm_add_entry *entry; + struct sock *sk = (struct sock *)msk; + struct timer_list *add_timer = NULL; + + spin_lock_bh(&msk->pm.lock); + entry = mptcp_lookup_anno_list_by_saddr(msk, addr); + if (entry && (!check_id || entry->addr.id == addr->id)) { + entry->retrans_times = ADD_ADDR_RETRANS_MAX; + add_timer = &entry->add_timer; + } + if (!check_id && entry) + list_del(&entry->list); + spin_unlock_bh(&msk->pm.lock); + + /* no lock, because sk_stop_timer_sync() is calling del_timer_sync() */ + if (add_timer) + sk_stop_timer_sync(sk, add_timer); + + return entry; +} + +bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) +{ + struct mptcp_pm_add_entry *add_entry = NULL; + struct sock *sk = (struct sock *)msk; + struct net *net = sock_net(sk); + + lockdep_assert_held(&msk->pm.lock); + + add_entry = mptcp_lookup_anno_list_by_saddr(msk, addr); + + if (add_entry) { + if (WARN_ON_ONCE(mptcp_pm_is_kernel(msk))) + return false; + + sk_reset_timer(sk, &add_entry->add_timer, + jiffies + mptcp_get_add_addr_timeout(net)); + return true; + } + + add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC); + if (!add_entry) + return false; + + list_add(&add_entry->list, &msk->pm.anno_list); + + add_entry->addr = *addr; + add_entry->sock = msk; + add_entry->retrans_times = 0; + + timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); + sk_reset_timer(sk, &add_entry->add_timer, + jiffies + mptcp_get_add_addr_timeout(net)); + + return true; +} + +static void mptcp_pm_free_anno_list(struct mptcp_sock *msk) +{ + struct mptcp_pm_add_entry *entry, *tmp; + struct sock *sk = (struct sock *)msk; + LIST_HEAD(free_list); + + pr_debug("msk=%p\n", msk); + + spin_lock_bh(&msk->pm.lock); + list_splice_init(&msk->pm.anno_list, &free_list); + spin_unlock_bh(&msk->pm.lock); + + list_for_each_entry_safe(entry, tmp, &free_list, list) { + sk_stop_timer_sync(sk, &entry->add_timer); + kfree(entry); + } +} + /* path manager command handlers */ int mptcp_pm_announce_addr(struct mptcp_sock *msk, @@ -297,6 +646,80 @@ void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk) mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_SEND_ACK); } +static void mptcp_pm_rm_addr_or_subflow(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list, + enum linux_mptcp_mib_field rm_type) +{ + struct mptcp_subflow_context *subflow, *tmp; + struct sock *sk = (struct sock *)msk; + u8 i; + + pr_debug("%s rm_list_nr %d\n", + rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", rm_list->nr); + + msk_owned_by_me(msk); + + if (sk->sk_state == TCP_LISTEN) + return; + + if (!rm_list->nr) + return; + + if (list_empty(&msk->conn_list)) + return; + + for (i = 0; i < rm_list->nr; i++) { + u8 rm_id = rm_list->ids[i]; + bool removed = false; + + mptcp_for_each_subflow_safe(msk, subflow, tmp) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + u8 remote_id = READ_ONCE(subflow->remote_id); + int how = RCV_SHUTDOWN | SEND_SHUTDOWN; + u8 id = subflow_get_local_id(subflow); + + if ((1 << inet_sk_state_load(ssk)) & + (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | TCPF_CLOSE)) + continue; + if (rm_type == MPTCP_MIB_RMADDR && remote_id != rm_id) + continue; + if (rm_type == MPTCP_MIB_RMSUBFLOW && id != rm_id) + continue; + + pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u\n", + rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", + i, rm_id, id, remote_id, msk->mpc_endpoint_id); + spin_unlock_bh(&msk->pm.lock); + mptcp_subflow_shutdown(sk, ssk, how); + removed |= subflow->request_join; + + /* the following takes care of updating the subflows counter */ + mptcp_close_ssk(sk, ssk, subflow); + spin_lock_bh(&msk->pm.lock); + + if (rm_type == MPTCP_MIB_RMSUBFLOW) + __MPTCP_INC_STATS(sock_net(sk), rm_type); + } + + if (rm_type == MPTCP_MIB_RMADDR) { + __MPTCP_INC_STATS(sock_net(sk), rm_type); + if (removed && mptcp_pm_is_kernel(msk)) + mptcp_pm_nl_rm_addr(msk, rm_id); + } + } +} + +static void mptcp_pm_rm_addr_recv(struct mptcp_sock *msk) +{ + mptcp_pm_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR); +} + +void mptcp_pm_rm_subflow(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list) +{ + mptcp_pm_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW); +} + void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list) { @@ -580,6 +1003,43 @@ int mptcp_pm_nl_set_flags_doit(struct sk_buff *skb, struct genl_info *info) return mptcp_pm_set_flags(info); } +static void mptcp_pm_subflows_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) +{ + struct mptcp_subflow_context *iter, *subflow = mptcp_subflow_ctx(ssk); + struct sock *sk = (struct sock *)msk; + unsigned int active_max_loss_cnt; + struct net *net = sock_net(sk); + unsigned int stale_loss_cnt; + bool slow; + + stale_loss_cnt = mptcp_stale_loss_cnt(net); + if (subflow->stale || !stale_loss_cnt || subflow->stale_count <= stale_loss_cnt) + return; + + /* look for another available subflow not in loss state */ + active_max_loss_cnt = max_t(int, stale_loss_cnt - 1, 1); + mptcp_for_each_subflow(msk, iter) { + if (iter != subflow && mptcp_subflow_active(iter) && + iter->stale_count < active_max_loss_cnt) { + /* we have some alternatives, try to mark this subflow as idle ...*/ + slow = lock_sock_fast(ssk); + if (!tcp_rtx_and_write_queues_empty(ssk)) { + subflow->stale = 1; + __mptcp_retransmit_pending_data(sk); + MPTCP_INC_STATS(net, MPTCP_MIB_SUBFLOWSTALE); + } + unlock_sock_fast(ssk, slow); + + /* always try to push the pending data regardless of re-injections: + * we can possibly use backup subflows now, and subflow selection + * is cheap under the msk socket lock + */ + __mptcp_push_pending(sk, 0); + return; + } + } +} + void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 27b8daf3bc3f..e4abb94e8c0b 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -18,14 +18,6 @@ static int pm_nl_pernet_id; -struct mptcp_pm_add_entry { - struct list_head list; - struct mptcp_addr_info addr; - u8 retrans_times; - struct timer_list add_timer; - struct mptcp_sock *sock; -}; - struct pm_nl_pernet { /* protects pernet updates */ spinlock_t lock; @@ -41,7 +33,6 @@ struct pm_nl_pernet { }; #define MPTCP_PM_ADDR_MAX 8 -#define ADD_ADDR_RETRANS_MAX 3 static struct pm_nl_pernet *pm_nl_get_pernet(const struct net *net) { @@ -54,77 +45,6 @@ pm_nl_get_pernet_from_msk(const struct mptcp_sock *msk) return pm_nl_get_pernet(sock_net((struct sock *)msk)); } -bool mptcp_addresses_equal(const struct mptcp_addr_info *a, - const struct mptcp_addr_info *b, bool use_port) -{ - bool addr_equals = false; - - if (a->family == b->family) { - if (a->family == AF_INET) - addr_equals = a->addr.s_addr == b->addr.s_addr; -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - else - addr_equals = ipv6_addr_equal(&a->addr6, &b->addr6); - } else if (a->family == AF_INET) { - if (ipv6_addr_v4mapped(&b->addr6)) - addr_equals = a->addr.s_addr == b->addr6.s6_addr32[3]; - } else if (b->family == AF_INET) { - if (ipv6_addr_v4mapped(&a->addr6)) - addr_equals = a->addr6.s6_addr32[3] == b->addr.s_addr; -#endif - } - - if (!addr_equals) - return false; - if (!use_port) - return true; - - return a->port == b->port; -} - -void mptcp_local_address(const struct sock_common *skc, struct mptcp_addr_info *addr) -{ - addr->family = skc->skc_family; - addr->port = htons(skc->skc_num); - if (addr->family == AF_INET) - addr->addr.s_addr = skc->skc_rcv_saddr; -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - else if (addr->family == AF_INET6) - addr->addr6 = skc->skc_v6_rcv_saddr; -#endif -} - -void mptcp_remote_address(const struct sock_common *skc, - struct mptcp_addr_info *addr) -{ - addr->family = skc->skc_family; - addr->port = skc->skc_dport; - if (addr->family == AF_INET) - addr->addr.s_addr = skc->skc_daddr; -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - else if (addr->family == AF_INET6) - addr->addr6 = skc->skc_v6_daddr; -#endif -} - -bool mptcp_lookup_subflow_by_saddr(const struct list_head *list, - const struct mptcp_addr_info *saddr) -{ - struct mptcp_subflow_context *subflow; - struct mptcp_addr_info cur; - struct sock_common *skc; - - list_for_each_entry(subflow, list, node) { - skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow); - - mptcp_local_address(skc, &cur); - if (mptcp_addresses_equal(&cur, saddr, saddr->port)) - return true; - } - - return false; -} - static bool lookup_subflow_by_daddr(const struct list_head *list, const struct mptcp_addr_info *daddr) { @@ -251,167 +171,6 @@ bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk) return true; } -struct mptcp_pm_add_entry * -mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, - const struct mptcp_addr_info *addr) -{ - struct mptcp_pm_add_entry *entry; - - lockdep_assert_held(&msk->pm.lock); - - list_for_each_entry(entry, &msk->pm.anno_list, list) { - if (mptcp_addresses_equal(&entry->addr, addr, true)) - return entry; - } - - return NULL; -} - -bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk) -{ - struct mptcp_pm_add_entry *entry; - struct mptcp_addr_info saddr; - bool ret = false; - - mptcp_local_address((struct sock_common *)sk, &saddr); - - spin_lock_bh(&msk->pm.lock); - list_for_each_entry(entry, &msk->pm.anno_list, list) { - if (mptcp_addresses_equal(&entry->addr, &saddr, true)) { - ret = true; - goto out; - } - } - -out: - spin_unlock_bh(&msk->pm.lock); - return ret; -} - -static void mptcp_pm_add_timer(struct timer_list *timer) -{ - struct mptcp_pm_add_entry *entry = from_timer(entry, timer, add_timer); - struct mptcp_sock *msk = entry->sock; - struct sock *sk = (struct sock *)msk; - - pr_debug("msk=%p\n", msk); - - if (!msk) - return; - - if (inet_sk_state_load(sk) == TCP_CLOSE) - return; - - if (!entry->addr.id) - return; - - if (mptcp_pm_should_add_signal_addr(msk)) { - sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8); - goto out; - } - - spin_lock_bh(&msk->pm.lock); - - if (!mptcp_pm_should_add_signal_addr(msk)) { - pr_debug("retransmit ADD_ADDR id=%d\n", entry->addr.id); - mptcp_pm_announce_addr(msk, &entry->addr, false); - mptcp_pm_add_addr_send_ack(msk); - entry->retrans_times++; - } - - if (entry->retrans_times < ADD_ADDR_RETRANS_MAX) - sk_reset_timer(sk, timer, - jiffies + mptcp_get_add_addr_timeout(sock_net(sk))); - - spin_unlock_bh(&msk->pm.lock); - - if (entry->retrans_times == ADD_ADDR_RETRANS_MAX) - mptcp_pm_subflow_established(msk); - -out: - __sock_put(sk); -} - -struct mptcp_pm_add_entry * -mptcp_pm_del_add_timer(struct mptcp_sock *msk, - const struct mptcp_addr_info *addr, bool check_id) -{ - struct mptcp_pm_add_entry *entry; - struct sock *sk = (struct sock *)msk; - struct timer_list *add_timer = NULL; - - spin_lock_bh(&msk->pm.lock); - entry = mptcp_lookup_anno_list_by_saddr(msk, addr); - if (entry && (!check_id || entry->addr.id == addr->id)) { - entry->retrans_times = ADD_ADDR_RETRANS_MAX; - add_timer = &entry->add_timer; - } - if (!check_id && entry) - list_del(&entry->list); - spin_unlock_bh(&msk->pm.lock); - - /* no lock, because sk_stop_timer_sync() is calling del_timer_sync() */ - if (add_timer) - sk_stop_timer_sync(sk, add_timer); - - return entry; -} - -bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, - const struct mptcp_addr_info *addr) -{ - struct mptcp_pm_add_entry *add_entry = NULL; - struct sock *sk = (struct sock *)msk; - struct net *net = sock_net(sk); - - lockdep_assert_held(&msk->pm.lock); - - add_entry = mptcp_lookup_anno_list_by_saddr(msk, addr); - - if (add_entry) { - if (WARN_ON_ONCE(mptcp_pm_is_kernel(msk))) - return false; - - sk_reset_timer(sk, &add_entry->add_timer, - jiffies + mptcp_get_add_addr_timeout(net)); - return true; - } - - add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC); - if (!add_entry) - return false; - - list_add(&add_entry->list, &msk->pm.anno_list); - - add_entry->addr = *addr; - add_entry->sock = msk; - add_entry->retrans_times = 0; - - timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); - sk_reset_timer(sk, &add_entry->add_timer, - jiffies + mptcp_get_add_addr_timeout(net)); - - return true; -} - -void mptcp_pm_free_anno_list(struct mptcp_sock *msk) -{ - struct mptcp_pm_add_entry *entry, *tmp; - struct sock *sk = (struct sock *)msk; - LIST_HEAD(free_list); - - pr_debug("msk=%p\n", msk); - - spin_lock_bh(&msk->pm.lock); - list_splice_init(&msk->pm.anno_list, &free_list); - spin_unlock_bh(&msk->pm.lock); - - list_for_each_entry_safe(entry, tmp, &free_list, list) { - sk_stop_timer_sync(sk, &entry->add_timer); - kfree(entry); - } -} - /* Fill all the remote addresses into the array addrs[], * and return the array size. */ @@ -480,33 +239,6 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, return i; } -static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, - bool prio, bool backup) -{ - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - bool slow; - - pr_debug("send ack for %s\n", - prio ? "mp_prio" : (mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr")); - - slow = lock_sock_fast(ssk); - if (prio) { - subflow->send_mp_prio = 1; - subflow->request_bkup = backup; - } - - __mptcp_subflow_send_ack(ssk); - unlock_sock_fast(ssk, slow); -} - -static void mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, - bool prio, bool backup) -{ - spin_unlock_bh(&msk->pm.lock); - __mptcp_pm_send_ack(msk, subflow, prio, backup); - spin_lock_bh(&msk->pm.lock); -} - static struct mptcp_pm_addr_entry * __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) { @@ -772,73 +504,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) } } -bool mptcp_pm_is_init_remote_addr(struct mptcp_sock *msk, - const struct mptcp_addr_info *remote) -{ - struct mptcp_addr_info mpc_remote; - - mptcp_remote_address((struct sock_common *)msk, &mpc_remote); - return mptcp_addresses_equal(&mpc_remote, remote, remote->port); -} - -void mptcp_pm_addr_send_ack(struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow, *alt = NULL; - - msk_owned_by_me(msk); - lockdep_assert_held(&msk->pm.lock); - - if (!mptcp_pm_should_add_signal(msk) && - !mptcp_pm_should_rm_signal(msk)) - return; - - mptcp_for_each_subflow(msk, subflow) { - if (__mptcp_subflow_active(subflow)) { - if (!subflow->stale) { - mptcp_pm_send_ack(msk, subflow, false, false); - return; - } - - if (!alt) - alt = subflow; - } - } - - if (alt) - mptcp_pm_send_ack(msk, alt, false, false); -} - -int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, - struct mptcp_addr_info *addr, - struct mptcp_addr_info *rem, - u8 bkup) -{ - struct mptcp_subflow_context *subflow; - - pr_debug("bkup=%d\n", bkup); - - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - struct mptcp_addr_info local, remote; - - mptcp_local_address((struct sock_common *)ssk, &local); - if (!mptcp_addresses_equal(&local, addr, addr->port)) - continue; - - if (rem && rem->family != AF_UNSPEC) { - mptcp_remote_address((struct sock_common *)ssk, &remote); - if (!mptcp_addresses_equal(&remote, rem, rem->port)) - continue; - } - - __mptcp_pm_send_ack(msk, subflow, true, bkup); - return 0; - } - - return -EINVAL; -} - -static void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id) +void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id) { if (rm_id && WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { /* Note: if the subflow has been closed before, this @@ -849,80 +515,6 @@ static void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id) } } -static void mptcp_pm_rm_addr_or_subflow(struct mptcp_sock *msk, - const struct mptcp_rm_list *rm_list, - enum linux_mptcp_mib_field rm_type) -{ - struct mptcp_subflow_context *subflow, *tmp; - struct sock *sk = (struct sock *)msk; - u8 i; - - pr_debug("%s rm_list_nr %d\n", - rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", rm_list->nr); - - msk_owned_by_me(msk); - - if (sk->sk_state == TCP_LISTEN) - return; - - if (!rm_list->nr) - return; - - if (list_empty(&msk->conn_list)) - return; - - for (i = 0; i < rm_list->nr; i++) { - u8 rm_id = rm_list->ids[i]; - bool removed = false; - - mptcp_for_each_subflow_safe(msk, subflow, tmp) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - u8 remote_id = READ_ONCE(subflow->remote_id); - int how = RCV_SHUTDOWN | SEND_SHUTDOWN; - u8 id = subflow_get_local_id(subflow); - - if ((1 << inet_sk_state_load(ssk)) & - (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | TCPF_CLOSE)) - continue; - if (rm_type == MPTCP_MIB_RMADDR && remote_id != rm_id) - continue; - if (rm_type == MPTCP_MIB_RMSUBFLOW && id != rm_id) - continue; - - pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u\n", - rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", - i, rm_id, id, remote_id, msk->mpc_endpoint_id); - spin_unlock_bh(&msk->pm.lock); - mptcp_subflow_shutdown(sk, ssk, how); - removed |= subflow->request_join; - - /* the following takes care of updating the subflows counter */ - mptcp_close_ssk(sk, ssk, subflow); - spin_lock_bh(&msk->pm.lock); - - if (rm_type == MPTCP_MIB_RMSUBFLOW) - __MPTCP_INC_STATS(sock_net(sk), rm_type); - } - - if (rm_type == MPTCP_MIB_RMADDR) { - __MPTCP_INC_STATS(sock_net(sk), rm_type); - if (removed && mptcp_pm_is_kernel(msk)) - mptcp_pm_nl_rm_addr(msk, rm_id); - } - } -} - -void mptcp_pm_rm_addr_recv(struct mptcp_sock *msk) -{ - mptcp_pm_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR); -} - -static void mptcp_pm_rm_subflow(struct mptcp_sock *msk, - const struct mptcp_rm_list *rm_list) -{ - mptcp_pm_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW); -} - /* Called under PM lock */ void __mptcp_pm_kernel_worker(struct mptcp_sock *msk) { @@ -1186,43 +778,6 @@ static const struct genl_multicast_group mptcp_pm_mcgrps[] = { }, }; -void mptcp_pm_subflows_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) -{ - struct mptcp_subflow_context *iter, *subflow = mptcp_subflow_ctx(ssk); - struct sock *sk = (struct sock *)msk; - unsigned int active_max_loss_cnt; - struct net *net = sock_net(sk); - unsigned int stale_loss_cnt; - bool slow; - - stale_loss_cnt = mptcp_stale_loss_cnt(net); - if (subflow->stale || !stale_loss_cnt || subflow->stale_count <= stale_loss_cnt) - return; - - /* look for another available subflow not in loss state */ - active_max_loss_cnt = max_t(int, stale_loss_cnt - 1, 1); - mptcp_for_each_subflow(msk, iter) { - if (iter != subflow && mptcp_subflow_active(iter) && - iter->stale_count < active_max_loss_cnt) { - /* we have some alternatives, try to mark this subflow as idle ...*/ - slow = lock_sock_fast(ssk); - if (!tcp_rtx_and_write_queues_empty(ssk)) { - subflow->stale = 1; - __mptcp_retransmit_pending_data(sk); - MPTCP_INC_STATS(net, MPTCP_MIB_SUBFLOWSTALE); - } - unlock_sock_fast(ssk, slow); - - /* always try to push the pending data regardless of re-injections: - * we can possibly use backup subflows now, and subflow selection - * is cheap under the msk socket lock - */ - __mptcp_push_pending(sk, 0); - return; - } - } -} - static int mptcp_pm_family_to_addr(int family) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) @@ -1445,20 +1000,6 @@ int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info) return ret; } -bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk, - const struct mptcp_addr_info *addr) -{ - struct mptcp_pm_add_entry *entry; - - entry = mptcp_pm_del_add_timer(msk, addr, false); - if (entry) { - kfree(entry); - return true; - } - - return false; -} - static u8 mptcp_endp_get_local_id(struct mptcp_sock *msk, const struct mptcp_addr_info *addr) { diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index f66c3d28333f..360d8cfa5279 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -996,7 +996,6 @@ bool mptcp_pm_addr_families_match(const struct sock *sk, const struct mptcp_addr_info *loc, const struct mptcp_addr_info *rem); void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); -void mptcp_pm_subflows_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side); void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk); bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); @@ -1010,10 +1009,13 @@ void mptcp_pm_add_addr_received(const struct sock *ssk, void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk); -bool mptcp_pm_is_init_remote_addr(struct mptcp_sock *msk, - const struct mptcp_addr_info *remote); +void mptcp_pm_send_ack(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow, + bool prio, bool backup); void mptcp_pm_addr_send_ack(struct mptcp_sock *msk); -void mptcp_pm_rm_addr_recv(struct mptcp_sock *msk); +void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id); +void mptcp_pm_rm_subflow(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list); void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); @@ -1024,14 +1026,10 @@ int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, u8 bkup); bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); -void mptcp_pm_free_anno_list(struct mptcp_sock *msk); bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk); struct mptcp_pm_add_entry * mptcp_pm_del_add_timer(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool check_id); -struct mptcp_pm_add_entry * -mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, - const struct mptcp_addr_info *addr); bool mptcp_lookup_subflow_by_saddr(const struct list_head *list, const struct mptcp_addr_info *saddr); bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk, From 8617e85e04bd08c6c6aa24c832f9bec10bf76648 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 7 Mar 2025 12:21:58 +0100 Subject: [PATCH 14/15] mptcp: pm: split in-kernel PM specific code Before this patch, the PM code was dispersed in different places: - pm.c had common code for all PMs - pm_netlink.c was supposed to be about the in-kernel PM, but also had exported common Netlink helpers, NL events for PM userspace daemons, etc. quite confusing. To clarify the code, a reorganisation is suggested here, only by moving code around to avoid confusions: - pm_netlink.c now only contains common PM Netlink code: - PM events: this code was already there - shared helpers around Netlink code that were already there as well - more shared Netlink commands code from pm.c will come after - pm_kernel.c now contains only code that is specific to the in-kernel PM. Now all functions are either called from: - pm.c: events coming from the core, when this PM is being used - pm_netlink.c: for shared Netlink commands - mptcp_pm_gen.c: for Netlink commands specific to the in-kernel PM - sockopt.c: for the exported counters per netns - (while at it, a useless 'return;' spot by checkpatch at the end of mptcp_pm_nl_set_flags_all, has been removed) The code around the PM is now less confusing, which should help for the maintenance in the long term. This will certainly impact future backports, but because other cleanups have already done recently, and more are coming to ease the addition of a new path-manager controlled with BPF (struct_ops), doing that now seems to be a good time. Also, many issues around the PM have been fixed a few months ago while increasing the code coverage in the selftests, so such big reorganisation can be done with more confidence now. No behavioural changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250307-net-next-mptcp-pm-reorg-v1-14-abef20ada03b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/Makefile | 2 +- net/mptcp/pm_kernel.c | 1410 ++++++++++++++++++++++++++++++++++++++++ net/mptcp/pm_netlink.c | 1404 --------------------------------------- 3 files changed, 1411 insertions(+), 1405 deletions(-) create mode 100644 net/mptcp/pm_kernel.c diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index bcf1dbf3a432..89bf6c47c818 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -3,7 +3,7 @@ obj-$(CONFIG_MPTCP) += mptcp.o mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o sched.o \ - mptcp_pm_gen.o + mptcp_pm_gen.o pm_kernel.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c new file mode 100644 index 000000000000..daf8f98a3164 --- /dev/null +++ b/net/mptcp/pm_kernel.c @@ -0,0 +1,1410 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2025, Matthieu Baerts. + */ + +#define pr_fmt(fmt) "MPTCP: " fmt + +#include + +#include "protocol.h" +#include "mib.h" +#include "mptcp_pm_gen.h" + +static int pm_nl_pernet_id; + +struct pm_nl_pernet { + /* protects pernet updates */ + spinlock_t lock; + struct list_head local_addr_list; + unsigned int addrs; + unsigned int stale_loss_cnt; + unsigned int add_addr_signal_max; + unsigned int add_addr_accept_max; + unsigned int local_addr_max; + unsigned int subflows_max; + unsigned int next_id; + DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); +}; + +#define MPTCP_PM_ADDR_MAX 8 + +static struct pm_nl_pernet *pm_nl_get_pernet(const struct net *net) +{ + return net_generic(net, pm_nl_pernet_id); +} + +static struct pm_nl_pernet * +pm_nl_get_pernet_from_msk(const struct mptcp_sock *msk) +{ + return pm_nl_get_pernet(sock_net((struct sock *)msk)); +} + +static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info) +{ + return pm_nl_get_pernet(genl_info_net(info)); +} + +unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk) +{ + const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + return READ_ONCE(pernet->add_addr_signal_max); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max); + +unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + return READ_ONCE(pernet->add_addr_accept_max); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max); + +unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + return READ_ONCE(pernet->subflows_max); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max); + +unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + return READ_ONCE(pernet->local_addr_max); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max); + +static bool lookup_subflow_by_daddr(const struct list_head *list, + const struct mptcp_addr_info *daddr) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_addr_info cur; + + list_for_each_entry(subflow, list, node) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (!((1 << inet_sk_state_load(ssk)) & + (TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_SYN_RECV))) + continue; + + mptcp_remote_address((struct sock_common *)ssk, &cur); + if (mptcp_addresses_equal(&cur, daddr, daddr->port)) + return true; + } + + return false; +} + +static bool +select_local_address(const struct pm_nl_pernet *pernet, + const struct mptcp_sock *msk, + struct mptcp_pm_local *new_local) +{ + struct mptcp_pm_addr_entry *entry; + bool found = false; + + msk_owned_by_me(msk); + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) + continue; + + if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) + continue; + + new_local->addr = entry->addr; + new_local->flags = entry->flags; + new_local->ifindex = entry->ifindex; + found = true; + break; + } + rcu_read_unlock(); + + return found; +} + +static bool +select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk, + struct mptcp_pm_local *new_local) +{ + struct mptcp_pm_addr_entry *entry; + bool found = false; + + rcu_read_lock(); + /* do not keep any additional per socket state, just signal + * the address list in order. + * Note: removal from the local address list during the msk life-cycle + * can lead to additional addresses not being announced. + */ + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) + continue; + + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) + continue; + + new_local->addr = entry->addr; + new_local->flags = entry->flags; + new_local->ifindex = entry->ifindex; + found = true; + break; + } + rcu_read_unlock(); + + return found; +} + +/* Fill all the remote addresses into the array addrs[], + * and return the array size. + */ +static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, + struct mptcp_addr_info *local, + bool fullmesh, + struct mptcp_addr_info *addrs) +{ + bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0); + struct sock *sk = (struct sock *)msk, *ssk; + struct mptcp_subflow_context *subflow; + struct mptcp_addr_info remote = { 0 }; + unsigned int subflows_max; + int i = 0; + + subflows_max = mptcp_pm_get_subflows_max(msk); + mptcp_remote_address((struct sock_common *)sk, &remote); + + /* Non-fullmesh endpoint, fill in the single entry + * corresponding to the primary MPC subflow remote address + */ + if (!fullmesh) { + if (deny_id0) + return 0; + + if (!mptcp_pm_addr_families_match(sk, local, &remote)) + return 0; + + msk->pm.subflows++; + addrs[i++] = remote; + } else { + DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + + /* Forbid creation of new subflows matching existing + * ones, possibly already created by incoming ADD_ADDR + */ + bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + mptcp_for_each_subflow(msk, subflow) + if (READ_ONCE(subflow->local_id) == local->id) + __set_bit(subflow->remote_id, unavail_id); + + mptcp_for_each_subflow(msk, subflow) { + ssk = mptcp_subflow_tcp_sock(subflow); + mptcp_remote_address((struct sock_common *)ssk, &addrs[i]); + addrs[i].id = READ_ONCE(subflow->remote_id); + if (deny_id0 && !addrs[i].id) + continue; + + if (test_bit(addrs[i].id, unavail_id)) + continue; + + if (!mptcp_pm_addr_families_match(sk, local, &addrs[i])) + continue; + + if (msk->pm.subflows < subflows_max) { + /* forbid creating multiple address towards + * this id + */ + __set_bit(addrs[i].id, unavail_id); + msk->pm.subflows++; + i++; + } + } + } + + return i; +} + +static struct mptcp_pm_addr_entry * +__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) +{ + struct mptcp_pm_addr_entry *entry; + + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, + lockdep_is_held(&pernet->lock)) { + if (entry->addr.id == id) + return entry; + } + return NULL; +} + +static struct mptcp_pm_addr_entry * +__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) +{ + struct mptcp_pm_addr_entry *entry; + + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, + lockdep_is_held(&pernet->lock)) { + if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) + return entry; + } + return NULL; +} + +static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) +{ + struct sock *sk = (struct sock *)msk; + unsigned int add_addr_signal_max; + bool signal_and_subflow = false; + unsigned int local_addr_max; + struct pm_nl_pernet *pernet; + struct mptcp_pm_local local; + unsigned int subflows_max; + + pernet = pm_nl_get_pernet(sock_net(sk)); + + add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); + local_addr_max = mptcp_pm_get_local_addr_max(msk); + subflows_max = mptcp_pm_get_subflows_max(msk); + + /* do lazy endpoint usage accounting for the MPC subflows */ + if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(msk->first); + struct mptcp_pm_addr_entry *entry; + struct mptcp_addr_info mpc_addr; + bool backup = false; + + mptcp_local_address((struct sock_common *)msk->first, &mpc_addr); + rcu_read_lock(); + entry = __lookup_addr(pernet, &mpc_addr); + if (entry) { + __clear_bit(entry->addr.id, msk->pm.id_avail_bitmap); + msk->mpc_endpoint_id = entry->addr.id; + backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); + } + rcu_read_unlock(); + + if (backup) + mptcp_pm_send_ack(msk, subflow, true, backup); + + msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED); + } + + pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", + msk->pm.local_addr_used, local_addr_max, + msk->pm.add_addr_signaled, add_addr_signal_max, + msk->pm.subflows, subflows_max); + + /* check first for announce */ + if (msk->pm.add_addr_signaled < add_addr_signal_max) { + /* due to racing events on both ends we can reach here while + * previous add address is still running: if we invoke now + * mptcp_pm_announce_addr(), that will fail and the + * corresponding id will be marked as used. + * Instead let the PM machinery reschedule us when the + * current address announce will be completed. + */ + if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL)) + return; + + if (!select_signal_address(pernet, msk, &local)) + goto subflow; + + /* If the alloc fails, we are on memory pressure, not worth + * continuing, and trying to create subflows. + */ + if (!mptcp_pm_alloc_anno_list(msk, &local.addr)) + return; + + __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); + msk->pm.add_addr_signaled++; + + /* Special case for ID0: set the correct ID */ + if (local.addr.id == msk->mpc_endpoint_id) + local.addr.id = 0; + + mptcp_pm_announce_addr(msk, &local.addr, false); + mptcp_pm_addr_send_ack(msk); + + if (local.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) + signal_and_subflow = true; + } + +subflow: + /* check if should create a new subflow */ + while (msk->pm.local_addr_used < local_addr_max && + msk->pm.subflows < subflows_max) { + struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; + bool fullmesh; + int i, nr; + + if (signal_and_subflow) + signal_and_subflow = false; + else if (!select_local_address(pernet, msk, &local)) + break; + + fullmesh = !!(local.flags & MPTCP_PM_ADDR_FLAG_FULLMESH); + + __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); + + /* Special case for ID0: set the correct ID */ + if (local.addr.id == msk->mpc_endpoint_id) + local.addr.id = 0; + else /* local_addr_used is not decr for ID 0 */ + msk->pm.local_addr_used++; + + nr = fill_remote_addresses_vec(msk, &local.addr, fullmesh, addrs); + if (nr == 0) + continue; + + spin_unlock_bh(&msk->pm.lock); + for (i = 0; i < nr; i++) + __mptcp_subflow_connect(sk, &local, &addrs[i]); + spin_lock_bh(&msk->pm.lock); + } + mptcp_pm_nl_check_work_pending(msk); +} + +static void mptcp_pm_nl_fully_established(struct mptcp_sock *msk) +{ + mptcp_pm_create_subflow_or_signal_addr(msk); +} + +static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk) +{ + mptcp_pm_create_subflow_or_signal_addr(msk); +} + +/* Fill all the local addresses into the array addrs[], + * and return the array size. + */ +static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, + struct mptcp_addr_info *remote, + struct mptcp_pm_local *locals) +{ + struct sock *sk = (struct sock *)msk; + struct mptcp_pm_addr_entry *entry; + struct mptcp_addr_info mpc_addr; + struct pm_nl_pernet *pernet; + unsigned int subflows_max; + int i = 0; + + pernet = pm_nl_get_pernet_from_msk(msk); + subflows_max = mptcp_pm_get_subflows_max(msk); + + mptcp_local_address((struct sock_common *)msk, &mpc_addr); + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH)) + continue; + + if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote)) + continue; + + if (msk->pm.subflows < subflows_max) { + locals[i].addr = entry->addr; + locals[i].flags = entry->flags; + locals[i].ifindex = entry->ifindex; + + /* Special case for ID0: set the correct ID */ + if (mptcp_addresses_equal(&locals[i].addr, &mpc_addr, locals[i].addr.port)) + locals[i].addr.id = 0; + + msk->pm.subflows++; + i++; + } + } + rcu_read_unlock(); + + /* If the array is empty, fill in the single + * 'IPADDRANY' local address + */ + if (!i) { + memset(&locals[i], 0, sizeof(locals[i])); + locals[i].addr.family = +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + remote->family == AF_INET6 && + ipv6_addr_v4mapped(&remote->addr6) ? AF_INET : +#endif + remote->family; + + if (!mptcp_pm_addr_families_match(sk, &locals[i].addr, remote)) + return 0; + + msk->pm.subflows++; + i++; + } + + return i; +} + +static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) +{ + struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX]; + struct sock *sk = (struct sock *)msk; + unsigned int add_addr_accept_max; + struct mptcp_addr_info remote; + unsigned int subflows_max; + bool sf_created = false; + int i, nr; + + add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); + subflows_max = mptcp_pm_get_subflows_max(msk); + + pr_debug("accepted %d:%d remote family %d\n", + msk->pm.add_addr_accepted, add_addr_accept_max, + msk->pm.remote.family); + + remote = msk->pm.remote; + mptcp_pm_announce_addr(msk, &remote, true); + mptcp_pm_addr_send_ack(msk); + + if (lookup_subflow_by_daddr(&msk->conn_list, &remote)) + return; + + /* pick id 0 port, if none is provided the remote address */ + if (!remote.port) + remote.port = sk->sk_dport; + + /* connect to the specified remote address, using whatever + * local address the routing configuration will pick. + */ + nr = fill_local_addresses_vec(msk, &remote, locals); + if (nr == 0) + return; + + spin_unlock_bh(&msk->pm.lock); + for (i = 0; i < nr; i++) + if (__mptcp_subflow_connect(sk, &locals[i], &remote) == 0) + sf_created = true; + spin_lock_bh(&msk->pm.lock); + + if (sf_created) { + /* add_addr_accepted is not decr for ID 0 */ + if (remote.id) + msk->pm.add_addr_accepted++; + if (msk->pm.add_addr_accepted >= add_addr_accept_max || + msk->pm.subflows >= subflows_max) + WRITE_ONCE(msk->pm.accept_addr, false); + } +} + +void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id) +{ + if (rm_id && WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { + /* Note: if the subflow has been closed before, this + * add_addr_accepted counter will not be decremented. + */ + if (--msk->pm.add_addr_accepted < mptcp_pm_get_add_addr_accept_max(msk)) + WRITE_ONCE(msk->pm.accept_addr, true); + } +} + +static bool address_use_port(struct mptcp_pm_addr_entry *entry) +{ + return (entry->flags & + (MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) == + MPTCP_PM_ADDR_FLAG_SIGNAL; +} + +/* caller must ensure the RCU grace period is already elapsed */ +static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry) +{ + if (entry->lsk) + sock_release(entry->lsk); + kfree(entry); +} + +static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, + struct mptcp_pm_addr_entry *entry, + bool needs_id, bool replace) +{ + struct mptcp_pm_addr_entry *cur, *del_entry = NULL; + unsigned int addr_max; + int ret = -EINVAL; + + spin_lock_bh(&pernet->lock); + /* to keep the code simple, don't do IDR-like allocation for address ID, + * just bail when we exceed limits + */ + if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID) + pernet->next_id = 1; + if (pernet->addrs >= MPTCP_PM_ADDR_MAX) { + ret = -ERANGE; + goto out; + } + if (test_bit(entry->addr.id, pernet->id_bitmap)) { + ret = -EBUSY; + goto out; + } + + /* do not insert duplicate address, differentiate on port only + * singled addresses + */ + if (!address_use_port(entry)) + entry->addr.port = 0; + list_for_each_entry(cur, &pernet->local_addr_list, list) { + if (mptcp_addresses_equal(&cur->addr, &entry->addr, + cur->addr.port || entry->addr.port)) { + /* allow replacing the exiting endpoint only if such + * endpoint is an implicit one and the user-space + * did not provide an endpoint id + */ + if (!(cur->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)) { + ret = -EEXIST; + goto out; + } + if (entry->addr.id) + goto out; + + /* allow callers that only need to look up the local + * addr's id to skip replacement. This allows them to + * avoid calling synchronize_rcu in the packet recv + * path. + */ + if (!replace) { + kfree(entry); + ret = cur->addr.id; + goto out; + } + + pernet->addrs--; + entry->addr.id = cur->addr.id; + list_del_rcu(&cur->list); + del_entry = cur; + break; + } + } + + if (!entry->addr.id && needs_id) { +find_next: + entry->addr.id = find_next_zero_bit(pernet->id_bitmap, + MPTCP_PM_MAX_ADDR_ID + 1, + pernet->next_id); + if (!entry->addr.id && pernet->next_id != 1) { + pernet->next_id = 1; + goto find_next; + } + } + + if (!entry->addr.id && needs_id) + goto out; + + __set_bit(entry->addr.id, pernet->id_bitmap); + if (entry->addr.id > pernet->next_id) + pernet->next_id = entry->addr.id; + + if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { + addr_max = pernet->add_addr_signal_max; + WRITE_ONCE(pernet->add_addr_signal_max, addr_max + 1); + } + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { + addr_max = pernet->local_addr_max; + WRITE_ONCE(pernet->local_addr_max, addr_max + 1); + } + + pernet->addrs++; + if (!entry->addr.port) + list_add_tail_rcu(&entry->list, &pernet->local_addr_list); + else + list_add_rcu(&entry->list, &pernet->local_addr_list); + ret = entry->addr.id; + +out: + spin_unlock_bh(&pernet->lock); + + /* just replaced an existing entry, free it */ + if (del_entry) { + synchronize_rcu(); + __mptcp_pm_release_addr_entry(del_entry); + } + return ret; +} + +static struct lock_class_key mptcp_slock_keys[2]; +static struct lock_class_key mptcp_keys[2]; + +static int mptcp_pm_nl_create_listen_socket(struct sock *sk, + struct mptcp_pm_addr_entry *entry) +{ + bool is_ipv6 = sk->sk_family == AF_INET6; + int addrlen = sizeof(struct sockaddr_in); + struct sockaddr_storage addr; + struct sock *newsk, *ssk; + int backlog = 1024; + int err; + + err = sock_create_kern(sock_net(sk), entry->addr.family, + SOCK_STREAM, IPPROTO_MPTCP, &entry->lsk); + if (err) + return err; + + newsk = entry->lsk->sk; + if (!newsk) + return -EINVAL; + + /* The subflow socket lock is acquired in a nested to the msk one + * in several places, even by the TCP stack, and this msk is a kernel + * socket: lockdep complains. Instead of propagating the _nested + * modifiers in several places, re-init the lock class for the msk + * socket to an mptcp specific one. + */ + sock_lock_init_class_and_name(newsk, + is_ipv6 ? "mlock-AF_INET6" : "mlock-AF_INET", + &mptcp_slock_keys[is_ipv6], + is_ipv6 ? "msk_lock-AF_INET6" : "msk_lock-AF_INET", + &mptcp_keys[is_ipv6]); + + lock_sock(newsk); + ssk = __mptcp_nmpc_sk(mptcp_sk(newsk)); + release_sock(newsk); + if (IS_ERR(ssk)) + return PTR_ERR(ssk); + + mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family); +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (entry->addr.family == AF_INET6) + addrlen = sizeof(struct sockaddr_in6); +#endif + if (ssk->sk_family == AF_INET) + err = inet_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (ssk->sk_family == AF_INET6) + err = inet6_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); +#endif + if (err) + return err; + + /* We don't use mptcp_set_state() here because it needs to be called + * under the msk socket lock. For the moment, that will not bring + * anything more than only calling inet_sk_state_store(), because the + * old status is known (TCP_CLOSE). + */ + inet_sk_state_store(newsk, TCP_LISTEN); + lock_sock(ssk); + WRITE_ONCE(mptcp_subflow_ctx(ssk)->pm_listener, true); + err = __inet_listen_sk(ssk, backlog); + if (!err) + mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED); + release_sock(ssk); + return err; +} + +int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *skc) +{ + struct mptcp_pm_addr_entry *entry; + struct pm_nl_pernet *pernet; + int ret; + + pernet = pm_nl_get_pernet_from_msk(msk); + + rcu_read_lock(); + entry = __lookup_addr(pernet, &skc->addr); + ret = entry ? entry->addr.id : -1; + rcu_read_unlock(); + if (ret >= 0) + return ret; + + /* address not found, add to local list */ + entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + if (!entry) + return -ENOMEM; + + *entry = *skc; + entry->addr.port = 0; + ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, true, false); + if (ret < 0) + kfree(entry); + + return ret; +} + +bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + struct mptcp_pm_addr_entry *entry; + bool backup; + + rcu_read_lock(); + entry = __lookup_addr(pernet, skc); + backup = entry && !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); + rcu_read_unlock(); + + return backup; +} + +static int mptcp_nl_add_subflow_or_signal_addr(struct net *net, + struct mptcp_addr_info *addr) +{ + struct mptcp_sock *msk; + long s_slot = 0, s_num = 0; + + while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { + struct sock *sk = (struct sock *)msk; + struct mptcp_addr_info mpc_addr; + + if (!READ_ONCE(msk->fully_established) || + mptcp_pm_is_userspace(msk)) + goto next; + + /* if the endp linked to the init sf is re-added with a != ID */ + mptcp_local_address((struct sock_common *)msk, &mpc_addr); + + lock_sock(sk); + spin_lock_bh(&msk->pm.lock); + if (mptcp_addresses_equal(addr, &mpc_addr, addr->port)) + msk->mpc_endpoint_id = addr->id; + mptcp_pm_create_subflow_or_signal_addr(msk); + spin_unlock_bh(&msk->pm.lock); + release_sock(sk); + +next: + sock_put(sk); + cond_resched(); + } + + return 0; +} + +static bool mptcp_pm_has_addr_attr_id(const struct nlattr *attr, + struct genl_info *info) +{ + struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; + + if (!nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr, + mptcp_pm_address_nl_policy, info->extack) && + tb[MPTCP_PM_ADDR_ATTR_ID]) + return true; + return false; +} + +/* Add an MPTCP endpoint */ +int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct mptcp_pm_addr_entry addr, *entry; + struct nlattr *attr; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR)) + return -EINVAL; + + attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; + ret = mptcp_pm_parse_entry(attr, info, true, &addr); + if (ret < 0) + return ret; + + if (addr.addr.port && !address_use_port(&addr)) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "flags must have signal and not subflow when using port"); + return -EINVAL; + } + + if (addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL && + addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "flags mustn't have both signal and fullmesh"); + return -EINVAL; + } + + if (addr.flags & MPTCP_PM_ADDR_FLAG_IMPLICIT) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "can't create IMPLICIT endpoint"); + return -EINVAL; + } + + entry = kzalloc(sizeof(*entry), GFP_KERNEL_ACCOUNT); + if (!entry) { + GENL_SET_ERR_MSG(info, "can't allocate addr"); + return -ENOMEM; + } + + *entry = addr; + if (entry->addr.port) { + ret = mptcp_pm_nl_create_listen_socket(skb->sk, entry); + if (ret) { + GENL_SET_ERR_MSG_FMT(info, "create listen socket error: %d", ret); + goto out_free; + } + } + ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, + !mptcp_pm_has_addr_attr_id(attr, info), + true); + if (ret < 0) { + GENL_SET_ERR_MSG_FMT(info, "too many addresses or duplicate one: %d", ret); + goto out_free; + } + + mptcp_nl_add_subflow_or_signal_addr(sock_net(skb->sk), &entry->addr); + return 0; + +out_free: + __mptcp_pm_release_addr_entry(entry); + return ret; +} + +static u8 mptcp_endp_get_local_id(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) +{ + return msk->mpc_endpoint_id == addr->id ? 0 : addr->id; +} + +static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr, + bool force) +{ + struct mptcp_rm_list list = { .nr = 0 }; + bool ret; + + list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr); + + ret = mptcp_remove_anno_list_by_saddr(msk, addr); + if (ret || force) { + spin_lock_bh(&msk->pm.lock); + if (ret) { + __set_bit(addr->id, msk->pm.id_avail_bitmap); + msk->pm.add_addr_signaled--; + } + mptcp_pm_remove_addr(msk, &list); + spin_unlock_bh(&msk->pm.lock); + } + return ret; +} + +static void __mark_subflow_endp_available(struct mptcp_sock *msk, u8 id) +{ + /* If it was marked as used, and not ID 0, decrement local_addr_used */ + if (!__test_and_set_bit(id ? : msk->mpc_endpoint_id, msk->pm.id_avail_bitmap) && + id && !WARN_ON_ONCE(msk->pm.local_addr_used == 0)) + msk->pm.local_addr_used--; +} + +static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, + const struct mptcp_pm_addr_entry *entry) +{ + const struct mptcp_addr_info *addr = &entry->addr; + struct mptcp_rm_list list = { .nr = 1 }; + long s_slot = 0, s_num = 0; + struct mptcp_sock *msk; + + pr_debug("remove_id=%d\n", addr->id); + + while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { + struct sock *sk = (struct sock *)msk; + bool remove_subflow; + + if (mptcp_pm_is_userspace(msk)) + goto next; + + lock_sock(sk); + remove_subflow = mptcp_lookup_subflow_by_saddr(&msk->conn_list, addr); + mptcp_pm_remove_anno_addr(msk, addr, remove_subflow && + !(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)); + + list.ids[0] = mptcp_endp_get_local_id(msk, addr); + if (remove_subflow) { + spin_lock_bh(&msk->pm.lock); + mptcp_pm_rm_subflow(msk, &list); + spin_unlock_bh(&msk->pm.lock); + } + + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { + spin_lock_bh(&msk->pm.lock); + __mark_subflow_endp_available(msk, list.ids[0]); + spin_unlock_bh(&msk->pm.lock); + } + + if (msk->mpc_endpoint_id == entry->addr.id) + msk->mpc_endpoint_id = 0; + release_sock(sk); + +next: + sock_put(sk); + cond_resched(); + } + + return 0; +} + +static int mptcp_nl_remove_id_zero_address(struct net *net, + struct mptcp_addr_info *addr) +{ + struct mptcp_rm_list list = { .nr = 0 }; + long s_slot = 0, s_num = 0; + struct mptcp_sock *msk; + + list.ids[list.nr++] = 0; + + while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { + struct sock *sk = (struct sock *)msk; + struct mptcp_addr_info msk_local; + + if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk)) + goto next; + + mptcp_local_address((struct sock_common *)msk, &msk_local); + if (!mptcp_addresses_equal(&msk_local, addr, addr->port)) + goto next; + + lock_sock(sk); + spin_lock_bh(&msk->pm.lock); + mptcp_pm_remove_addr(msk, &list); + mptcp_pm_rm_subflow(msk, &list); + __mark_subflow_endp_available(msk, 0); + spin_unlock_bh(&msk->pm.lock); + release_sock(sk); + +next: + sock_put(sk); + cond_resched(); + } + + return 0; +} + +/* Remove an MPTCP endpoint */ +int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct mptcp_pm_addr_entry addr, *entry; + unsigned int addr_max; + struct nlattr *attr; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR)) + return -EINVAL; + + attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; + ret = mptcp_pm_parse_entry(attr, info, false, &addr); + if (ret < 0) + return ret; + + /* the zero id address is special: the first address used by the msk + * always gets such an id, so different subflows can have different zero + * id addresses. Additionally zero id is not accounted for in id_bitmap. + * Let's use an 'mptcp_rm_list' instead of the common remove code. + */ + if (addr.addr.id == 0) + return mptcp_nl_remove_id_zero_address(sock_net(skb->sk), &addr.addr); + + spin_lock_bh(&pernet->lock); + entry = __lookup_addr_by_id(pernet, addr.addr.id); + if (!entry) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, "address not found"); + spin_unlock_bh(&pernet->lock); + return -EINVAL; + } + if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { + addr_max = pernet->add_addr_signal_max; + WRITE_ONCE(pernet->add_addr_signal_max, addr_max - 1); + } + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { + addr_max = pernet->local_addr_max; + WRITE_ONCE(pernet->local_addr_max, addr_max - 1); + } + + pernet->addrs--; + list_del_rcu(&entry->list); + __clear_bit(entry->addr.id, pernet->id_bitmap); + spin_unlock_bh(&pernet->lock); + + mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), entry); + synchronize_rcu(); + __mptcp_pm_release_addr_entry(entry); + + return ret; +} + +static void mptcp_pm_flush_addrs_and_subflows(struct mptcp_sock *msk, + struct list_head *rm_list) +{ + struct mptcp_rm_list alist = { .nr = 0 }, slist = { .nr = 0 }; + struct mptcp_pm_addr_entry *entry; + + list_for_each_entry(entry, rm_list, list) { + if (slist.nr < MPTCP_RM_IDS_MAX && + mptcp_lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) + slist.ids[slist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr); + + if (alist.nr < MPTCP_RM_IDS_MAX && + mptcp_remove_anno_list_by_saddr(msk, &entry->addr)) + alist.ids[alist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr); + } + + spin_lock_bh(&msk->pm.lock); + if (alist.nr) { + msk->pm.add_addr_signaled -= alist.nr; + mptcp_pm_remove_addr(msk, &alist); + } + if (slist.nr) + mptcp_pm_rm_subflow(msk, &slist); + /* Reset counters: maybe some subflows have been removed before */ + bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); + msk->pm.local_addr_used = 0; + spin_unlock_bh(&msk->pm.lock); +} + +static void mptcp_nl_flush_addrs_list(struct net *net, + struct list_head *rm_list) +{ + long s_slot = 0, s_num = 0; + struct mptcp_sock *msk; + + if (list_empty(rm_list)) + return; + + while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { + struct sock *sk = (struct sock *)msk; + + if (!mptcp_pm_is_userspace(msk)) { + lock_sock(sk); + mptcp_pm_flush_addrs_and_subflows(msk, rm_list); + release_sock(sk); + } + + sock_put(sk); + cond_resched(); + } +} + +/* caller must ensure the RCU grace period is already elapsed */ +static void __flush_addrs(struct list_head *list) +{ + while (!list_empty(list)) { + struct mptcp_pm_addr_entry *cur; + + cur = list_entry(list->next, + struct mptcp_pm_addr_entry, list); + list_del_rcu(&cur->list); + __mptcp_pm_release_addr_entry(cur); + } +} + +static void __reset_counters(struct pm_nl_pernet *pernet) +{ + WRITE_ONCE(pernet->add_addr_signal_max, 0); + WRITE_ONCE(pernet->add_addr_accept_max, 0); + WRITE_ONCE(pernet->local_addr_max, 0); + pernet->addrs = 0; +} + +int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + LIST_HEAD(free_list); + + spin_lock_bh(&pernet->lock); + list_splice_init(&pernet->local_addr_list, &free_list); + __reset_counters(pernet); + pernet->next_id = 1; + bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); + spin_unlock_bh(&pernet->lock); + mptcp_nl_flush_addrs_list(sock_net(skb->sk), &free_list); + synchronize_rcu(); + __flush_addrs(&free_list); + return 0; +} + +int mptcp_pm_nl_get_addr(u8 id, struct mptcp_pm_addr_entry *addr, + struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct mptcp_pm_addr_entry *entry; + int ret = -EINVAL; + + rcu_read_lock(); + entry = __lookup_addr_by_id(pernet, id); + if (entry) { + *addr = *entry; + ret = 0; + } + rcu_read_unlock(); + + return ret; +} + +int mptcp_pm_nl_dump_addr(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct net *net = sock_net(msg->sk); + struct mptcp_pm_addr_entry *entry; + struct pm_nl_pernet *pernet; + int id = cb->args[0]; + int i; + + pernet = pm_nl_get_pernet(net); + + rcu_read_lock(); + for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) { + if (test_bit(i, pernet->id_bitmap)) { + entry = __lookup_addr_by_id(pernet, i); + if (!entry) + break; + + if (entry->addr.id <= id) + continue; + + if (mptcp_pm_genl_fill_addr(msg, cb, entry) < 0) + break; + + id = entry->addr.id; + } + } + rcu_read_unlock(); + + cb->args[0] = id; + return msg->len; +} + +static int parse_limit(struct genl_info *info, int id, unsigned int *limit) +{ + struct nlattr *attr = info->attrs[id]; + + if (!attr) + return 0; + + *limit = nla_get_u32(attr); + if (*limit > MPTCP_PM_ADDR_MAX) { + NL_SET_ERR_MSG_ATTR_FMT(info->extack, attr, + "limit greater than maximum (%u)", + MPTCP_PM_ADDR_MAX); + return -EINVAL; + } + return 0; +} + +int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + unsigned int rcv_addrs, subflows; + int ret; + + spin_lock_bh(&pernet->lock); + rcv_addrs = pernet->add_addr_accept_max; + ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs); + if (ret) + goto unlock; + + subflows = pernet->subflows_max; + ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows); + if (ret) + goto unlock; + + WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs); + WRITE_ONCE(pernet->subflows_max, subflows); + +unlock: + spin_unlock_bh(&pernet->lock); + return ret; +} + +int mptcp_pm_nl_get_limits_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct sk_buff *msg; + void *reply; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, + MPTCP_PM_CMD_GET_LIMITS); + if (!reply) + goto fail; + + if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS, + READ_ONCE(pernet->add_addr_accept_max))) + goto fail; + + if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS, + READ_ONCE(pernet->subflows_max))) + goto fail; + + genlmsg_end(msg, reply); + return genlmsg_reply(msg, info); + +fail: + GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); + nlmsg_free(msg); + return -EMSGSIZE; +} + +static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk, + struct mptcp_addr_info *addr) +{ + struct mptcp_rm_list list = { .nr = 0 }; + + list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr); + + spin_lock_bh(&msk->pm.lock); + mptcp_pm_rm_subflow(msk, &list); + __mark_subflow_endp_available(msk, list.ids[0]); + mptcp_pm_create_subflow_or_signal_addr(msk); + spin_unlock_bh(&msk->pm.lock); +} + +static void mptcp_pm_nl_set_flags_all(struct net *net, + struct mptcp_pm_addr_entry *local, + u8 changed) +{ + u8 is_subflow = !!(local->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW); + u8 bkup = !!(local->flags & MPTCP_PM_ADDR_FLAG_BACKUP); + long s_slot = 0, s_num = 0; + struct mptcp_sock *msk; + + if (changed == MPTCP_PM_ADDR_FLAG_FULLMESH && !is_subflow) + return; + + while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { + struct sock *sk = (struct sock *)msk; + + if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk)) + goto next; + + lock_sock(sk); + if (changed & MPTCP_PM_ADDR_FLAG_BACKUP) + mptcp_pm_mp_prio_send_ack(msk, &local->addr, NULL, bkup); + /* Subflows will only be recreated if the SUBFLOW flag is set */ + if (is_subflow && (changed & MPTCP_PM_ADDR_FLAG_FULLMESH)) + mptcp_pm_nl_fullmesh(msk, &local->addr); + release_sock(sk); + +next: + sock_put(sk); + cond_resched(); + } +} + +int mptcp_pm_nl_set_flags(struct mptcp_pm_addr_entry *local, + struct genl_info *info) +{ + struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; + u8 changed, mask = MPTCP_PM_ADDR_FLAG_BACKUP | + MPTCP_PM_ADDR_FLAG_FULLMESH; + struct net *net = genl_info_net(info); + struct mptcp_pm_addr_entry *entry; + struct pm_nl_pernet *pernet; + u8 lookup_by_id = 0; + + pernet = pm_nl_get_pernet(net); + + if (local->addr.family == AF_UNSPEC) { + lookup_by_id = 1; + if (!local->addr.id) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "missing address ID"); + return -EOPNOTSUPP; + } + } + + spin_lock_bh(&pernet->lock); + entry = lookup_by_id ? __lookup_addr_by_id(pernet, local->addr.id) : + __lookup_addr(pernet, &local->addr); + if (!entry) { + spin_unlock_bh(&pernet->lock); + NL_SET_ERR_MSG_ATTR(info->extack, attr, "address not found"); + return -EINVAL; + } + if ((local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH) && + (entry->flags & (MPTCP_PM_ADDR_FLAG_SIGNAL | + MPTCP_PM_ADDR_FLAG_IMPLICIT))) { + spin_unlock_bh(&pernet->lock); + NL_SET_ERR_MSG_ATTR(info->extack, attr, "invalid addr flags"); + return -EINVAL; + } + + changed = (local->flags ^ entry->flags) & mask; + entry->flags = (entry->flags & ~mask) | (local->flags & mask); + *local = *entry; + spin_unlock_bh(&pernet->lock); + + mptcp_pm_nl_set_flags_all(net, local, changed); + return 0; +} + +bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) || + (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap, + MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) { + WRITE_ONCE(msk->pm.work_pending, false); + return false; + } + return true; +} + +/* Called under PM lock */ +void __mptcp_pm_kernel_worker(struct mptcp_sock *msk) +{ + struct mptcp_pm_data *pm = &msk->pm; + + if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { + pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); + mptcp_pm_nl_add_addr_received(msk); + } + if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { + pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); + mptcp_pm_nl_fully_established(msk); + } + if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { + pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); + mptcp_pm_nl_subflow_established(msk); + } +} + +static int __net_init pm_nl_init_net(struct net *net) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); + + INIT_LIST_HEAD_RCU(&pernet->local_addr_list); + + /* Cit. 2 subflows ought to be enough for anybody. */ + pernet->subflows_max = 2; + pernet->next_id = 1; + pernet->stale_loss_cnt = 4; + spin_lock_init(&pernet->lock); + + /* No need to initialize other pernet fields, the struct is zeroed at + * allocation time. + */ + + return 0; +} + +static void __net_exit pm_nl_exit_net(struct list_head *net_list) +{ + struct net *net; + + list_for_each_entry(net, net_list, exit_list) { + struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); + + /* net is removed from namespace list, can't race with + * other modifiers, also netns core already waited for a + * RCU grace period. + */ + __flush_addrs(&pernet->local_addr_list); + } +} + +static struct pernet_operations mptcp_pm_pernet_ops = { + .init = pm_nl_init_net, + .exit_batch = pm_nl_exit_net, + .id = &pm_nl_pernet_id, + .size = sizeof(struct pm_nl_pernet), +}; + +void __init mptcp_pm_nl_init(void) +{ + if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0) + panic("Failed to register MPTCP PM pernet subsystem.\n"); + + if (genl_register_family(&mptcp_genl_family)) + panic("Failed to register MPTCP PM netlink family\n"); +} diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index e4abb94e8c0b..530b2362a5a3 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -6,768 +6,9 @@ #define pr_fmt(fmt) "MPTCP: " fmt -#include -#include -#include -#include -#include - #include "protocol.h" -#include "mib.h" #include "mptcp_pm_gen.h" -static int pm_nl_pernet_id; - -struct pm_nl_pernet { - /* protects pernet updates */ - spinlock_t lock; - struct list_head local_addr_list; - unsigned int addrs; - unsigned int stale_loss_cnt; - unsigned int add_addr_signal_max; - unsigned int add_addr_accept_max; - unsigned int local_addr_max; - unsigned int subflows_max; - unsigned int next_id; - DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); -}; - -#define MPTCP_PM_ADDR_MAX 8 - -static struct pm_nl_pernet *pm_nl_get_pernet(const struct net *net) -{ - return net_generic(net, pm_nl_pernet_id); -} - -static struct pm_nl_pernet * -pm_nl_get_pernet_from_msk(const struct mptcp_sock *msk) -{ - return pm_nl_get_pernet(sock_net((struct sock *)msk)); -} - -static bool lookup_subflow_by_daddr(const struct list_head *list, - const struct mptcp_addr_info *daddr) -{ - struct mptcp_subflow_context *subflow; - struct mptcp_addr_info cur; - - list_for_each_entry(subflow, list, node) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - - if (!((1 << inet_sk_state_load(ssk)) & - (TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_SYN_RECV))) - continue; - - mptcp_remote_address((struct sock_common *)ssk, &cur); - if (mptcp_addresses_equal(&cur, daddr, daddr->port)) - return true; - } - - return false; -} - -static bool -select_local_address(const struct pm_nl_pernet *pernet, - const struct mptcp_sock *msk, - struct mptcp_pm_local *new_local) -{ - struct mptcp_pm_addr_entry *entry; - bool found = false; - - msk_owned_by_me(msk); - - rcu_read_lock(); - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { - if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) - continue; - - if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) - continue; - - new_local->addr = entry->addr; - new_local->flags = entry->flags; - new_local->ifindex = entry->ifindex; - found = true; - break; - } - rcu_read_unlock(); - - return found; -} - -static bool -select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk, - struct mptcp_pm_local *new_local) -{ - struct mptcp_pm_addr_entry *entry; - bool found = false; - - rcu_read_lock(); - /* do not keep any additional per socket state, just signal - * the address list in order. - * Note: removal from the local address list during the msk life-cycle - * can lead to additional addresses not being announced. - */ - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { - if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) - continue; - - if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) - continue; - - new_local->addr = entry->addr; - new_local->flags = entry->flags; - new_local->ifindex = entry->ifindex; - found = true; - break; - } - rcu_read_unlock(); - - return found; -} - -unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk) -{ - const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - - return READ_ONCE(pernet->add_addr_signal_max); -} -EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max); - -unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) -{ - struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - - return READ_ONCE(pernet->add_addr_accept_max); -} -EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max); - -unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk) -{ - struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - - return READ_ONCE(pernet->subflows_max); -} -EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max); - -unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk) -{ - struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - - return READ_ONCE(pernet->local_addr_max); -} -EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max); - -bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk) -{ - struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - - if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) || - (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap, - MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) { - WRITE_ONCE(msk->pm.work_pending, false); - return false; - } - return true; -} - -/* Fill all the remote addresses into the array addrs[], - * and return the array size. - */ -static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, - struct mptcp_addr_info *local, - bool fullmesh, - struct mptcp_addr_info *addrs) -{ - bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0); - struct sock *sk = (struct sock *)msk, *ssk; - struct mptcp_subflow_context *subflow; - struct mptcp_addr_info remote = { 0 }; - unsigned int subflows_max; - int i = 0; - - subflows_max = mptcp_pm_get_subflows_max(msk); - mptcp_remote_address((struct sock_common *)sk, &remote); - - /* Non-fullmesh endpoint, fill in the single entry - * corresponding to the primary MPC subflow remote address - */ - if (!fullmesh) { - if (deny_id0) - return 0; - - if (!mptcp_pm_addr_families_match(sk, local, &remote)) - return 0; - - msk->pm.subflows++; - addrs[i++] = remote; - } else { - DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); - - /* Forbid creation of new subflows matching existing - * ones, possibly already created by incoming ADD_ADDR - */ - bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); - mptcp_for_each_subflow(msk, subflow) - if (READ_ONCE(subflow->local_id) == local->id) - __set_bit(subflow->remote_id, unavail_id); - - mptcp_for_each_subflow(msk, subflow) { - ssk = mptcp_subflow_tcp_sock(subflow); - mptcp_remote_address((struct sock_common *)ssk, &addrs[i]); - addrs[i].id = READ_ONCE(subflow->remote_id); - if (deny_id0 && !addrs[i].id) - continue; - - if (test_bit(addrs[i].id, unavail_id)) - continue; - - if (!mptcp_pm_addr_families_match(sk, local, &addrs[i])) - continue; - - if (msk->pm.subflows < subflows_max) { - /* forbid creating multiple address towards - * this id - */ - __set_bit(addrs[i].id, unavail_id); - msk->pm.subflows++; - i++; - } - } - } - - return i; -} - -static struct mptcp_pm_addr_entry * -__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) -{ - struct mptcp_pm_addr_entry *entry; - - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, - lockdep_is_held(&pernet->lock)) { - if (entry->addr.id == id) - return entry; - } - return NULL; -} - -static struct mptcp_pm_addr_entry * -__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) -{ - struct mptcp_pm_addr_entry *entry; - - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, - lockdep_is_held(&pernet->lock)) { - if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) - return entry; - } - return NULL; -} - -static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) -{ - struct sock *sk = (struct sock *)msk; - unsigned int add_addr_signal_max; - bool signal_and_subflow = false; - unsigned int local_addr_max; - struct pm_nl_pernet *pernet; - struct mptcp_pm_local local; - unsigned int subflows_max; - - pernet = pm_nl_get_pernet(sock_net(sk)); - - add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); - local_addr_max = mptcp_pm_get_local_addr_max(msk); - subflows_max = mptcp_pm_get_subflows_max(msk); - - /* do lazy endpoint usage accounting for the MPC subflows */ - if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(msk->first); - struct mptcp_pm_addr_entry *entry; - struct mptcp_addr_info mpc_addr; - bool backup = false; - - mptcp_local_address((struct sock_common *)msk->first, &mpc_addr); - rcu_read_lock(); - entry = __lookup_addr(pernet, &mpc_addr); - if (entry) { - __clear_bit(entry->addr.id, msk->pm.id_avail_bitmap); - msk->mpc_endpoint_id = entry->addr.id; - backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); - } - rcu_read_unlock(); - - if (backup) - mptcp_pm_send_ack(msk, subflow, true, backup); - - msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED); - } - - pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", - msk->pm.local_addr_used, local_addr_max, - msk->pm.add_addr_signaled, add_addr_signal_max, - msk->pm.subflows, subflows_max); - - /* check first for announce */ - if (msk->pm.add_addr_signaled < add_addr_signal_max) { - /* due to racing events on both ends we can reach here while - * previous add address is still running: if we invoke now - * mptcp_pm_announce_addr(), that will fail and the - * corresponding id will be marked as used. - * Instead let the PM machinery reschedule us when the - * current address announce will be completed. - */ - if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL)) - return; - - if (!select_signal_address(pernet, msk, &local)) - goto subflow; - - /* If the alloc fails, we are on memory pressure, not worth - * continuing, and trying to create subflows. - */ - if (!mptcp_pm_alloc_anno_list(msk, &local.addr)) - return; - - __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); - msk->pm.add_addr_signaled++; - - /* Special case for ID0: set the correct ID */ - if (local.addr.id == msk->mpc_endpoint_id) - local.addr.id = 0; - - mptcp_pm_announce_addr(msk, &local.addr, false); - mptcp_pm_addr_send_ack(msk); - - if (local.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) - signal_and_subflow = true; - } - -subflow: - /* check if should create a new subflow */ - while (msk->pm.local_addr_used < local_addr_max && - msk->pm.subflows < subflows_max) { - struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; - bool fullmesh; - int i, nr; - - if (signal_and_subflow) - signal_and_subflow = false; - else if (!select_local_address(pernet, msk, &local)) - break; - - fullmesh = !!(local.flags & MPTCP_PM_ADDR_FLAG_FULLMESH); - - __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); - - /* Special case for ID0: set the correct ID */ - if (local.addr.id == msk->mpc_endpoint_id) - local.addr.id = 0; - else /* local_addr_used is not decr for ID 0 */ - msk->pm.local_addr_used++; - - nr = fill_remote_addresses_vec(msk, &local.addr, fullmesh, addrs); - if (nr == 0) - continue; - - spin_unlock_bh(&msk->pm.lock); - for (i = 0; i < nr; i++) - __mptcp_subflow_connect(sk, &local, &addrs[i]); - spin_lock_bh(&msk->pm.lock); - } - mptcp_pm_nl_check_work_pending(msk); -} - -static void mptcp_pm_nl_fully_established(struct mptcp_sock *msk) -{ - mptcp_pm_create_subflow_or_signal_addr(msk); -} - -static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk) -{ - mptcp_pm_create_subflow_or_signal_addr(msk); -} - -/* Fill all the local addresses into the array addrs[], - * and return the array size. - */ -static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, - struct mptcp_addr_info *remote, - struct mptcp_pm_local *locals) -{ - struct sock *sk = (struct sock *)msk; - struct mptcp_pm_addr_entry *entry; - struct mptcp_addr_info mpc_addr; - struct pm_nl_pernet *pernet; - unsigned int subflows_max; - int i = 0; - - pernet = pm_nl_get_pernet_from_msk(msk); - subflows_max = mptcp_pm_get_subflows_max(msk); - - mptcp_local_address((struct sock_common *)msk, &mpc_addr); - - rcu_read_lock(); - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { - if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH)) - continue; - - if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote)) - continue; - - if (msk->pm.subflows < subflows_max) { - locals[i].addr = entry->addr; - locals[i].flags = entry->flags; - locals[i].ifindex = entry->ifindex; - - /* Special case for ID0: set the correct ID */ - if (mptcp_addresses_equal(&locals[i].addr, &mpc_addr, locals[i].addr.port)) - locals[i].addr.id = 0; - - msk->pm.subflows++; - i++; - } - } - rcu_read_unlock(); - - /* If the array is empty, fill in the single - * 'IPADDRANY' local address - */ - if (!i) { - memset(&locals[i], 0, sizeof(locals[i])); - locals[i].addr.family = -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - remote->family == AF_INET6 && - ipv6_addr_v4mapped(&remote->addr6) ? AF_INET : -#endif - remote->family; - - if (!mptcp_pm_addr_families_match(sk, &locals[i].addr, remote)) - return 0; - - msk->pm.subflows++; - i++; - } - - return i; -} - -static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) -{ - struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX]; - struct sock *sk = (struct sock *)msk; - unsigned int add_addr_accept_max; - struct mptcp_addr_info remote; - unsigned int subflows_max; - bool sf_created = false; - int i, nr; - - add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); - subflows_max = mptcp_pm_get_subflows_max(msk); - - pr_debug("accepted %d:%d remote family %d\n", - msk->pm.add_addr_accepted, add_addr_accept_max, - msk->pm.remote.family); - - remote = msk->pm.remote; - mptcp_pm_announce_addr(msk, &remote, true); - mptcp_pm_addr_send_ack(msk); - - if (lookup_subflow_by_daddr(&msk->conn_list, &remote)) - return; - - /* pick id 0 port, if none is provided the remote address */ - if (!remote.port) - remote.port = sk->sk_dport; - - /* connect to the specified remote address, using whatever - * local address the routing configuration will pick. - */ - nr = fill_local_addresses_vec(msk, &remote, locals); - if (nr == 0) - return; - - spin_unlock_bh(&msk->pm.lock); - for (i = 0; i < nr; i++) - if (__mptcp_subflow_connect(sk, &locals[i], &remote) == 0) - sf_created = true; - spin_lock_bh(&msk->pm.lock); - - if (sf_created) { - /* add_addr_accepted is not decr for ID 0 */ - if (remote.id) - msk->pm.add_addr_accepted++; - if (msk->pm.add_addr_accepted >= add_addr_accept_max || - msk->pm.subflows >= subflows_max) - WRITE_ONCE(msk->pm.accept_addr, false); - } -} - -void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id) -{ - if (rm_id && WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { - /* Note: if the subflow has been closed before, this - * add_addr_accepted counter will not be decremented. - */ - if (--msk->pm.add_addr_accepted < mptcp_pm_get_add_addr_accept_max(msk)) - WRITE_ONCE(msk->pm.accept_addr, true); - } -} - -/* Called under PM lock */ -void __mptcp_pm_kernel_worker(struct mptcp_sock *msk) -{ - struct mptcp_pm_data *pm = &msk->pm; - - if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { - pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); - mptcp_pm_nl_add_addr_received(msk); - } - if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { - pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); - mptcp_pm_nl_fully_established(msk); - } - if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { - pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); - mptcp_pm_nl_subflow_established(msk); - } -} - -static bool address_use_port(struct mptcp_pm_addr_entry *entry) -{ - return (entry->flags & - (MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) == - MPTCP_PM_ADDR_FLAG_SIGNAL; -} - -/* caller must ensure the RCU grace period is already elapsed */ -static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry) -{ - if (entry->lsk) - sock_release(entry->lsk); - kfree(entry); -} - -static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, - struct mptcp_pm_addr_entry *entry, - bool needs_id, bool replace) -{ - struct mptcp_pm_addr_entry *cur, *del_entry = NULL; - unsigned int addr_max; - int ret = -EINVAL; - - spin_lock_bh(&pernet->lock); - /* to keep the code simple, don't do IDR-like allocation for address ID, - * just bail when we exceed limits - */ - if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID) - pernet->next_id = 1; - if (pernet->addrs >= MPTCP_PM_ADDR_MAX) { - ret = -ERANGE; - goto out; - } - if (test_bit(entry->addr.id, pernet->id_bitmap)) { - ret = -EBUSY; - goto out; - } - - /* do not insert duplicate address, differentiate on port only - * singled addresses - */ - if (!address_use_port(entry)) - entry->addr.port = 0; - list_for_each_entry(cur, &pernet->local_addr_list, list) { - if (mptcp_addresses_equal(&cur->addr, &entry->addr, - cur->addr.port || entry->addr.port)) { - /* allow replacing the exiting endpoint only if such - * endpoint is an implicit one and the user-space - * did not provide an endpoint id - */ - if (!(cur->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)) { - ret = -EEXIST; - goto out; - } - if (entry->addr.id) - goto out; - - /* allow callers that only need to look up the local - * addr's id to skip replacement. This allows them to - * avoid calling synchronize_rcu in the packet recv - * path. - */ - if (!replace) { - kfree(entry); - ret = cur->addr.id; - goto out; - } - - pernet->addrs--; - entry->addr.id = cur->addr.id; - list_del_rcu(&cur->list); - del_entry = cur; - break; - } - } - - if (!entry->addr.id && needs_id) { -find_next: - entry->addr.id = find_next_zero_bit(pernet->id_bitmap, - MPTCP_PM_MAX_ADDR_ID + 1, - pernet->next_id); - if (!entry->addr.id && pernet->next_id != 1) { - pernet->next_id = 1; - goto find_next; - } - } - - if (!entry->addr.id && needs_id) - goto out; - - __set_bit(entry->addr.id, pernet->id_bitmap); - if (entry->addr.id > pernet->next_id) - pernet->next_id = entry->addr.id; - - if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { - addr_max = pernet->add_addr_signal_max; - WRITE_ONCE(pernet->add_addr_signal_max, addr_max + 1); - } - if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { - addr_max = pernet->local_addr_max; - WRITE_ONCE(pernet->local_addr_max, addr_max + 1); - } - - pernet->addrs++; - if (!entry->addr.port) - list_add_tail_rcu(&entry->list, &pernet->local_addr_list); - else - list_add_rcu(&entry->list, &pernet->local_addr_list); - ret = entry->addr.id; - -out: - spin_unlock_bh(&pernet->lock); - - /* just replaced an existing entry, free it */ - if (del_entry) { - synchronize_rcu(); - __mptcp_pm_release_addr_entry(del_entry); - } - return ret; -} - -static struct lock_class_key mptcp_slock_keys[2]; -static struct lock_class_key mptcp_keys[2]; - -static int mptcp_pm_nl_create_listen_socket(struct sock *sk, - struct mptcp_pm_addr_entry *entry) -{ - bool is_ipv6 = sk->sk_family == AF_INET6; - int addrlen = sizeof(struct sockaddr_in); - struct sockaddr_storage addr; - struct sock *newsk, *ssk; - int backlog = 1024; - int err; - - err = sock_create_kern(sock_net(sk), entry->addr.family, - SOCK_STREAM, IPPROTO_MPTCP, &entry->lsk); - if (err) - return err; - - newsk = entry->lsk->sk; - if (!newsk) - return -EINVAL; - - /* The subflow socket lock is acquired in a nested to the msk one - * in several places, even by the TCP stack, and this msk is a kernel - * socket: lockdep complains. Instead of propagating the _nested - * modifiers in several places, re-init the lock class for the msk - * socket to an mptcp specific one. - */ - sock_lock_init_class_and_name(newsk, - is_ipv6 ? "mlock-AF_INET6" : "mlock-AF_INET", - &mptcp_slock_keys[is_ipv6], - is_ipv6 ? "msk_lock-AF_INET6" : "msk_lock-AF_INET", - &mptcp_keys[is_ipv6]); - - lock_sock(newsk); - ssk = __mptcp_nmpc_sk(mptcp_sk(newsk)); - release_sock(newsk); - if (IS_ERR(ssk)) - return PTR_ERR(ssk); - - mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family); -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - if (entry->addr.family == AF_INET6) - addrlen = sizeof(struct sockaddr_in6); -#endif - if (ssk->sk_family == AF_INET) - err = inet_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - else if (ssk->sk_family == AF_INET6) - err = inet6_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); -#endif - if (err) - return err; - - /* We don't use mptcp_set_state() here because it needs to be called - * under the msk socket lock. For the moment, that will not bring - * anything more than only calling inet_sk_state_store(), because the - * old status is known (TCP_CLOSE). - */ - inet_sk_state_store(newsk, TCP_LISTEN); - lock_sock(ssk); - WRITE_ONCE(mptcp_subflow_ctx(ssk)->pm_listener, true); - err = __inet_listen_sk(ssk, backlog); - if (!err) - mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED); - release_sock(ssk); - return err; -} - -int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, - struct mptcp_pm_addr_entry *skc) -{ - struct mptcp_pm_addr_entry *entry; - struct pm_nl_pernet *pernet; - int ret; - - pernet = pm_nl_get_pernet_from_msk(msk); - - rcu_read_lock(); - entry = __lookup_addr(pernet, &skc->addr); - ret = entry ? entry->addr.id : -1; - rcu_read_unlock(); - if (ret >= 0) - return ret; - - /* address not found, add to local list */ - entry = kmalloc(sizeof(*entry), GFP_ATOMIC); - if (!entry) - return -ENOMEM; - - *entry = *skc; - entry->addr.port = 0; - ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, true, false); - if (ret < 0) - kfree(entry); - - return ret; -} - -bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc) -{ - struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - struct mptcp_pm_addr_entry *entry; - bool backup; - - rcu_read_lock(); - entry = __lookup_addr(pernet, skc); - backup = entry && !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); - rcu_read_unlock(); - - return backup; -} - #define MPTCP_PM_CMD_GRP_OFFSET 0 #define MPTCP_PM_EV_GRP_OFFSET 1 @@ -886,381 +127,6 @@ int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, return 0; } -static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info) -{ - return pm_nl_get_pernet(genl_info_net(info)); -} - -static int mptcp_nl_add_subflow_or_signal_addr(struct net *net, - struct mptcp_addr_info *addr) -{ - struct mptcp_sock *msk; - long s_slot = 0, s_num = 0; - - while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { - struct sock *sk = (struct sock *)msk; - struct mptcp_addr_info mpc_addr; - - if (!READ_ONCE(msk->fully_established) || - mptcp_pm_is_userspace(msk)) - goto next; - - /* if the endp linked to the init sf is re-added with a != ID */ - mptcp_local_address((struct sock_common *)msk, &mpc_addr); - - lock_sock(sk); - spin_lock_bh(&msk->pm.lock); - if (mptcp_addresses_equal(addr, &mpc_addr, addr->port)) - msk->mpc_endpoint_id = addr->id; - mptcp_pm_create_subflow_or_signal_addr(msk); - spin_unlock_bh(&msk->pm.lock); - release_sock(sk); - -next: - sock_put(sk); - cond_resched(); - } - - return 0; -} - -static bool mptcp_pm_has_addr_attr_id(const struct nlattr *attr, - struct genl_info *info) -{ - struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; - - if (!nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr, - mptcp_pm_address_nl_policy, info->extack) && - tb[MPTCP_PM_ADDR_ATTR_ID]) - return true; - return false; -} - -int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct pm_nl_pernet *pernet = genl_info_pm_nl(info); - struct mptcp_pm_addr_entry addr, *entry; - struct nlattr *attr; - int ret; - - if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR)) - return -EINVAL; - - attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; - ret = mptcp_pm_parse_entry(attr, info, true, &addr); - if (ret < 0) - return ret; - - if (addr.addr.port && !address_use_port(&addr)) { - NL_SET_ERR_MSG_ATTR(info->extack, attr, - "flags must have signal and not subflow when using port"); - return -EINVAL; - } - - if (addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL && - addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) { - NL_SET_ERR_MSG_ATTR(info->extack, attr, - "flags mustn't have both signal and fullmesh"); - return -EINVAL; - } - - if (addr.flags & MPTCP_PM_ADDR_FLAG_IMPLICIT) { - NL_SET_ERR_MSG_ATTR(info->extack, attr, - "can't create IMPLICIT endpoint"); - return -EINVAL; - } - - entry = kzalloc(sizeof(*entry), GFP_KERNEL_ACCOUNT); - if (!entry) { - GENL_SET_ERR_MSG(info, "can't allocate addr"); - return -ENOMEM; - } - - *entry = addr; - if (entry->addr.port) { - ret = mptcp_pm_nl_create_listen_socket(skb->sk, entry); - if (ret) { - GENL_SET_ERR_MSG_FMT(info, "create listen socket error: %d", ret); - goto out_free; - } - } - ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, - !mptcp_pm_has_addr_attr_id(attr, info), - true); - if (ret < 0) { - GENL_SET_ERR_MSG_FMT(info, "too many addresses or duplicate one: %d", ret); - goto out_free; - } - - mptcp_nl_add_subflow_or_signal_addr(sock_net(skb->sk), &entry->addr); - return 0; - -out_free: - __mptcp_pm_release_addr_entry(entry); - return ret; -} - -static u8 mptcp_endp_get_local_id(struct mptcp_sock *msk, - const struct mptcp_addr_info *addr) -{ - return msk->mpc_endpoint_id == addr->id ? 0 : addr->id; -} - -static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, - const struct mptcp_addr_info *addr, - bool force) -{ - struct mptcp_rm_list list = { .nr = 0 }; - bool ret; - - list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr); - - ret = mptcp_remove_anno_list_by_saddr(msk, addr); - if (ret || force) { - spin_lock_bh(&msk->pm.lock); - if (ret) { - __set_bit(addr->id, msk->pm.id_avail_bitmap); - msk->pm.add_addr_signaled--; - } - mptcp_pm_remove_addr(msk, &list); - spin_unlock_bh(&msk->pm.lock); - } - return ret; -} - -static void __mark_subflow_endp_available(struct mptcp_sock *msk, u8 id) -{ - /* If it was marked as used, and not ID 0, decrement local_addr_used */ - if (!__test_and_set_bit(id ? : msk->mpc_endpoint_id, msk->pm.id_avail_bitmap) && - id && !WARN_ON_ONCE(msk->pm.local_addr_used == 0)) - msk->pm.local_addr_used--; -} - -static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, - const struct mptcp_pm_addr_entry *entry) -{ - const struct mptcp_addr_info *addr = &entry->addr; - struct mptcp_rm_list list = { .nr = 1 }; - long s_slot = 0, s_num = 0; - struct mptcp_sock *msk; - - pr_debug("remove_id=%d\n", addr->id); - - while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { - struct sock *sk = (struct sock *)msk; - bool remove_subflow; - - if (mptcp_pm_is_userspace(msk)) - goto next; - - lock_sock(sk); - remove_subflow = mptcp_lookup_subflow_by_saddr(&msk->conn_list, addr); - mptcp_pm_remove_anno_addr(msk, addr, remove_subflow && - !(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)); - - list.ids[0] = mptcp_endp_get_local_id(msk, addr); - if (remove_subflow) { - spin_lock_bh(&msk->pm.lock); - mptcp_pm_rm_subflow(msk, &list); - spin_unlock_bh(&msk->pm.lock); - } - - if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { - spin_lock_bh(&msk->pm.lock); - __mark_subflow_endp_available(msk, list.ids[0]); - spin_unlock_bh(&msk->pm.lock); - } - - if (msk->mpc_endpoint_id == entry->addr.id) - msk->mpc_endpoint_id = 0; - release_sock(sk); - -next: - sock_put(sk); - cond_resched(); - } - - return 0; -} - -static int mptcp_nl_remove_id_zero_address(struct net *net, - struct mptcp_addr_info *addr) -{ - struct mptcp_rm_list list = { .nr = 0 }; - long s_slot = 0, s_num = 0; - struct mptcp_sock *msk; - - list.ids[list.nr++] = 0; - - while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { - struct sock *sk = (struct sock *)msk; - struct mptcp_addr_info msk_local; - - if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk)) - goto next; - - mptcp_local_address((struct sock_common *)msk, &msk_local); - if (!mptcp_addresses_equal(&msk_local, addr, addr->port)) - goto next; - - lock_sock(sk); - spin_lock_bh(&msk->pm.lock); - mptcp_pm_remove_addr(msk, &list); - mptcp_pm_rm_subflow(msk, &list); - __mark_subflow_endp_available(msk, 0); - spin_unlock_bh(&msk->pm.lock); - release_sock(sk); - -next: - sock_put(sk); - cond_resched(); - } - - return 0; -} - -int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct pm_nl_pernet *pernet = genl_info_pm_nl(info); - struct mptcp_pm_addr_entry addr, *entry; - unsigned int addr_max; - struct nlattr *attr; - int ret; - - if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR)) - return -EINVAL; - - attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; - ret = mptcp_pm_parse_entry(attr, info, false, &addr); - if (ret < 0) - return ret; - - /* the zero id address is special: the first address used by the msk - * always gets such an id, so different subflows can have different zero - * id addresses. Additionally zero id is not accounted for in id_bitmap. - * Let's use an 'mptcp_rm_list' instead of the common remove code. - */ - if (addr.addr.id == 0) - return mptcp_nl_remove_id_zero_address(sock_net(skb->sk), &addr.addr); - - spin_lock_bh(&pernet->lock); - entry = __lookup_addr_by_id(pernet, addr.addr.id); - if (!entry) { - NL_SET_ERR_MSG_ATTR(info->extack, attr, "address not found"); - spin_unlock_bh(&pernet->lock); - return -EINVAL; - } - if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { - addr_max = pernet->add_addr_signal_max; - WRITE_ONCE(pernet->add_addr_signal_max, addr_max - 1); - } - if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { - addr_max = pernet->local_addr_max; - WRITE_ONCE(pernet->local_addr_max, addr_max - 1); - } - - pernet->addrs--; - list_del_rcu(&entry->list); - __clear_bit(entry->addr.id, pernet->id_bitmap); - spin_unlock_bh(&pernet->lock); - - mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), entry); - synchronize_rcu(); - __mptcp_pm_release_addr_entry(entry); - - return ret; -} - -static void mptcp_pm_flush_addrs_and_subflows(struct mptcp_sock *msk, - struct list_head *rm_list) -{ - struct mptcp_rm_list alist = { .nr = 0 }, slist = { .nr = 0 }; - struct mptcp_pm_addr_entry *entry; - - list_for_each_entry(entry, rm_list, list) { - if (slist.nr < MPTCP_RM_IDS_MAX && - mptcp_lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) - slist.ids[slist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr); - - if (alist.nr < MPTCP_RM_IDS_MAX && - mptcp_remove_anno_list_by_saddr(msk, &entry->addr)) - alist.ids[alist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr); - } - - spin_lock_bh(&msk->pm.lock); - if (alist.nr) { - msk->pm.add_addr_signaled -= alist.nr; - mptcp_pm_remove_addr(msk, &alist); - } - if (slist.nr) - mptcp_pm_rm_subflow(msk, &slist); - /* Reset counters: maybe some subflows have been removed before */ - bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); - msk->pm.local_addr_used = 0; - spin_unlock_bh(&msk->pm.lock); -} - -static void mptcp_nl_flush_addrs_list(struct net *net, - struct list_head *rm_list) -{ - long s_slot = 0, s_num = 0; - struct mptcp_sock *msk; - - if (list_empty(rm_list)) - return; - - while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { - struct sock *sk = (struct sock *)msk; - - if (!mptcp_pm_is_userspace(msk)) { - lock_sock(sk); - mptcp_pm_flush_addrs_and_subflows(msk, rm_list); - release_sock(sk); - } - - sock_put(sk); - cond_resched(); - } -} - -/* caller must ensure the RCU grace period is already elapsed */ -static void __flush_addrs(struct list_head *list) -{ - while (!list_empty(list)) { - struct mptcp_pm_addr_entry *cur; - - cur = list_entry(list->next, - struct mptcp_pm_addr_entry, list); - list_del_rcu(&cur->list); - __mptcp_pm_release_addr_entry(cur); - } -} - -static void __reset_counters(struct pm_nl_pernet *pernet) -{ - WRITE_ONCE(pernet->add_addr_signal_max, 0); - WRITE_ONCE(pernet->add_addr_accept_max, 0); - WRITE_ONCE(pernet->local_addr_max, 0); - pernet->addrs = 0; -} - -int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct pm_nl_pernet *pernet = genl_info_pm_nl(info); - LIST_HEAD(free_list); - - spin_lock_bh(&pernet->lock); - list_splice_init(&pernet->local_addr_list, &free_list); - __reset_counters(pernet); - pernet->next_id = 1; - bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); - spin_unlock_bh(&pernet->lock); - mptcp_nl_flush_addrs_list(sock_net(skb->sk), &free_list); - synchronize_rcu(); - __flush_addrs(&free_list); - return 0; -} - int mptcp_nl_fill_addr(struct sk_buff *skb, struct mptcp_pm_addr_entry *entry) { @@ -1300,226 +166,6 @@ int mptcp_nl_fill_addr(struct sk_buff *skb, return -EMSGSIZE; } -int mptcp_pm_nl_get_addr(u8 id, struct mptcp_pm_addr_entry *addr, - struct genl_info *info) -{ - struct pm_nl_pernet *pernet = genl_info_pm_nl(info); - struct mptcp_pm_addr_entry *entry; - int ret = -EINVAL; - - rcu_read_lock(); - entry = __lookup_addr_by_id(pernet, id); - if (entry) { - *addr = *entry; - ret = 0; - } - rcu_read_unlock(); - - return ret; -} - -int mptcp_pm_nl_dump_addr(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct net *net = sock_net(msg->sk); - struct mptcp_pm_addr_entry *entry; - struct pm_nl_pernet *pernet; - int id = cb->args[0]; - int i; - - pernet = pm_nl_get_pernet(net); - - rcu_read_lock(); - for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) { - if (test_bit(i, pernet->id_bitmap)) { - entry = __lookup_addr_by_id(pernet, i); - if (!entry) - break; - - if (entry->addr.id <= id) - continue; - - if (mptcp_pm_genl_fill_addr(msg, cb, entry) < 0) - break; - - id = entry->addr.id; - } - } - rcu_read_unlock(); - - cb->args[0] = id; - return msg->len; -} - -static int parse_limit(struct genl_info *info, int id, unsigned int *limit) -{ - struct nlattr *attr = info->attrs[id]; - - if (!attr) - return 0; - - *limit = nla_get_u32(attr); - if (*limit > MPTCP_PM_ADDR_MAX) { - NL_SET_ERR_MSG_ATTR_FMT(info->extack, attr, - "limit greater than maximum (%u)", - MPTCP_PM_ADDR_MAX); - return -EINVAL; - } - return 0; -} - -int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct pm_nl_pernet *pernet = genl_info_pm_nl(info); - unsigned int rcv_addrs, subflows; - int ret; - - spin_lock_bh(&pernet->lock); - rcv_addrs = pernet->add_addr_accept_max; - ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs); - if (ret) - goto unlock; - - subflows = pernet->subflows_max; - ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows); - if (ret) - goto unlock; - - WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs); - WRITE_ONCE(pernet->subflows_max, subflows); - -unlock: - spin_unlock_bh(&pernet->lock); - return ret; -} - -int mptcp_pm_nl_get_limits_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct pm_nl_pernet *pernet = genl_info_pm_nl(info); - struct sk_buff *msg; - void *reply; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, - MPTCP_PM_CMD_GET_LIMITS); - if (!reply) - goto fail; - - if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS, - READ_ONCE(pernet->add_addr_accept_max))) - goto fail; - - if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS, - READ_ONCE(pernet->subflows_max))) - goto fail; - - genlmsg_end(msg, reply); - return genlmsg_reply(msg, info); - -fail: - GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); - nlmsg_free(msg); - return -EMSGSIZE; -} - -static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk, - struct mptcp_addr_info *addr) -{ - struct mptcp_rm_list list = { .nr = 0 }; - - list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr); - - spin_lock_bh(&msk->pm.lock); - mptcp_pm_rm_subflow(msk, &list); - __mark_subflow_endp_available(msk, list.ids[0]); - mptcp_pm_create_subflow_or_signal_addr(msk); - spin_unlock_bh(&msk->pm.lock); -} - -static void mptcp_pm_nl_set_flags_all(struct net *net, - struct mptcp_pm_addr_entry *local, - u8 changed) -{ - u8 is_subflow = !!(local->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW); - u8 bkup = !!(local->flags & MPTCP_PM_ADDR_FLAG_BACKUP); - long s_slot = 0, s_num = 0; - struct mptcp_sock *msk; - - if (changed == MPTCP_PM_ADDR_FLAG_FULLMESH && !is_subflow) - return; - - while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { - struct sock *sk = (struct sock *)msk; - - if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk)) - goto next; - - lock_sock(sk); - if (changed & MPTCP_PM_ADDR_FLAG_BACKUP) - mptcp_pm_mp_prio_send_ack(msk, &local->addr, NULL, bkup); - /* Subflows will only be recreated if the SUBFLOW flag is set */ - if (is_subflow && (changed & MPTCP_PM_ADDR_FLAG_FULLMESH)) - mptcp_pm_nl_fullmesh(msk, &local->addr); - release_sock(sk); - -next: - sock_put(sk); - cond_resched(); - } - - return; -} - -int mptcp_pm_nl_set_flags(struct mptcp_pm_addr_entry *local, - struct genl_info *info) -{ - struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; - u8 changed, mask = MPTCP_PM_ADDR_FLAG_BACKUP | - MPTCP_PM_ADDR_FLAG_FULLMESH; - struct net *net = genl_info_net(info); - struct mptcp_pm_addr_entry *entry; - struct pm_nl_pernet *pernet; - u8 lookup_by_id = 0; - - pernet = pm_nl_get_pernet(net); - - if (local->addr.family == AF_UNSPEC) { - lookup_by_id = 1; - if (!local->addr.id) { - NL_SET_ERR_MSG_ATTR(info->extack, attr, - "missing address ID"); - return -EOPNOTSUPP; - } - } - - spin_lock_bh(&pernet->lock); - entry = lookup_by_id ? __lookup_addr_by_id(pernet, local->addr.id) : - __lookup_addr(pernet, &local->addr); - if (!entry) { - spin_unlock_bh(&pernet->lock); - NL_SET_ERR_MSG_ATTR(info->extack, attr, "address not found"); - return -EINVAL; - } - if ((local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH) && - (entry->flags & (MPTCP_PM_ADDR_FLAG_SIGNAL | - MPTCP_PM_ADDR_FLAG_IMPLICIT))) { - spin_unlock_bh(&pernet->lock); - NL_SET_ERR_MSG_ATTR(info->extack, attr, "invalid addr flags"); - return -EINVAL; - } - - changed = (local->flags ^ entry->flags) & mask; - entry->flags = (entry->flags & ~mask) | (local->flags & mask); - *local = *entry; - spin_unlock_bh(&pernet->lock); - - mptcp_pm_nl_set_flags_all(net, local, changed); - return 0; -} - static void mptcp_nl_mcast_send(struct net *net, struct sk_buff *nlskb, gfp_t gfp) { genlmsg_multicast_netns(&mptcp_genl_family, net, @@ -1864,53 +510,3 @@ struct genl_family mptcp_genl_family __ro_after_init = { .mcgrps = mptcp_pm_mcgrps, .n_mcgrps = ARRAY_SIZE(mptcp_pm_mcgrps), }; - -static int __net_init pm_nl_init_net(struct net *net) -{ - struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); - - INIT_LIST_HEAD_RCU(&pernet->local_addr_list); - - /* Cit. 2 subflows ought to be enough for anybody. */ - pernet->subflows_max = 2; - pernet->next_id = 1; - pernet->stale_loss_cnt = 4; - spin_lock_init(&pernet->lock); - - /* No need to initialize other pernet fields, the struct is zeroed at - * allocation time. - */ - - return 0; -} - -static void __net_exit pm_nl_exit_net(struct list_head *net_list) -{ - struct net *net; - - list_for_each_entry(net, net_list, exit_list) { - struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); - - /* net is removed from namespace list, can't race with - * other modifiers, also netns core already waited for a - * RCU grace period. - */ - __flush_addrs(&pernet->local_addr_list); - } -} - -static struct pernet_operations mptcp_pm_pernet_ops = { - .init = pm_nl_init_net, - .exit_batch = pm_nl_exit_net, - .id = &pm_nl_pernet_id, - .size = sizeof(struct pm_nl_pernet), -}; - -void __init mptcp_pm_nl_init(void) -{ - if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0) - panic("Failed to register MPTCP PM pernet subsystem.\n"); - - if (genl_register_family(&mptcp_genl_family)) - panic("Failed to register MPTCP PM netlink family\n"); -} From 2e7e6e9cda1e330569bc0824e07abf53addc376f Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 7 Mar 2025 12:21:59 +0100 Subject: [PATCH 15/15] mptcp: pm: move Netlink PM helpers to pm_netlink.c Before this patch, the PM code was dispersed in different places: - pm.c had common code for all PMs, but also Netlink specific code that will not be needed with the future BPF path-managers. - pm_netlink.c had common Netlink code. To clarify the code, a reorganisation is suggested here, only by moving code around, and small helper renaming to avoid confusions: - pm_netlink.c now only contains common PM Netlink code: - PM events: this code was already there - shared helpers around Netlink code that were already there as well - shared Netlink commands code from pm.c - pm.c now no longer contain Netlink specific code. - protocol.h has been updated accordingly: - mptcp_nl_fill_addr() no longer need to be exported. The code around the PM is now less confusing, which should help for the maintenance in the long term. This will certainly impact future backports, but because other cleanups have already done recently, and more are coming to ease the addition of a new path-manager controlled with BPF (struct_ops), doing that now seems to be a good time. Also, many issues around the PM have been fixed a few months ago while increasing the code coverage in the selftests, so such big reorganisation can be done with more confidence now. No behavioural changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250307-net-next-mptcp-pm-reorg-v1-15-abef20ada03b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 119 ----------------------------------------- net/mptcp/pm_netlink.c | 119 ++++++++++++++++++++++++++++++++++++++++- net/mptcp/protocol.h | 2 - 3 files changed, 117 insertions(+), 123 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index d02a0b3adfc4..833839d7286e 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -5,12 +5,8 @@ */ #define pr_fmt(fmt) "MPTCP: " fmt -#include -#include #include "protocol.h" - #include "mib.h" -#include "mptcp_pm_gen.h" #define ADD_ADDR_RETRANS_MAX 3 @@ -888,121 +884,6 @@ bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc) return mptcp_pm_nl_is_backup(msk, &skc_local); } -static int mptcp_pm_get_addr(u8 id, struct mptcp_pm_addr_entry *addr, - struct genl_info *info) -{ - if (info->attrs[MPTCP_PM_ATTR_TOKEN]) - return mptcp_userspace_pm_get_addr(id, addr, info); - return mptcp_pm_nl_get_addr(id, addr, info); -} - -int mptcp_pm_nl_get_addr_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct mptcp_pm_addr_entry addr; - struct nlattr *attr; - struct sk_buff *msg; - void *reply; - int ret; - - if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR)) - return -EINVAL; - - attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; - ret = mptcp_pm_parse_entry(attr, info, false, &addr); - if (ret < 0) - return ret; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, - info->genlhdr->cmd); - if (!reply) { - GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); - ret = -EMSGSIZE; - goto fail; - } - - ret = mptcp_pm_get_addr(addr.addr.id, &addr, info); - if (ret) { - NL_SET_ERR_MSG_ATTR(info->extack, attr, "address not found"); - goto fail; - } - - ret = mptcp_nl_fill_addr(msg, &addr); - if (ret) - goto fail; - - genlmsg_end(msg, reply); - ret = genlmsg_reply(msg, info); - return ret; - -fail: - nlmsg_free(msg); - return ret; -} - -int mptcp_pm_genl_fill_addr(struct sk_buff *msg, - struct netlink_callback *cb, - struct mptcp_pm_addr_entry *entry) -{ - void *hdr; - - hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, &mptcp_genl_family, - NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); - if (!hdr) - return -EINVAL; - - if (mptcp_nl_fill_addr(msg, entry) < 0) { - genlmsg_cancel(msg, hdr); - return -EINVAL; - } - - genlmsg_end(msg, hdr); - return 0; -} - -static int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb) -{ - const struct genl_info *info = genl_info_dump(cb); - - if (info->attrs[MPTCP_PM_ATTR_TOKEN]) - return mptcp_userspace_pm_dump_addr(msg, cb); - return mptcp_pm_nl_dump_addr(msg, cb); -} - -int mptcp_pm_nl_get_addr_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - return mptcp_pm_dump_addr(msg, cb); -} - -static int mptcp_pm_set_flags(struct genl_info *info) -{ - struct mptcp_pm_addr_entry loc = { .addr = { .family = AF_UNSPEC }, }; - struct nlattr *attr_loc; - int ret = -EINVAL; - - if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR)) - return ret; - - attr_loc = info->attrs[MPTCP_PM_ATTR_ADDR]; - ret = mptcp_pm_parse_entry(attr_loc, info, false, &loc); - if (ret < 0) - return ret; - - if (info->attrs[MPTCP_PM_ATTR_TOKEN]) - return mptcp_userspace_pm_set_flags(&loc, info); - return mptcp_pm_nl_set_flags(&loc, info); -} - -int mptcp_pm_nl_set_flags_doit(struct sk_buff *skb, struct genl_info *info) -{ - return mptcp_pm_set_flags(info); -} - static void mptcp_pm_subflows_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) { struct mptcp_subflow_context *iter, *subflow = mptcp_subflow_ctx(ssk); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 530b2362a5a3..b2e5bbdcd5df 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -127,8 +127,8 @@ int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, return 0; } -int mptcp_nl_fill_addr(struct sk_buff *skb, - struct mptcp_pm_addr_entry *entry) +static int mptcp_nl_fill_addr(struct sk_buff *skb, + struct mptcp_pm_addr_entry *entry) { struct mptcp_addr_info *addr = &entry->addr; struct nlattr *attr; @@ -166,6 +166,121 @@ int mptcp_nl_fill_addr(struct sk_buff *skb, return -EMSGSIZE; } +static int mptcp_pm_get_addr(u8 id, struct mptcp_pm_addr_entry *addr, + struct genl_info *info) +{ + if (info->attrs[MPTCP_PM_ATTR_TOKEN]) + return mptcp_userspace_pm_get_addr(id, addr, info); + return mptcp_pm_nl_get_addr(id, addr, info); +} + +int mptcp_pm_nl_get_addr_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct mptcp_pm_addr_entry addr; + struct nlattr *attr; + struct sk_buff *msg; + void *reply; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR)) + return -EINVAL; + + attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; + ret = mptcp_pm_parse_entry(attr, info, false, &addr); + if (ret < 0) + return ret; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, + info->genlhdr->cmd); + if (!reply) { + GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); + ret = -EMSGSIZE; + goto fail; + } + + ret = mptcp_pm_get_addr(addr.addr.id, &addr, info); + if (ret) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, "address not found"); + goto fail; + } + + ret = mptcp_nl_fill_addr(msg, &addr); + if (ret) + goto fail; + + genlmsg_end(msg, reply); + ret = genlmsg_reply(msg, info); + return ret; + +fail: + nlmsg_free(msg); + return ret; +} + +int mptcp_pm_genl_fill_addr(struct sk_buff *msg, + struct netlink_callback *cb, + struct mptcp_pm_addr_entry *entry) +{ + void *hdr; + + hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &mptcp_genl_family, + NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); + if (!hdr) + return -EINVAL; + + if (mptcp_nl_fill_addr(msg, entry) < 0) { + genlmsg_cancel(msg, hdr); + return -EINVAL; + } + + genlmsg_end(msg, hdr); + return 0; +} + +static int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb) +{ + const struct genl_info *info = genl_info_dump(cb); + + if (info->attrs[MPTCP_PM_ATTR_TOKEN]) + return mptcp_userspace_pm_dump_addr(msg, cb); + return mptcp_pm_nl_dump_addr(msg, cb); +} + +int mptcp_pm_nl_get_addr_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + return mptcp_pm_dump_addr(msg, cb); +} + +static int mptcp_pm_set_flags(struct genl_info *info) +{ + struct mptcp_pm_addr_entry loc = { .addr = { .family = AF_UNSPEC }, }; + struct nlattr *attr_loc; + int ret = -EINVAL; + + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR)) + return ret; + + attr_loc = info->attrs[MPTCP_PM_ATTR_ADDR]; + ret = mptcp_pm_parse_entry(attr_loc, info, false, &loc); + if (ret < 0) + return ret; + + if (info->attrs[MPTCP_PM_ATTR_TOKEN]) + return mptcp_userspace_pm_set_flags(&loc, info); + return mptcp_pm_nl_set_flags(&loc, info); +} + +int mptcp_pm_nl_set_flags_doit(struct sk_buff *skb, struct genl_info *info) +{ + return mptcp_pm_set_flags(info); +} + static void mptcp_nl_mcast_send(struct net *net, struct sk_buff *nlskb, gfp_t gfp) { genlmsg_multicast_netns(&mptcp_genl_family, net, diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 360d8cfa5279..c51b6a22d5e0 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1057,8 +1057,6 @@ bool mptcp_userspace_pm_active(const struct mptcp_sock *msk); void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow, struct request_sock *req); -int mptcp_nl_fill_addr(struct sk_buff *skb, - struct mptcp_pm_addr_entry *entry); int mptcp_pm_genl_fill_addr(struct sk_buff *msg, struct netlink_callback *cb, struct mptcp_pm_addr_entry *entry);