From c026d33b8f5081969503e322ea30d7c1a6b17dda Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 20 Jun 2023 18:30:14 +0200 Subject: [PATCH 1/9] mptcp: move snd_una update earlier for fallback socket That will avoid an unneeded conditional in both the fast-path and in the fallback case and will simplify a bit the next patch. Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 6 ++++++ net/mptcp/protocol.c | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 8a8083207be4c..4bdcd2b326bd4 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -1119,6 +1119,12 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) mptcp_data_lock(subflow->conn); if (sk_stream_memory_free(sk)) __mptcp_check_push(subflow->conn, sk); + + /* on fallback we just need to ignore the msk-level snd_una, as + * this is really plain TCP + */ + msk->snd_una = READ_ONCE(msk->snd_nxt); + __mptcp_data_acked(subflow->conn); mptcp_data_unlock(subflow->conn); return true; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 992b89c756312..9c756d675d4d1 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1004,12 +1004,6 @@ static void __mptcp_clean_una(struct sock *sk) struct mptcp_data_frag *dtmp, *dfrag; u64 snd_una; - /* on fallback we just need to ignore snd_una, as this is really - * plain TCP - */ - if (__mptcp_check_fallback(msk)) - msk->snd_una = READ_ONCE(msk->snd_nxt); - snd_una = msk->snd_una; list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) From 38967f424b5be79c4c676712e5640d846efd07e3 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 20 Jun 2023 18:30:15 +0200 Subject: [PATCH 2/9] mptcp: track some aggregate data counters Currently there are no data transfer counters accounting for all the subflows used by a given MPTCP socket. The user-space can compute such figures aggregating the subflow info, but that is inaccurate if any subflow is closed before the MPTCP socket itself. Add the new counters in the MPTCP socket itself and expose them via the existing diag and sockopt. While touching mptcp_diag_fill_info(), acquire the relevant locks before fetching the msk data, to ensure better data consistency Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/385 Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 5 +++++ net/mptcp/options.c | 10 ++++++++-- net/mptcp/protocol.c | 11 ++++++++++- net/mptcp/protocol.h | 4 ++++ net/mptcp/sockopt.c | 25 ++++++++++++++++++++----- 5 files changed, 47 insertions(+), 8 deletions(-) diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 32af2d278cb47..a124be6ebbbaf 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -123,6 +123,11 @@ struct mptcp_info { __u8 mptcpi_local_addr_used; __u8 mptcpi_local_addr_max; __u8 mptcpi_csum_enabled; + __u32 mptcpi_retransmits; + __u64 mptcpi_bytes_retrans; + __u64 mptcpi_bytes_sent; + __u64 mptcpi_bytes_received; + __u64 mptcpi_bytes_acked; }; /* diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 4bdcd2b326bd4..c254accb14dee 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -1026,6 +1026,12 @@ u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq) return cur_seq; } +static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una) +{ + msk->bytes_acked += new_snd_una - msk->snd_una; + msk->snd_una = new_snd_una; +} + static void ack_update_msk(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_options_received *mp_opt) @@ -1057,7 +1063,7 @@ static void ack_update_msk(struct mptcp_sock *msk, __mptcp_check_push(sk, ssk); if (after64(new_snd_una, old_snd_una)) { - msk->snd_una = new_snd_una; + __mptcp_snd_una_update(msk, new_snd_una); __mptcp_data_acked(sk); } mptcp_data_unlock(sk); @@ -1123,7 +1129,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) /* on fallback we just need to ignore the msk-level snd_una, as * this is really plain TCP */ - msk->snd_una = READ_ONCE(msk->snd_nxt); + __mptcp_snd_una_update(msk, READ_ONCE(msk->snd_nxt)); __mptcp_data_acked(subflow->conn); mptcp_data_unlock(subflow->conn); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 9c756d675d4d1..d5b8e488bce16 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -377,6 +377,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { /* in sequence */ + msk->bytes_received += copy_len; WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len); tail = skb_peek_tail(&sk->sk_receive_queue); if (tail && mptcp_try_coalesce(sk, tail, skb)) @@ -760,6 +761,7 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk) MPTCP_SKB_CB(skb)->map_seq += delta; __skb_queue_tail(&sk->sk_receive_queue, skb); } + msk->bytes_received += end_seq - msk->ack_seq; msk->ack_seq = end_seq; moved = true; } @@ -1531,8 +1533,10 @@ static void mptcp_update_post_push(struct mptcp_sock *msk, * that has been handed to the subflow for transmission * and skip update in case it was old dfrag. */ - if (likely(after64(snd_nxt_new, msk->snd_nxt))) + if (likely(after64(snd_nxt_new, msk->snd_nxt))) { + msk->bytes_sent += snd_nxt_new - msk->snd_nxt; msk->snd_nxt = snd_nxt_new; + } } void mptcp_check_and_set_pending(struct sock *sk) @@ -2590,6 +2594,7 @@ static void __mptcp_retrans(struct sock *sk) } if (copied) { dfrag->already_sent = max(dfrag->already_sent, info.sent); + msk->bytes_retrans += copied; tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); WRITE_ONCE(msk->allow_infinite_fallback, false); @@ -3102,6 +3107,10 @@ static int mptcp_disconnect(struct sock *sk, int flags) WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); mptcp_pm_data_reset(msk); mptcp_ca_reset(sk); + msk->bytes_acked = 0; + msk->bytes_received = 0; + msk->bytes_sent = 0; + msk->bytes_retrans = 0; WRITE_ONCE(sk->sk_shutdown, 0); sk_error_report(sk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 47b46602870ea..27adfcc5aaa2c 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -262,10 +262,13 @@ struct mptcp_sock { u64 local_key; u64 remote_key; u64 write_seq; + u64 bytes_sent; u64 snd_nxt; + u64 bytes_received; u64 ack_seq; atomic64_t rcv_wnd_sent; u64 rcv_data_fin_seq; + u64 bytes_retrans; int rmem_fwd_alloc; struct sock *last_snd; int snd_burst; @@ -274,6 +277,7 @@ struct mptcp_sock { * recovery related fields are under data_lock * protection */ + u64 bytes_acked; u64 snd_una; u64 wnd_end; unsigned long timer_ival; diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index e172a5848b0de..fa5055d5b029d 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -889,7 +889,9 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) { + struct sock *sk = (struct sock *)msk; u32 flags = 0; + bool slow; memset(info, 0, sizeof(*info)); @@ -898,6 +900,9 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used); + if (inet_sk_state_load(sk) == TCP_LISTEN) + return; + /* The following limits only make sense for the in-kernel PM */ if (mptcp_pm_is_kernel(msk)) { info->mptcpi_subflows_max = @@ -915,11 +920,21 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) if (READ_ONCE(msk->can_ack)) flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED; info->mptcpi_flags = flags; - info->mptcpi_token = READ_ONCE(msk->token); - info->mptcpi_write_seq = READ_ONCE(msk->write_seq); - info->mptcpi_snd_una = READ_ONCE(msk->snd_una); - info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); - info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled); + mptcp_data_lock(sk); + info->mptcpi_snd_una = msk->snd_una; + info->mptcpi_rcv_nxt = msk->ack_seq; + info->mptcpi_bytes_acked = msk->bytes_acked; + mptcp_data_unlock(sk); + + slow = lock_sock_fast(sk); + info->mptcpi_csum_enabled = msk->csum_enabled; + info->mptcpi_token = msk->token; + info->mptcpi_write_seq = msk->write_seq; + info->mptcpi_retransmits = inet_csk(sk)->icsk_retransmits; + info->mptcpi_bytes_sent = msk->bytes_sent; + info->mptcpi_bytes_received = msk->bytes_received; + info->mptcpi_bytes_retrans = msk->bytes_retrans; + unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(mptcp_diag_fill_info); From 5dcff89e14555a4ee6cfa132b82f7d13dcb1e80a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 20 Jun 2023 18:30:16 +0200 Subject: [PATCH 3/9] selftests: mptcp: explicitly tests aggregate counters Update the existing sockopt test-case to do some basic checks on the newly added counters. Link: https://github.com/multipath-tcp/mptcp_net-next/issues/385 Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- .../selftests/net/mptcp/mptcp_sockopt.c | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c index b35148edbf024..5ee710b30f100 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c @@ -51,6 +51,11 @@ struct mptcp_info { __u8 mptcpi_local_addr_used; __u8 mptcpi_local_addr_max; __u8 mptcpi_csum_enabled; + __u32 mptcpi_retransmits; + __u64 mptcpi_bytes_retrans; + __u64 mptcpi_bytes_sent; + __u64 mptcpi_bytes_received; + __u64 mptcpi_bytes_acked; }; struct mptcp_subflow_data { @@ -83,8 +88,10 @@ struct mptcp_subflow_addrs { struct so_state { struct mptcp_info mi; + struct mptcp_info last_sample; uint64_t mptcpi_rcv_delta; uint64_t tcpi_rcv_delta; + bool pkt_stats_avail; }; #ifndef MIN @@ -322,8 +329,9 @@ static void do_getsockopt_mptcp_info(struct so_state *s, int fd, size_t w) if (ret < 0) die_perror("getsockopt MPTCP_INFO"); - assert(olen == sizeof(i)); + s->pkt_stats_avail = olen >= sizeof(i); + s->last_sample = i; if (s->mi.mptcpi_write_seq == 0) s->mi = i; @@ -562,6 +570,23 @@ static void process_one_client(int fd, int pipefd) do_getsockopts(&s, fd, ret, ret2); if (s.mptcpi_rcv_delta != (uint64_t)ret + 1) xerror("mptcpi_rcv_delta %" PRIu64 ", expect %" PRIu64, s.mptcpi_rcv_delta, ret + 1, s.mptcpi_rcv_delta - ret); + + /* be nice when running on top of older kernel */ + if (s.pkt_stats_avail) { + if (s.last_sample.mptcpi_bytes_sent != ret2) + xerror("mptcpi_bytes_sent %" PRIu64 ", expect %" PRIu64, + s.last_sample.mptcpi_bytes_sent, ret2, + s.last_sample.mptcpi_bytes_sent - ret2); + if (s.last_sample.mptcpi_bytes_received != ret) + xerror("mptcpi_bytes_received %" PRIu64 ", expect %" PRIu64, + s.last_sample.mptcpi_bytes_received, ret, + s.last_sample.mptcpi_bytes_received - ret); + if (s.last_sample.mptcpi_bytes_acked != ret) + xerror("mptcpi_bytes_acked %" PRIu64 ", expect %" PRIu64, + s.last_sample.mptcpi_bytes_acked, ret2, + s.last_sample.mptcpi_bytes_acked - ret2); + } + close(fd); } From 6f06b4d4d1cc676a3f9d947f931ec3866b6c4f6c Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 20 Jun 2023 18:30:17 +0200 Subject: [PATCH 4/9] mptcp: add subflow unique id The user-space need to properly account the data received/sent by individual subflows. When additional subflows are created and/or closed during the MPTCP socket lifetime, the information currently exposed via MPTCP_TCPINFO are not enough: subflows are identified only by the sequential position inside the info dumps, and that will change with the above mentioned events. To solve the above problem, this patch introduces a new subflow identifier that is unique inside the given MPTCP socket scope. The initial subflow get the id 1 and the other subflows get incremental values at join time. Link: https://github.com/multipath-tcp/mptcp_net-next/issues/388 Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 6 ++++++ net/mptcp/protocol.h | 5 ++++- net/mptcp/subflow.c | 2 ++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index d5b8e488bce16..4ebd6e9aa949e 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -96,6 +96,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) list_add(&subflow->node, &msk->conn_list); sock_hold(ssock->sk); subflow->request_mptcp = 1; + subflow->subflow_id = msk->subflow_id++; /* This is the first subflow, always with id 0 */ subflow->local_id_valid = 1; @@ -847,6 +848,7 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) if (sk->sk_socket && !ssk->sk_socket) mptcp_sock_graft(ssk, sk->sk_socket); + mptcp_subflow_ctx(ssk)->subflow_id = msk->subflow_id++; mptcp_sockopt_sync_locked(msk, ssk); mptcp_subflow_joined(msk, ssk); return true; @@ -2732,6 +2734,7 @@ static int __mptcp_init_sock(struct sock *sk) WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); WRITE_ONCE(msk->allow_infinite_fallback, true); msk->recovery = false; + msk->subflow_id = 1; mptcp_pm_data_init(msk); @@ -3160,6 +3163,9 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd; msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; + /* passive msk is created after the first/MPC subflow */ + msk->subflow_id = 2; + sock_reset_flag(nsk, SOCK_RCU_FREE); security_inet_csk_clone(nsk, req); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 27adfcc5aaa2c..bb4cacd927789 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -323,7 +323,8 @@ struct mptcp_sock { u64 rtt_us; /* last maximum rtt of subflows */ } rcvq_space; - u32 setsockopt_seq; + u32 subflow_id; + u32 setsockopt_seq; char ca_name[TCP_CA_NAME_MAX]; struct mptcp_sock *dl_next; }; @@ -504,6 +505,8 @@ struct mptcp_subflow_context { u8 reset_reason:4; u8 stale_count; + u32 subflow_id; + long delegated_status; unsigned long fail_tout; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 4688daa6b38b7..222dfcdadf2ee 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -819,6 +819,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, if (!ctx->conn) goto fallback; + ctx->subflow_id = 1; owner = mptcp_sk(ctx->conn); mptcp_pm_new_connection(owner, child, 1); @@ -1574,6 +1575,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, subflow->remote_id = remote_id; subflow->request_join = 1; subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP); + subflow->subflow_id = msk->subflow_id++; mptcp_info2sockaddr(remote, &addr, ssk->sk_family); sock_hold(ssk); From 492432074e4fce4f8880213bf009b47adbf94a3a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 20 Jun 2023 18:30:18 +0200 Subject: [PATCH 5/9] mptcp: introduce MPTCP_FULL_INFO getsockopt Some user-space applications want to monitor the subflows utilization. Dumping the per subflow tcp_info is not enough, as the PM could close and re-create the subflows under-the-hood, fooling the accounting. Even checking the src/dst addresses used by each subflow could not be enough, because new subflows could re-use the same address/port of the just closed one. This patch introduces a new socket option, allow dumping all the relevant information all-at-once (everything, everywhere...), in a consistent manner. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/388 Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 24 +++++++ net/mptcp/sockopt.c | 127 ++++++++++++++++++++++++++++++++++++- 2 files changed, 149 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index a124be6ebbbaf..ee9c49f949a2c 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -249,9 +249,33 @@ struct mptcp_subflow_addrs { }; }; +struct mptcp_subflow_info { + __u32 id; + struct mptcp_subflow_addrs addrs; +}; + +struct mptcp_full_info { + __u32 size_tcpinfo_kernel; /* must be 0, set by kernel */ + __u32 size_tcpinfo_user; + __u32 size_sfinfo_kernel; /* must be 0, set by kernel */ + __u32 size_sfinfo_user; + __u32 num_subflows; /* must be 0, set by kernel (real subflow count) */ + __u32 size_arrays_user; /* max subflows that userspace is interested in; + * the buffers at subflow_info/tcp_info + * are respectively at least: + * size_arrays * size_sfinfo_user + * size_arrays * size_tcpinfo_user + * bytes wide + */ + __aligned_u64 subflow_info; + __aligned_u64 tcp_info; + struct mptcp_info mptcp_info; +}; + /* MPTCP socket options */ #define MPTCP_INFO 1 #define MPTCP_TCPINFO 2 #define MPTCP_SUBFLOW_ADDRS 3 +#define MPTCP_FULL_INFO 4 #endif /* _UAPI_MPTCP_H */ diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index fa5055d5b029d..63f7a09335c5c 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -14,7 +14,8 @@ #include #include "protocol.h" -#define MIN_INFO_OPTLEN_SIZE 16 +#define MIN_INFO_OPTLEN_SIZE 16 +#define MIN_FULL_INFO_OPTLEN_SIZE 40 static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk) { @@ -981,7 +982,8 @@ static int mptcp_put_subflow_data(struct mptcp_subflow_data *sfd, } static int mptcp_get_subflow_data(struct mptcp_subflow_data *sfd, - char __user *optval, int __user *optlen) + char __user *optval, + int __user *optlen) { int len, copylen; @@ -1162,6 +1164,125 @@ static int mptcp_getsockopt_subflow_addrs(struct mptcp_sock *msk, char __user *o return 0; } +static int mptcp_get_full_info(struct mptcp_full_info *mfi, + char __user *optval, + int __user *optlen) +{ + int len; + + BUILD_BUG_ON(offsetof(struct mptcp_full_info, mptcp_info) != + MIN_FULL_INFO_OPTLEN_SIZE); + + if (get_user(len, optlen)) + return -EFAULT; + + if (len < MIN_FULL_INFO_OPTLEN_SIZE) + return -EINVAL; + + memset(mfi, 0, sizeof(*mfi)); + if (copy_from_user(mfi, optval, MIN_FULL_INFO_OPTLEN_SIZE)) + return -EFAULT; + + if (mfi->size_tcpinfo_kernel || + mfi->size_sfinfo_kernel || + mfi->num_subflows) + return -EINVAL; + + if (mfi->size_sfinfo_user > INT_MAX || + mfi->size_tcpinfo_user > INT_MAX) + return -EINVAL; + + return len - MIN_FULL_INFO_OPTLEN_SIZE; +} + +static int mptcp_put_full_info(struct mptcp_full_info *mfi, + char __user *optval, + u32 copylen, + int __user *optlen) +{ + copylen += MIN_FULL_INFO_OPTLEN_SIZE; + if (put_user(copylen, optlen)) + return -EFAULT; + + if (copy_to_user(optval, mfi, copylen)) + return -EFAULT; + return 0; +} + +static int mptcp_getsockopt_full_info(struct mptcp_sock *msk, char __user *optval, + int __user *optlen) +{ + unsigned int sfcount = 0, copylen = 0; + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + void __user *tcpinfoptr, *sfinfoptr; + struct mptcp_full_info mfi; + int len; + + len = mptcp_get_full_info(&mfi, optval, optlen); + if (len < 0) + return len; + + /* don't bother filling the mptcp info if there is not enough + * user-space-provided storage + */ + if (len > 0) { + mptcp_diag_fill_info(msk, &mfi.mptcp_info); + copylen += min_t(unsigned int, len, sizeof(struct mptcp_info)); + } + + mfi.size_tcpinfo_kernel = sizeof(struct tcp_info); + mfi.size_tcpinfo_user = min_t(unsigned int, mfi.size_tcpinfo_user, + sizeof(struct tcp_info)); + sfinfoptr = u64_to_user_ptr(mfi.subflow_info); + mfi.size_sfinfo_kernel = sizeof(struct mptcp_subflow_info); + mfi.size_sfinfo_user = min_t(unsigned int, mfi.size_sfinfo_user, + sizeof(struct mptcp_subflow_info)); + tcpinfoptr = u64_to_user_ptr(mfi.tcp_info); + + lock_sock(sk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + struct mptcp_subflow_info sfinfo; + struct tcp_info tcp_info; + + if (sfcount++ >= mfi.size_arrays_user) + continue; + + /* fetch addr/tcp_info only if the user space buffers + * are wide enough + */ + memset(&sfinfo, 0, sizeof(sfinfo)); + sfinfo.id = subflow->subflow_id; + if (mfi.size_sfinfo_user > + offsetof(struct mptcp_subflow_info, addrs)) + mptcp_get_sub_addrs(ssk, &sfinfo.addrs); + if (copy_to_user(sfinfoptr, &sfinfo, mfi.size_sfinfo_user)) + goto fail_release; + + if (mfi.size_tcpinfo_user) { + tcp_get_info(ssk, &tcp_info); + if (copy_to_user(tcpinfoptr, &tcp_info, + mfi.size_tcpinfo_user)) + goto fail_release; + } + + tcpinfoptr += mfi.size_tcpinfo_user; + sfinfoptr += mfi.size_sfinfo_user; + } + release_sock(sk); + + mfi.num_subflows = sfcount; + if (mptcp_put_full_info(&mfi, optval, copylen, optlen)) + return -EFAULT; + + return 0; + +fail_release: + release_sock(sk); + return -EFAULT; +} + static int mptcp_put_int_option(struct mptcp_sock *msk, char __user *optval, int __user *optlen, int val) { @@ -1235,6 +1356,8 @@ static int mptcp_getsockopt_sol_mptcp(struct mptcp_sock *msk, int optname, switch (optname) { case MPTCP_INFO: return mptcp_getsockopt_info(msk, optval, optlen); + case MPTCP_FULL_INFO: + return mptcp_getsockopt_full_info(msk, optval, optlen); case MPTCP_TCPINFO: return mptcp_getsockopt_tcpinfo(msk, optval, optlen); case MPTCP_SUBFLOW_ADDRS: From aa723d5b3541bfcf9b7493fc1c47e6af6a11b765 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 20 Jun 2023 18:30:19 +0200 Subject: [PATCH 6/9] selftests: mptcp: add MPTCP_FULL_INFO testcase Add a testcase explicitly triggering the newly introduce MPTCP_FULL_INFO getsockopt. Link: https://github.com/multipath-tcp/mptcp_net-next/issues/388 Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- .../selftests/net/mptcp/mptcp_sockopt.c | 93 ++++++++++++++++++- 1 file changed, 91 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c index 5ee710b30f100..926b0be87c990 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c @@ -86,9 +86,38 @@ struct mptcp_subflow_addrs { #define MPTCP_SUBFLOW_ADDRS 3 #endif +#ifndef MPTCP_FULL_INFO +struct mptcp_subflow_info { + __u32 id; + struct mptcp_subflow_addrs addrs; +}; + +struct mptcp_full_info { + __u32 size_tcpinfo_kernel; /* must be 0, set by kernel */ + __u32 size_tcpinfo_user; + __u32 size_sfinfo_kernel; /* must be 0, set by kernel */ + __u32 size_sfinfo_user; + __u32 num_subflows; /* must be 0, set by kernel (real subflow count) */ + __u32 size_arrays_user; /* max subflows that userspace is interested in; + * the buffers at subflow_info/tcp_info + * are respectively at least: + * size_arrays * size_sfinfo_user + * size_arrays * size_tcpinfo_user + * bytes wide + */ + __aligned_u64 subflow_info; + __aligned_u64 tcp_info; + struct mptcp_info mptcp_info; +}; + +#define MPTCP_FULL_INFO 4 +#endif + struct so_state { struct mptcp_info mi; struct mptcp_info last_sample; + struct tcp_info tcp_info; + struct mptcp_subflow_addrs addrs; uint64_t mptcpi_rcv_delta; uint64_t tcpi_rcv_delta; bool pkt_stats_avail; @@ -370,6 +399,8 @@ static void do_getsockopt_tcp_info(struct so_state *s, int fd, size_t r, size_t olen -= sizeof(struct mptcp_subflow_data); assert(olen == ti.d.size_user); + s->tcp_info = ti.ti[0]; + if (ti.ti[0].tcpi_bytes_sent == w && ti.ti[0].tcpi_bytes_received == r) goto done; @@ -391,7 +422,7 @@ static void do_getsockopt_tcp_info(struct so_state *s, int fd, size_t r, size_t do_getsockopt_bogus_sf_data(fd, MPTCP_TCPINFO); } -static void do_getsockopt_subflow_addrs(int fd) +static void do_getsockopt_subflow_addrs(struct so_state *s, int fd) { struct sockaddr_storage remote, local; socklen_t olen, rlen, llen; @@ -439,6 +470,7 @@ static void do_getsockopt_subflow_addrs(int fd) assert(memcmp(&local, &addrs.addr[0].ss_local, sizeof(local)) == 0); assert(memcmp(&remote, &addrs.addr[0].ss_remote, sizeof(remote)) == 0); + s->addrs = addrs.addr[0]; memset(&addrs, 0, sizeof(addrs)); @@ -459,13 +491,70 @@ static void do_getsockopt_subflow_addrs(int fd) do_getsockopt_bogus_sf_data(fd, MPTCP_SUBFLOW_ADDRS); } +static void do_getsockopt_mptcp_full_info(struct so_state *s, int fd) +{ + size_t data_size = sizeof(struct mptcp_full_info); + struct mptcp_subflow_info sfinfo[2]; + struct tcp_info tcp_info[2]; + struct mptcp_full_info mfi; + socklen_t olen; + int ret; + + memset(&mfi, 0, data_size); + memset(tcp_info, 0, sizeof(tcp_info)); + memset(sfinfo, 0, sizeof(sfinfo)); + + mfi.size_tcpinfo_user = sizeof(struct tcp_info); + mfi.size_sfinfo_user = sizeof(struct mptcp_subflow_info); + mfi.size_arrays_user = 2; + mfi.subflow_info = (unsigned long)&sfinfo[0]; + mfi.tcp_info = (unsigned long)&tcp_info[0]; + olen = data_size; + + ret = getsockopt(fd, SOL_MPTCP, MPTCP_FULL_INFO, &mfi, &olen); + if (ret < 0) { + if (errno == EOPNOTSUPP) { + perror("MPTCP_FULL_INFO test skipped"); + return; + } + xerror("getsockopt MPTCP_FULL_INFO"); + } + + assert(olen <= data_size); + assert(mfi.size_tcpinfo_kernel > 0); + assert(mfi.size_tcpinfo_user == + MIN(mfi.size_tcpinfo_kernel, sizeof(struct tcp_info))); + assert(mfi.size_sfinfo_kernel > 0); + assert(mfi.size_sfinfo_user == + MIN(mfi.size_sfinfo_kernel, sizeof(struct mptcp_subflow_info))); + assert(mfi.num_subflows == 1); + + /* Tolerate future extension to mptcp_info struct and running newer + * test on top of older kernel. + * Anyway any kernel supporting MPTCP_FULL_INFO must at least include + * the following in mptcp_info. + */ + assert(olen > (socklen_t)__builtin_offsetof(struct mptcp_full_info, tcp_info)); + assert(mfi.mptcp_info.mptcpi_subflows == 0); + assert(mfi.mptcp_info.mptcpi_bytes_sent == s->last_sample.mptcpi_bytes_sent); + assert(mfi.mptcp_info.mptcpi_bytes_received == s->last_sample.mptcpi_bytes_received); + + assert(sfinfo[0].id == 1); + assert(tcp_info[0].tcpi_bytes_sent == s->tcp_info.tcpi_bytes_sent); + assert(tcp_info[0].tcpi_bytes_received == s->tcp_info.tcpi_bytes_received); + assert(!memcmp(&sfinfo->addrs, &s->addrs, sizeof(struct mptcp_subflow_addrs))); +} + static void do_getsockopts(struct so_state *s, int fd, size_t r, size_t w) { do_getsockopt_mptcp_info(s, fd, w); do_getsockopt_tcp_info(s, fd, r, w); - do_getsockopt_subflow_addrs(fd); + do_getsockopt_subflow_addrs(s, fd); + + if (r) + do_getsockopt_mptcp_full_info(s, fd); } static void connect_one_server(int fd, int pipefd) From 00079f18c24f373797cc38273c1bc0b475469d2b Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Tue, 20 Jun 2023 18:30:20 +0200 Subject: [PATCH 7/9] selftests: mptcp: join: skip check if MIB counter not supported (part 2) Selftests are supposed to run on any kernels, including the old ones not supporting all MPTCP features. One of them is the MPTCP MIB counters introduced in commit fc518953bc9c ("mptcp: add and use MIB counter infrastructure") and more later. The MPTCP Join selftest heavily relies on these counters. If a counter is not supported by the kernel, it is not displayed when using 'nstat -z'. We can then detect that and skip the verification. A new helper (get_counter()) has been added recently in the -net tree to do the required checks and return an error if the counter is not available. This commit is similar to the one with the same title applied in the -net tree but it modifies code only present in net-next for the moment, see the Fixes commit below. While at it, we can also remove the use of ${extra_msg} variable which is never assigned in chk_rm_tx_nr() function and use 'echo' without '-n' parameter. Link: https://github.com/multipath-tcp/mptcp_net-next/issues/368 Fixes: 0639fa230a21 ("selftests: mptcp: add explicit check for new mibs") Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/mptcp/mptcp_join.sh | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 1b68fe1c08853..a7973d6a40a0d 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -1683,12 +1683,12 @@ chk_add_tx_nr() timeout=$(ip netns exec $ns1 sysctl -n net.mptcp.add_addr_timeout) printf "%-${nr_blank}s %s" " " "add TX" - count=$(ip netns exec $ns1 nstat -as MPTcpExtAddAddrTx | grep MPTcpExtAddAddrTx | awk '{print $2}') - [ -z "$count" ] && count=0 - + count=$(get_counter ${ns1} "MPTcpExtAddAddrTx") + if [ -z "$count" ]; then + echo -n "[skip]" # if the test configured a short timeout tolerate greater then expected # add addrs options, due to retransmissions - if [ "$count" != "$add_tx_nr" ] && { [ "$timeout" -gt 1 ] || [ "$count" -lt "$add_tx_nr" ]; }; then + elif [ "$count" != "$add_tx_nr" ] && { [ "$timeout" -gt 1 ] || [ "$count" -lt "$add_tx_nr" ]; }; then echo "[fail] got $count ADD_ADDR[s] TX, expected $add_tx_nr" fail_test else @@ -1696,9 +1696,10 @@ chk_add_tx_nr() fi echo -n " - echo TX " - count=$(ip netns exec $ns2 nstat -as MPTcpExtEchoAddTx | grep MPTcpExtEchoAddTx | awk '{print $2}') - [ -z "$count" ] && count=0 - if [ "$count" != "$echo_tx_nr" ]; then + count=$(get_counter ${ns2} "MPTcpExtEchoAddTx") + if [ -z "$count" ]; then + echo "[skip]" + elif [ "$count" != "$echo_tx_nr" ]; then echo "[fail] got $count ADD_ADDR echo[s] TX, expected $echo_tx_nr" fail_test else @@ -1734,9 +1735,10 @@ chk_rm_nr() fi printf "%-${nr_blank}s %s" " " "rm " - count=$(ip netns exec $addr_ns nstat -as MPTcpExtRmAddr | grep MPTcpExtRmAddr | awk '{print $2}') - [ -z "$count" ] && count=0 - if [ "$count" != "$rm_addr_nr" ]; then + count=$(get_counter ${addr_ns} "MPTcpExtRmAddr") + if [ -z "$count" ]; then + echo -n "[skip]" + elif [ "$count" != "$rm_addr_nr" ]; then echo "[fail] got $count RM_ADDR[s] expected $rm_addr_nr" fail_test else @@ -1778,16 +1780,15 @@ chk_rm_tx_nr() local rm_addr_tx_nr=$1 printf "%-${nr_blank}s %s" " " "rm TX " - count=$(ip netns exec $ns2 nstat -as MPTcpExtRmAddrTx | grep MPTcpExtRmAddrTx | awk '{print $2}') - [ -z "$count" ] && count=0 - if [ "$count" != "$rm_addr_tx_nr" ]; then + count=$(get_counter ${ns2} "MPTcpExtRmAddrTx") + if [ -z "$count" ]; then + echo "[skip]" + elif [ "$count" != "$rm_addr_tx_nr" ]; then echo "[fail] got $count RM_ADDR[s] expected $rm_addr_tx_nr" fail_test else - echo -n "[ ok ]" + echo "[ ok ]" fi - - echo "$extra_msg" } chk_prio_nr() From bbd49d114d5790a2dc3f67926cec3fc48b2d5d81 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 20 Jun 2023 18:30:21 +0200 Subject: [PATCH 8/9] mptcp: consolidate transition to TCP_CLOSE in mptcp_do_fastclose() The MPTCP code always set the msk state to TCP_CLOSE before calling performing the fast-close. Move such state transition in mptcp_do_fastclose() to avoid some code duplication. Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 4ebd6e9aa949e..f65eec3e0d225 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2655,6 +2655,7 @@ static void mptcp_do_fastclose(struct sock *sk) struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); + inet_sk_state_store(sk, TCP_CLOSE); mptcp_for_each_subflow_safe(msk, subflow, tmp) __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, MPTCP_CF_FASTCLOSE); @@ -2692,10 +2693,9 @@ static void mptcp_worker(struct work_struct *work) * even if it is orphaned and in FIN_WAIT2 state */ if (sock_flag(sk, SOCK_DEAD)) { - if (mptcp_should_close(sk)) { - inet_sk_state_store(sk, TCP_CLOSE); + if (mptcp_should_close(sk)) mptcp_do_fastclose(sk); - } + if (sk->sk_state == TCP_CLOSE) { __mptcp_destroy_sock(sk); goto unlock; @@ -2938,7 +2938,6 @@ static void __mptcp_destroy_sock(struct sock *sk) void __mptcp_unaccepted_force_close(struct sock *sk) { sock_set_flag(sk, SOCK_DEAD); - inet_sk_state_store(sk, TCP_CLOSE); mptcp_do_fastclose(sk); __mptcp_destroy_sock(sk); } @@ -2980,7 +2979,6 @@ bool __mptcp_close(struct sock *sk, long timeout) /* If the msk has read data, or the caller explicitly ask it, * do the MPTCP equivalent of TCP reset, aka MPTCP fastclose */ - inet_sk_state_store(sk, TCP_CLOSE); mptcp_do_fastclose(sk); timeout = 0; } else if (mptcp_close_state(sk)) { From 528cb5f2a1e859522f36f091f29f5c81ec6d4a4c Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 20 Jun 2023 18:30:22 +0200 Subject: [PATCH 9/9] mptcp: pass addr to mptcp_pm_alloc_anno_list Pass addr parameter to mptcp_pm_alloc_anno_list() instead of entry. We can reduce the scope, e.g. in mptcp_pm_alloc_anno_list(), we only access "entry->addr", we can then restrict to the pointer to "addr" then. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 8 ++++---- net/mptcp/pm_userspace.c | 2 +- net/mptcp/protocol.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index a56718ffdd024..547e51c654801 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -341,7 +341,7 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, } bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, - const struct mptcp_pm_addr_entry *entry) + const struct mptcp_addr_info *addr) { struct mptcp_pm_add_entry *add_entry = NULL; struct sock *sk = (struct sock *)msk; @@ -349,7 +349,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, lockdep_assert_held(&msk->pm.lock); - add_entry = mptcp_lookup_anno_list_by_saddr(msk, &entry->addr); + add_entry = mptcp_lookup_anno_list_by_saddr(msk, addr); if (add_entry) { if (mptcp_pm_is_kernel(msk)) @@ -366,7 +366,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, list_add(&add_entry->list, &msk->pm.anno_list); - add_entry->addr = entry->addr; + add_entry->addr = *addr; add_entry->sock = msk; add_entry->retrans_times = 0; @@ -576,7 +576,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) return; if (local) { - if (mptcp_pm_alloc_anno_list(msk, local)) { + if (mptcp_pm_alloc_anno_list(msk, &local->addr)) { __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); msk->pm.add_addr_signaled++; mptcp_pm_announce_addr(msk, &local->addr, false); diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 47a883a16c11d..b5a8aa4c1ebd6 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -193,7 +193,7 @@ int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info) lock_sock((struct sock *)msk); spin_lock_bh(&msk->pm.lock); - if (mptcp_pm_alloc_anno_list(msk, &addr_val)) { + if (mptcp_pm_alloc_anno_list(msk, &addr_val.addr)) { msk->pm.add_addr_signaled++; mptcp_pm_announce_addr(msk, &addr_val.addr, false); mptcp_pm_nl_addr_send_ack(msk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index bb4cacd927789..3a1a64cdeba6f 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -817,7 +817,7 @@ int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, struct mptcp_addr_info *rem, u8 bkup); bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, - const struct mptcp_pm_addr_entry *entry); + const struct mptcp_addr_info *addr); void mptcp_pm_free_anno_list(struct mptcp_sock *msk); bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk); struct mptcp_pm_add_entry *