From b713d0067574e15cddd08ee7f25acd535b7437cf Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 4 May 2022 14:54:04 -0700 Subject: [PATCH 1/5] mptcp: really share subflow snd_wnd As per RFC, mptcp subflows use a "shared" snd_wnd: the effective window is the maximum among the current values received on all subflows. Without such feature a data transfer using multiple subflows could block. Window sharing is currently implemented in the RX side: __tcp_select_window uses the mptcp-level receive buffer to compute the announced window. That is not enough: the TCP stack will stick to the window size received on the given subflow; we need to propagate the msk window value on each subflow at xmit time. Change the packet scheduler to ignore the subflow level window and use instead the msk level one Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 7a9e2545884f4..0e7f556ea2c72 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1141,19 +1141,20 @@ struct mptcp_sendmsg_info { bool data_lock_held; }; -static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq, - int avail_size) +static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *ssk, + u64 data_seq, int avail_size) { u64 window_end = mptcp_wnd_end(msk); + u64 mptcp_snd_wnd; if (__mptcp_check_fallback(msk)) return avail_size; - if (!before64(data_seq + avail_size, window_end)) { - u64 allowed_size = window_end - data_seq; + mptcp_snd_wnd = window_end - data_seq; + avail_size = min_t(unsigned int, mptcp_snd_wnd, avail_size); - return min_t(unsigned int, allowed_size, avail_size); - } + if (unlikely(tcp_sk(ssk)->snd_wnd < mptcp_snd_wnd)) + tcp_sk(ssk)->snd_wnd = min_t(u64, U32_MAX, mptcp_snd_wnd); return avail_size; } @@ -1305,7 +1306,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, } /* Zero window and all data acked? Probe. */ - copy = mptcp_check_allowed_size(msk, data_seq, copy); + copy = mptcp_check_allowed_size(msk, ssk, data_seq, copy); if (copy == 0) { u64 snd_una = READ_ONCE(msk->snd_una); @@ -1498,11 +1499,16 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) * to check that subflow has a non empty cwin. */ ssk = send_info[SSK_MODE_ACTIVE].ssk; - if (!ssk || !sk_stream_memory_free(ssk) || !tcp_sk(ssk)->snd_wnd) + if (!ssk || !sk_stream_memory_free(ssk)) return NULL; - burst = min_t(int, MPTCP_SEND_BURST_SIZE, tcp_sk(ssk)->snd_wnd); + burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt); wmem = READ_ONCE(ssk->sk_wmem_queued); + if (!burst) { + msk->last_snd = NULL; + return ssk; + } + subflow = mptcp_subflow_ctx(ssk); subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem + READ_ONCE(ssk->sk_pacing_rate) * burst, From 92be2f522777b775a2d83b00c3690732c1243dfc Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 4 May 2022 14:54:05 -0700 Subject: [PATCH 2/5] mptcp: add mib for xmit window sharing Bump a counter for counter when snd_wnd is shared among subflow, for observability's sake. Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/mib.c | 1 + net/mptcp/mib.h | 1 + net/mptcp/protocol.c | 4 +++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index d93a8c9996fd0..6a6f8151375a6 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -56,6 +56,7 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED), SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE), SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER), + SNMP_MIB_ITEM("SndWndShared", MPTCP_MIB_SNDWNDSHARED), SNMP_MIB_SENTINEL }; diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index 529d07af9e145..2411510bef66a 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -49,6 +49,7 @@ enum linux_mptcp_mib_field { MPTCP_MIB_RCVPRUNED, /* Incoming packet dropped due to memory limit */ MPTCP_MIB_SUBFLOWSTALE, /* Subflows entered 'stale' status */ MPTCP_MIB_SUBFLOWRECOVER, /* Subflows returned to active status after being stale */ + MPTCP_MIB_SNDWNDSHARED, /* Subflow snd wnd is overridden by msk's one */ __MPTCP_MIB_MAX }; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 0e7f556ea2c72..961c2aba2a110 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1153,8 +1153,10 @@ static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *s mptcp_snd_wnd = window_end - data_seq; avail_size = min_t(unsigned int, mptcp_snd_wnd, avail_size); - if (unlikely(tcp_sk(ssk)->snd_wnd < mptcp_snd_wnd)) + if (unlikely(tcp_sk(ssk)->snd_wnd < mptcp_snd_wnd)) { tcp_sk(ssk)->snd_wnd = min_t(u64, U32_MAX, mptcp_snd_wnd); + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_SNDWNDSHARED); + } return avail_size; } From ea66758c1795cef81dc59cd5240e9d0f5c5207cf Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 4 May 2022 14:54:06 -0700 Subject: [PATCH 3/5] tcp: allow MPTCP to update the announced window The MPTCP RFC requires that the MPTCP-level receive window's right edge never moves backward. Currently the MPTCP code enforces such constraint while tracking the right edge, but it does not reflects it on the wire, as MPTCP lacks a suitable hook to update accordingly the TCP header. This change modifies the existing mptcp_write_options() hook, providing the current packet's TCP header to the MPTCP protocol, so that the next patch could implement the above mentioned constraint. No functional changes intended. Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- include/net/mptcp.h | 2 +- net/ipv4/tcp_output.c | 14 ++++++++------ net/mptcp/options.c | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 8b1afd6f5cc46..d4ec894ce67b0 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -125,7 +125,7 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, struct mptcp_out_options *opts); bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb); -void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, +void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, struct mptcp_out_options *opts); void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5f91a9536e006..b092228e43426 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -445,12 +445,13 @@ struct tcp_out_options { struct mptcp_out_options mptcp; }; -static void mptcp_options_write(__be32 *ptr, const struct tcp_sock *tp, +static void mptcp_options_write(struct tcphdr *th, __be32 *ptr, + struct tcp_sock *tp, struct tcp_out_options *opts) { #if IS_ENABLED(CONFIG_MPTCP) if (unlikely(OPTION_MPTCP & opts->options)) - mptcp_write_options(ptr, tp, &opts->mptcp); + mptcp_write_options(th, ptr, tp, &opts->mptcp); #endif } @@ -606,9 +607,10 @@ static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, * At least SACK_PERM as the first option is known to lead to a disaster * (but it may well be that other scenarios fail similarly). */ -static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, +static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, struct tcp_out_options *opts) { + __be32 *ptr = (__be32 *)(th + 1); u16 options = opts->options; /* mungable copy */ if (unlikely(OPTION_MD5 & options)) { @@ -702,7 +704,7 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, smc_options_write(ptr, &options); - mptcp_options_write(ptr, tp, opts); + mptcp_options_write(th, ptr, tp, opts); } static void smc_set_option(const struct tcp_sock *tp, @@ -1355,7 +1357,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, th->window = htons(min(tp->rcv_wnd, 65535U)); } - tcp_options_write((__be32 *)(th + 1), tp, &opts); + tcp_options_write(th, tp, &opts); #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ @@ -3591,7 +3593,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ th->window = htons(min(req->rsk_rcv_wnd, 65535U)); - tcp_options_write((__be32 *)(th + 1), NULL, &opts); + tcp_options_write(th, NULL, &opts); th->doff = (tcp_header_size >> 2); __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); diff --git a/net/mptcp/options.c b/net/mptcp/options.c index e05d9458a0250..2570911735ab5 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -1265,7 +1265,7 @@ static u16 mptcp_make_csum(const struct mptcp_ext *mpext) ~csum_unfold(mpext->csum)); } -void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, +void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, struct mptcp_out_options *opts) { const struct sock *ssk = (const struct sock *)tp; From f3589be0c420a3137e5902d15705ced6a36f3f43 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 4 May 2022 14:54:07 -0700 Subject: [PATCH 4/5] mptcp: never shrink offered window As per RFC, the offered MPTCP-level window should never shrink. While we currently track the right edge, we don't enforce the above constraint on the wire. Additionally, concurrent xmit on different subflows can end-up in erroneous right edge update. Address the above explicitly updating the announced window and protecting the update with an additional atomic operation (sic) Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 52 ++++++++++++++++++++++++++++++++++++++------ net/mptcp/protocol.c | 8 +++---- net/mptcp/protocol.h | 2 +- 3 files changed, 50 insertions(+), 12 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 2570911735ab5..3e3156cfe813d 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -1224,20 +1224,58 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) return true; } -static void mptcp_set_rwin(const struct tcp_sock *tp) +static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th) { const struct sock *ssk = (const struct sock *)tp; - const struct mptcp_subflow_context *subflow; + struct mptcp_subflow_context *subflow; + u64 ack_seq, rcv_wnd_old, rcv_wnd_new; struct mptcp_sock *msk; - u64 ack_seq; + u32 new_win; + u64 win; subflow = mptcp_subflow_ctx(ssk); msk = mptcp_sk(subflow->conn); - ack_seq = READ_ONCE(msk->ack_seq) + tp->rcv_wnd; + ack_seq = READ_ONCE(msk->ack_seq); + rcv_wnd_new = ack_seq + tp->rcv_wnd; + + rcv_wnd_old = atomic64_read(&msk->rcv_wnd_sent); + if (after64(rcv_wnd_new, rcv_wnd_old)) { + u64 rcv_wnd; - if (after64(ack_seq, READ_ONCE(msk->rcv_wnd_sent))) - WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); + for (;;) { + rcv_wnd = atomic64_cmpxchg(&msk->rcv_wnd_sent, rcv_wnd_old, rcv_wnd_new); + + if (rcv_wnd == rcv_wnd_old) + break; + if (before64(rcv_wnd_new, rcv_wnd)) + goto raise_win; + rcv_wnd_old = rcv_wnd; + } + return; + } + + if (rcv_wnd_new != rcv_wnd_old) { +raise_win: + win = rcv_wnd_old - ack_seq; + tp->rcv_wnd = min_t(u64, win, U32_MAX); + new_win = tp->rcv_wnd; + + /* Make sure we do not exceed the maximum possible + * scaled window. + */ + if (unlikely(th->syn)) + new_win = min(new_win, 65535U) << tp->rx_opt.rcv_wscale; + if (!tp->rx_opt.rcv_wscale && + sock_net(ssk)->ipv4.sysctl_tcp_workaround_signed_windows) + new_win = min(new_win, MAX_TCP_WINDOW); + else + new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); + + /* RFC1323 scaling applied */ + new_win >>= tp->rx_opt.rcv_wscale; + th->window = htons(new_win); + } } u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum) @@ -1554,7 +1592,7 @@ void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, } if (tp) - mptcp_set_rwin(tp); + mptcp_set_rwin(tp, th); } __be32 mptcp_get_reset_option(const struct sk_buff *skb) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 961c2aba2a110..9e46cc89a8f76 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -216,7 +216,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) seq = MPTCP_SKB_CB(skb)->map_seq; end_seq = MPTCP_SKB_CB(skb)->end_seq; - max_seq = READ_ONCE(msk->rcv_wnd_sent); + max_seq = atomic64_read(&msk->rcv_wnd_sent); pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq, RB_EMPTY_ROOT(&msk->out_of_order_queue)); @@ -225,7 +225,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) mptcp_drop(sk, skb); pr_debug("oow by %lld, rcv_wnd_sent %llu\n", (unsigned long long)end_seq - (unsigned long)max_seq, - (unsigned long long)msk->rcv_wnd_sent); + (unsigned long long)atomic64_read(&msk->rcv_wnd_sent)); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW); return; } @@ -3004,7 +3004,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq); ack_seq++; WRITE_ONCE(msk->ack_seq, ack_seq); - WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); + atomic64_set(&msk->rcv_wnd_sent, ack_seq); } sock_reset_flag(nsk, SOCK_RCU_FREE); @@ -3297,9 +3297,9 @@ void mptcp_finish_connect(struct sock *ssk) WRITE_ONCE(msk->write_seq, subflow->idsn + 1); WRITE_ONCE(msk->snd_nxt, msk->write_seq); WRITE_ONCE(msk->ack_seq, ack_seq); - WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); WRITE_ONCE(msk->can_ack, 1); WRITE_ONCE(msk->snd_una, msk->write_seq); + atomic64_set(&msk->rcv_wnd_sent, ack_seq); mptcp_pm_new_connection(msk, ssk, 0); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index f542aeaa5b09d..4672901d0dfed 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -257,7 +257,7 @@ struct mptcp_sock { u64 write_seq; u64 snd_nxt; u64 ack_seq; - u64 rcv_wnd_sent; + atomic64_t rcv_wnd_sent; u64 rcv_data_fin_seq; int rmem_fwd_alloc; struct sock *last_snd; From 38acb6260f60a7698c3a24db4df6ec1cf8f14c60 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 4 May 2022 14:54:08 -0700 Subject: [PATCH 5/5] mptcp: add more offered MIBs counter Track the exceptional handling of MPTCP-level offered window with a few more counters for observability. Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/mib.c | 3 +++ net/mptcp/mib.h | 5 +++++ net/mptcp/options.c | 6 +++++- 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index 6a6f8151375a6..0dac2863c6e1e 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -57,6 +57,9 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE), SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER), SNMP_MIB_ITEM("SndWndShared", MPTCP_MIB_SNDWNDSHARED), + SNMP_MIB_ITEM("RcvWndShared", MPTCP_MIB_RCVWNDSHARED), + SNMP_MIB_ITEM("RcvWndConflictUpdate", MPTCP_MIB_RCVWNDCONFLICTUPDATE), + SNMP_MIB_ITEM("RcvWndConflict", MPTCP_MIB_RCVWNDCONFLICT), SNMP_MIB_SENTINEL }; diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index 2411510bef66a..2be3596374f4e 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -50,6 +50,11 @@ enum linux_mptcp_mib_field { MPTCP_MIB_SUBFLOWSTALE, /* Subflows entered 'stale' status */ MPTCP_MIB_SUBFLOWRECOVER, /* Subflows returned to active status after being stale */ MPTCP_MIB_SNDWNDSHARED, /* Subflow snd wnd is overridden by msk's one */ + MPTCP_MIB_RCVWNDSHARED, /* Subflow rcv wnd is overridden by msk's one */ + MPTCP_MIB_RCVWNDCONFLICTUPDATE, /* subflow rcv wnd is overridden by msk's one due to + * conflict with another subflow while updating msk rcv wnd + */ + MPTCP_MIB_RCVWNDCONFLICT, /* Conflict with while updating msk rcv wnd */ __MPTCP_MIB_MAX }; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 3e3156cfe813d..ac3b7b8a02f6e 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -1248,8 +1248,11 @@ static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th) if (rcv_wnd == rcv_wnd_old) break; - if (before64(rcv_wnd_new, rcv_wnd)) + if (before64(rcv_wnd_new, rcv_wnd)) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICTUPDATE); goto raise_win; + } + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT); rcv_wnd_old = rcv_wnd; } return; @@ -1275,6 +1278,7 @@ static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th) /* RFC1323 scaling applied */ new_win >>= tp->rx_opt.rcv_wscale; th->window = htons(new_win); + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDSHARED); } }