diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 0a3b0fb04a3b9..8b1afd6f5cc46 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -35,7 +35,8 @@ struct mptcp_ext { frozen:1, reset_transient:1; u8 reset_reason:4, - csum_reqd:1; + csum_reqd:1, + infinite_map:1; }; #define MPTCP_RM_IDS_MAX 8 diff --git a/include/trace/events/mptcp.h b/include/trace/events/mptcp.h index f8e28e686c65f..563e48617374d 100644 --- a/include/trace/events/mptcp.h +++ b/include/trace/events/mptcp.h @@ -84,6 +84,7 @@ DECLARE_EVENT_CLASS(mptcp_dump_mpext, __field(u8, reset_transient) __field(u8, reset_reason) __field(u8, csum_reqd) + __field(u8, infinite_map) ), TP_fast_assign( @@ -102,9 +103,10 @@ DECLARE_EVENT_CLASS(mptcp_dump_mpext, __entry->reset_transient = mpext->reset_transient; __entry->reset_reason = mpext->reset_reason; __entry->csum_reqd = mpext->csum_reqd; + __entry->infinite_map = mpext->infinite_map; ), - TP_printk("data_ack=%llu data_seq=%llu subflow_seq=%u data_len=%u csum=%x use_map=%u dsn64=%u data_fin=%u use_ack=%u ack64=%u mpc_map=%u frozen=%u reset_transient=%u reset_reason=%u csum_reqd=%u", + TP_printk("data_ack=%llu data_seq=%llu subflow_seq=%u data_len=%u csum=%x use_map=%u dsn64=%u data_fin=%u use_ack=%u ack64=%u mpc_map=%u frozen=%u reset_transient=%u reset_reason=%u csum_reqd=%u infinite_map=%u", __entry->data_ack, __entry->data_seq, __entry->subflow_seq, __entry->data_len, __entry->csum, __entry->use_map, @@ -112,7 +114,7 @@ DECLARE_EVENT_CLASS(mptcp_dump_mpext, __entry->use_ack, __entry->ack64, __entry->mpc_map, __entry->frozen, __entry->reset_transient, __entry->reset_reason, - __entry->csum_reqd) + __entry->csum_reqd, __entry->infinite_map) ); DEFINE_EVENT(mptcp_dump_mpext, mptcp_sendmsg_frag, diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index e55d3dfbee0c3..d93a8c9996fd0 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -24,6 +24,7 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX), SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH), + SNMP_MIB_ITEM("InfiniteMapTx", MPTCP_MIB_INFINITEMAPTX), SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX), SNMP_MIB_ITEM("DSSNoMatchTCP", MPTCP_MIB_DSSTCPMISMATCH), SNMP_MIB_ITEM("DataCsumErr", MPTCP_MIB_DATACSUMERR), diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index 00576179a619f..529d07af9e145 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -17,6 +17,7 @@ enum linux_mptcp_mib_field { MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */ MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */ MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */ + MPTCP_MIB_INFINITEMAPTX, /* Sent an infinite mapping */ MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */ MPTCP_MIB_DSSTCPMISMATCH, /* DSS-mapping did not map with TCP's sequence numbers */ MPTCP_MIB_DATACSUMERR, /* The data checksum fail */ diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 325383646f5c0..88f4ebbd6515b 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -825,7 +825,7 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, opts->suboptions = 0; - if (unlikely(__mptcp_check_fallback(msk))) + if (unlikely(__mptcp_check_fallback(msk) && !mptcp_check_infinite_map(skb))) return false; if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) { @@ -1340,8 +1340,12 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, put_unaligned_be32(mpext->subflow_seq, ptr); ptr += 1; if (opts->csum_reqd) { + /* data_len == 0 is reserved for the infinite mapping, + * the checksum will also be set to 0. + */ put_unaligned_be32(mpext->data_len << 16 | - mptcp_make_csum(mpext), ptr); + (mpext->data_len ? mptcp_make_csum(mpext) : 0), + ptr); } else { put_unaligned_be32(mpext->data_len << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 8aa0cdb7ad46a..5c36870d3420b 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -285,7 +285,13 @@ void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup) void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + pr_debug("fail_seq=%llu", fail_seq); + + if (!mptcp_has_another_subflow(sk) && READ_ONCE(msk->allow_infinite_fallback)) + subflow->send_infinite_map = 1; } /* path manager helpers */ diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 0492aa9308c77..4581c570ef682 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1229,6 +1229,22 @@ static void mptcp_update_data_checksum(struct sk_buff *skb, int added) mpext->csum = csum_fold(csum_block_add(csum, skb_checksum(skb, offset, added, 0), offset)); } +static void mptcp_update_infinite_map(struct mptcp_sock *msk, + struct sock *ssk, + struct mptcp_ext *mpext) +{ + if (!mpext) + return; + + mpext->infinite_map = 1; + mpext->data_len = 0; + + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX); + mptcp_subflow_ctx(ssk)->send_infinite_map = 0; + pr_fallback(msk); + __mptcp_do_fallback(msk); +} + static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct mptcp_data_frag *dfrag, struct mptcp_sendmsg_info *info) @@ -1360,6 +1376,8 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, out: if (READ_ONCE(msk->csum_enabled)) mptcp_update_data_checksum(skb, copy); + if (mptcp_subflow_ctx(ssk)->send_infinite_map) + mptcp_update_infinite_map(msk, ssk, mpext); trace_mptcp_sendmsg_frag(mpext); mptcp_subflow_ctx(ssk)->rel_write_seq += copy; return copy; @@ -2465,6 +2483,7 @@ static void __mptcp_retrans(struct sock *sk) dfrag->already_sent = max(dfrag->already_sent, info.sent); tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); + WRITE_ONCE(msk->allow_infinite_fallback, false); } release_sock(ssk); @@ -2539,6 +2558,7 @@ static int __mptcp_init_sock(struct sock *sk) msk->first = NULL; inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); + WRITE_ONCE(msk->allow_infinite_fallback, true); msk->recovery = false; mptcp_pm_data_init(msk); @@ -3275,6 +3295,7 @@ bool mptcp_finish_join(struct sock *ssk) } subflow->map_seq = READ_ONCE(msk->ack_seq); + WRITE_ONCE(msk->allow_infinite_fallback, false); out: mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index aca1fb56523f6..61d600693ffd0 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -263,6 +263,7 @@ struct mptcp_sock { bool rcv_fastclose; bool use_64bit_ack; /* Set when we received a 64-bit DSN */ bool csum_enabled; + bool allow_infinite_fallback; u8 recvmsg_inq:1, cork:1, nodelay:1; @@ -440,6 +441,7 @@ struct mptcp_subflow_context { send_mp_prio : 1, send_mp_fail : 1, send_fastclose : 1, + send_infinite_map : 1, rx_eof : 1, can_ack : 1, /* only after processing the remote a key */ disposable : 1, /* ctx can be free at ulp release time */ @@ -876,6 +878,17 @@ static inline void mptcp_do_fallback(struct sock *sk) #define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)", __func__, a) +static inline bool mptcp_check_infinite_map(struct sk_buff *skb) +{ + struct mptcp_ext *mpext; + + mpext = skb ? mptcp_get_ext(skb) : NULL; + if (mpext && mpext->infinite_map) + return true; + + return false; +} + static inline bool subflow_simultaneous_connect(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index aba260f547daa..30ffb00661bb9 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1006,7 +1006,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk, data_len = mpext->data_len; if (data_len == 0) { + pr_debug("infinite mapping received"); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX); + subflow->map_data_len = 0; return MAPPING_INVALID; } @@ -1203,35 +1205,39 @@ static bool subflow_check_data_avail(struct sock *ssk) return false; fallback: - /* RFC 8684 section 3.7. */ - if (subflow->send_mp_fail) { - if (mptcp_has_another_subflow(ssk)) { - while ((skb = skb_peek(&ssk->sk_receive_queue))) - sk_eat_skb(ssk, skb); + if (!__mptcp_check_fallback(msk)) { + /* RFC 8684 section 3.7. */ + if (subflow->send_mp_fail) { + if (mptcp_has_another_subflow(ssk) || + !READ_ONCE(msk->allow_infinite_fallback)) { + ssk->sk_err = EBADMSG; + tcp_set_state(ssk, TCP_CLOSE); + subflow->reset_transient = 0; + subflow->reset_reason = MPTCP_RST_EMIDDLEBOX; + tcp_send_active_reset(ssk, GFP_ATOMIC); + while ((skb = skb_peek(&ssk->sk_receive_queue))) + sk_eat_skb(ssk, skb); + } + WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); + return true; } - ssk->sk_err = EBADMSG; - tcp_set_state(ssk, TCP_CLOSE); - subflow->reset_transient = 0; - subflow->reset_reason = MPTCP_RST_EMIDDLEBOX; - tcp_send_active_reset(ssk, GFP_ATOMIC); - WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); - return true; - } - if (subflow->mp_join || subflow->fully_established) { - /* fatal protocol error, close the socket. - * subflow_error_report() will introduce the appropriate barriers - */ - ssk->sk_err = EBADMSG; - tcp_set_state(ssk, TCP_CLOSE); - subflow->reset_transient = 0; - subflow->reset_reason = MPTCP_RST_EMPTCP; - tcp_send_active_reset(ssk, GFP_ATOMIC); - WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); - return false; + if ((subflow->mp_join || subflow->fully_established) && subflow->map_data_len) { + /* fatal protocol error, close the socket. + * subflow_error_report() will introduce the appropriate barriers + */ + ssk->sk_err = EBADMSG; + tcp_set_state(ssk, TCP_CLOSE); + subflow->reset_transient = 0; + subflow->reset_reason = MPTCP_RST_EMPTCP; + tcp_send_active_reset(ssk, GFP_ATOMIC); + WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); + return false; + } + + __mptcp_do_fallback(msk); } - __mptcp_do_fallback(msk); skb = skb_peek(&ssk->sk_receive_queue); subflow->map_valid = 1; subflow->map_seq = READ_ONCE(msk->ack_seq); @@ -1483,6 +1489,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, /* discard the subflow socket */ mptcp_sock_graft(ssk, sk->sk_socket); iput(SOCK_INODE(sf)); + WRITE_ONCE(msk->allow_infinite_fallback, false); return err; failed_unlink: diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 7314257d248a7..9eb4d889a24a1 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -1106,6 +1106,38 @@ chk_rst_nr() echo "$extra_msg" } +chk_infi_nr() +{ + local infi_tx=$1 + local infi_rx=$2 + local count + local dump_stats + + printf "%-${nr_blank}s %s" " " "itx" + count=$(ip netns exec $ns2 nstat -as | grep InfiniteMapTx | awk '{print $2}') + [ -z "$count" ] && count=0 + if [ "$count" != "$infi_tx" ]; then + echo "[fail] got $count infinite map[s] TX expected $infi_tx" + fail_test + dump_stats=1 + else + echo -n "[ ok ]" + fi + + echo -n " - infirx" + count=$(ip netns exec $ns1 nstat -as | grep InfiniteMapRx | awk '{print $2}') + [ -z "$count" ] && count=0 + if [ "$count" != "$infi_rx" ]; then + echo "[fail] got $count infinite map[s] RX expected $infi_rx" + fail_test + dump_stats=1 + else + echo "[ ok ]" + fi + + [ "${dump_stats}" = 1 ] && dump_stats +} + chk_join_nr() { local syn_nr=$1 @@ -1115,7 +1147,8 @@ chk_join_nr() local csum_ns2=${5:-0} local fail_nr=${6:-0} local rst_nr=${7:-0} - local corrupted_pkts=${8:-0} + local infi_nr=${8:-0} + local corrupted_pkts=${9:-0} local count local dump_stats local with_cookie @@ -1170,6 +1203,7 @@ chk_join_nr() chk_csum_nr $csum_ns1 $csum_ns2 chk_fail_nr $fail_nr $fail_nr chk_rst_nr $rst_nr $rst_nr + chk_infi_nr $infi_nr $infi_nr fi }