diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/networking/mptcp-sysctl.rst index 3b352e5f6300c..ee06fd782465a 100644 --- a/Documentation/networking/mptcp-sysctl.rst +++ b/Documentation/networking/mptcp-sysctl.rst @@ -24,3 +24,11 @@ add_addr_timeout - INTEGER (seconds) sysctl. Default: 120 + +checksum_enabled - BOOLEAN + Control whether DSS checksum can be enabled. + + DSS checksum can be enabled if the value is nonzero. This is a + per-namespace sysctl. + + Default: 0 diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 83f23774b9084..d61bbbf119792 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -23,6 +23,7 @@ struct mptcp_ext { u64 data_seq; u32 subflow_seq; u16 data_len; + __sum16 csum; u8 use_map:1, dsn64:1, data_fin:1, @@ -31,7 +32,8 @@ struct mptcp_ext { mpc_map:1, frozen:1, reset_transient:1; - u8 reset_reason:4; + u8 reset_reason:4, + csum_reqd:1; }; #define MPTCP_RM_IDS_MAX 8 @@ -63,8 +65,9 @@ struct mptcp_out_options { struct mptcp_rm_list rm_list; u8 join_id; u8 backup; - u8 reset_reason:4; - u8 reset_transient:1; + u8 reset_reason:4, + reset_transient:1, + csum_reqd:1; u32 nonce; u64 thmac; u32 token; diff --git a/include/trace/events/mptcp.h b/include/trace/events/mptcp.h index 775a46d0b0f07..6bf43176f14c1 100644 --- a/include/trace/events/mptcp.h +++ b/include/trace/events/mptcp.h @@ -73,6 +73,7 @@ DECLARE_EVENT_CLASS(mptcp_dump_mpext, __field(u64, data_seq) __field(u32, subflow_seq) __field(u16, data_len) + __field(u16, csum) __field(u8, use_map) __field(u8, dsn64) __field(u8, data_fin) @@ -82,6 +83,7 @@ DECLARE_EVENT_CLASS(mptcp_dump_mpext, __field(u8, frozen) __field(u8, reset_transient) __field(u8, reset_reason) + __field(u8, csum_reqd) ), TP_fast_assign( @@ -89,6 +91,7 @@ DECLARE_EVENT_CLASS(mptcp_dump_mpext, __entry->data_seq = mpext->data_seq; __entry->subflow_seq = mpext->subflow_seq; __entry->data_len = mpext->data_len; + __entry->csum = (__force u16)mpext->csum; __entry->use_map = mpext->use_map; __entry->dsn64 = mpext->dsn64; __entry->data_fin = mpext->data_fin; @@ -98,16 +101,18 @@ DECLARE_EVENT_CLASS(mptcp_dump_mpext, __entry->frozen = mpext->frozen; __entry->reset_transient = mpext->reset_transient; __entry->reset_reason = mpext->reset_reason; + __entry->csum_reqd = mpext->csum_reqd; ), - TP_printk("data_ack=%llu data_seq=%llu subflow_seq=%u data_len=%u use_map=%u dsn64=%u data_fin=%u use_ack=%u ack64=%u mpc_map=%u frozen=%u reset_transient=%u reset_reason=%u", + TP_printk("data_ack=%llu data_seq=%llu subflow_seq=%u data_len=%u csum=%x use_map=%u dsn64=%u data_fin=%u use_ack=%u ack64=%u mpc_map=%u frozen=%u reset_transient=%u reset_reason=%u csum_reqd=%u", __entry->data_ack, __entry->data_seq, __entry->subflow_seq, __entry->data_len, - __entry->use_map, __entry->dsn64, - __entry->data_fin, __entry->use_ack, - __entry->ack64, __entry->mpc_map, - __entry->frozen, __entry->reset_transient, - __entry->reset_reason) + __entry->csum, __entry->use_map, + __entry->dsn64, __entry->data_fin, + __entry->use_ack, __entry->ack64, + __entry->mpc_map, __entry->frozen, + __entry->reset_transient, __entry->reset_reason, + __entry->csum_reqd) ); DEFINE_EVENT(mptcp_dump_mpext, get_mapping_status, diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 8eb3c0844bfff..7b05f7102321a 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -105,6 +105,7 @@ struct mptcp_info { __u64 mptcpi_rcv_nxt; __u8 mptcpi_local_addr_used; __u8 mptcpi_local_addr_max; + __u8 mptcpi_csum_enabled; }; /* diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 1ec4d36a39f05..6c2639bb9c19f 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -23,6 +23,7 @@ struct mptcp_pernet { u8 mptcp_enabled; unsigned int add_addr_timeout; + u8 checksum_enabled; }; static struct mptcp_pernet *mptcp_get_pernet(struct net *net) @@ -40,10 +41,16 @@ unsigned int mptcp_get_add_addr_timeout(struct net *net) return mptcp_get_pernet(net)->add_addr_timeout; } +int mptcp_is_checksum_enabled(struct net *net) +{ + return mptcp_get_pernet(net)->checksum_enabled; +} + static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) { pernet->mptcp_enabled = 1; pernet->add_addr_timeout = TCP_RTO_MAX; + pernet->checksum_enabled = 0; } #ifdef CONFIG_SYSCTL @@ -65,6 +72,14 @@ static struct ctl_table mptcp_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "checksum_enabled", + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, {} }; @@ -82,6 +97,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) table[0].data = &pernet->mptcp_enabled; table[1].data = &pernet->add_addr_timeout; + table[2].data = &pernet->checksum_enabled; hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table); if (!hdr) diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index eb2dc6dbe212b..e7e60bc1fb96c 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -25,6 +25,7 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH), SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX), + SNMP_MIB_ITEM("DataCsumErr", MPTCP_MIB_DATACSUMERR), SNMP_MIB_ITEM("OFOQueueTail", MPTCP_MIB_OFOQUEUETAIL), SNMP_MIB_ITEM("OFOQueue", MPTCP_MIB_OFOQUEUE), SNMP_MIB_ITEM("OFOMerge", MPTCP_MIB_OFOMERGE), diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index f0da4f060fe1a..92e56c0cfbdd8 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -18,6 +18,7 @@ enum linux_mptcp_mib_field { MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */ MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */ MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */ + MPTCP_MIB_DATACSUMERR, /* The data checksum fail */ MPTCP_MIB_OFOQUEUETAIL, /* Segments inserted into OoO queue tail */ MPTCP_MIB_OFOQUEUE, /* Segments inserted into OoO queue */ MPTCP_MIB_OFOMERGE, /* Segments merged in OoO queue */ diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index f16d9b5ee978e..8f88ddeab6a2e 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -144,6 +144,7 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, info->mptcpi_write_seq = READ_ONCE(msk->write_seq); info->mptcpi_snd_una = READ_ONCE(msk->snd_una); info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); + info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled); unlock_sock_fast(sk, slow); } diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 6b825fb3fa832..1aec01686c1ad 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -44,7 +44,20 @@ static void mptcp_parse_option(const struct sk_buff *skb, else expected_opsize = TCPOLEN_MPTCP_MPC_SYN; } - if (opsize != expected_opsize) + + /* Cfr RFC 8684 Section 3.3.0: + * If a checksum is present but its use had + * not been negotiated in the MP_CAPABLE handshake, the receiver MUST + * close the subflow with a RST, as it is not behaving as negotiated. + * If a checksum is not present when its use has been negotiated, the + * receiver MUST close the subflow with a RST, as it is considered + * broken + * We parse even option with mismatching csum presence, so that + * later in subflow_data_ready we can trigger the reset. + */ + if (opsize != expected_opsize && + (expected_opsize != TCPOLEN_MPTCP_MPC_ACK_DATA || + opsize != TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM)) break; /* try to be gentle vs future versions on the initial syn */ @@ -66,16 +79,9 @@ static void mptcp_parse_option(const struct sk_buff *skb, * host requires the use of checksums, checksums MUST be used. * In other words, the only way for checksums not to be used * is if both hosts in their SYNs set A=0." - * - * Section 3.3.0: - * "If a checksum is not present when its use has been - * negotiated, the receiver MUST close the subflow with a RST as - * it is considered broken." - * - * We don't implement DSS checksum - fall back to TCP. */ if (flags & MPTCP_CAP_CHECKSUM_REQD) - break; + mp_opt->csum_reqd = 1; mp_opt->mp_capable = 1; if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) { @@ -86,7 +92,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->rcvr_key = get_unaligned_be64(ptr); ptr += 8; } - if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA) { + if (opsize >= TCPOLEN_MPTCP_MPC_ACK_DATA) { /* Section 3.1.: * "the data parameters in a MP_CAPABLE are semantically * equivalent to those in a DSS option and can be used @@ -98,9 +104,14 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; } - pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d", + if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM) { + mp_opt->csum = (__force __sum16)get_unaligned_be16(ptr); + mp_opt->csum_reqd = 1; + ptr += 2; + } + pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d csum=%u", version, flags, opsize, mp_opt->sndr_key, - mp_opt->rcvr_key, mp_opt->data_len); + mp_opt->rcvr_key, mp_opt->data_len, mp_opt->csum); break; case MPTCPOPT_MP_JOIN: @@ -171,10 +182,8 @@ static void mptcp_parse_option(const struct sk_buff *skb, expected_opsize += TCPOLEN_MPTCP_DSS_MAP32; } - /* RFC 6824, Section 3.3: - * If a checksum is present, but its use had - * not been negotiated in the MP_CAPABLE handshake, - * the checksum field MUST be ignored. + /* Always parse any csum presence combination, we will enforce + * RFC 8684 Section 3.3.0 checks later in subflow_data_ready */ if (opsize != expected_opsize && opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) @@ -209,9 +218,15 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; - pr_debug("data_seq=%llu subflow_seq=%u data_len=%u", + if (opsize == expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) { + mp_opt->csum_reqd = 1; + mp_opt->csum = (__force __sum16)get_unaligned_be16(ptr); + ptr += 2; + } + + pr_debug("data_seq=%llu subflow_seq=%u data_len=%u csum=%d:%u", mp_opt->data_seq, mp_opt->subflow_seq, - mp_opt->data_len); + mp_opt->data_len, mp_opt->csum_reqd, mp_opt->csum); } break; @@ -323,9 +338,12 @@ static void mptcp_parse_option(const struct sk_buff *skb, } } -void mptcp_get_options(const struct sk_buff *skb, +void mptcp_get_options(const struct sock *sk, + const struct sk_buff *skb, struct mptcp_options_received *mp_opt) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); const struct tcphdr *th = tcp_hdr(skb); const unsigned char *ptr; int length; @@ -341,6 +359,7 @@ void mptcp_get_options(const struct sk_buff *skb, mp_opt->dss = 0; mp_opt->mp_prio = 0; mp_opt->reset = 0; + mp_opt->csum_reqd = READ_ONCE(msk->csum_enabled); length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); @@ -380,6 +399,7 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, subflow->snd_isn = TCP_SKB_CB(skb)->end_seq; if (subflow->request_mptcp) { opts->suboptions = OPTION_MPTCP_MPC_SYN; + opts->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk)); *size = TCPOLEN_MPTCP_MPC_SYN; return true; } else if (subflow->request_join) { @@ -435,8 +455,10 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_ext *mpext; unsigned int data_len; + u8 len; /* When skb is not available, we better over-estimate the emitted * options len. A full DSS option (28 bytes) is longer than @@ -465,16 +487,26 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, opts->suboptions = OPTION_MPTCP_MPC_ACK; opts->sndr_key = subflow->local_key; opts->rcvr_key = subflow->remote_key; + opts->csum_reqd = READ_ONCE(msk->csum_enabled); /* Section 3.1. * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK * packets that start the first subflow of an MPTCP connection, * as well as the first packet that carries data */ - if (data_len > 0) - *size = ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA, 4); - else + if (data_len > 0) { + len = TCPOLEN_MPTCP_MPC_ACK_DATA; + if (opts->csum_reqd) { + /* we need to propagate more info to csum the pseudo hdr */ + opts->ext_copy.data_seq = mpext->data_seq; + opts->ext_copy.subflow_seq = mpext->subflow_seq; + opts->ext_copy.csum = mpext->csum; + len += TCPOLEN_MPTCP_DSS_CHECKSUM; + } + *size = ALIGN(len, 4); + } else { *size = TCPOLEN_MPTCP_MPC_ACK; + } pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d", subflow, subflow->local_key, subflow->remote_key, @@ -535,18 +567,21 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, bool ret = false; u64 ack_seq; + opts->csum_reqd = READ_ONCE(msk->csum_enabled); mpext = skb ? mptcp_get_ext(skb) : NULL; if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) { - unsigned int map_size; + unsigned int map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; - map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; + if (mpext) { + if (opts->csum_reqd) + map_size += TCPOLEN_MPTCP_DSS_CHECKSUM; - remaining -= map_size; - dss_size = map_size; - if (mpext) opts->ext_copy = *mpext; + } + remaining -= map_size; + dss_size = map_size; if (skb && snd_data_fin_enable) mptcp_write_data_fin(subflow, skb, &opts->ext_copy); ret = true; @@ -789,6 +824,7 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, if (subflow_req->mp_capable) { opts->suboptions = OPTION_MPTCP_MPC_SYNACK; opts->sndr_key = subflow_req->local_key; + opts->csum_reqd = subflow_req->csum_reqd; *size = TCPOLEN_MPTCP_MPC_SYNACK; pr_debug("subflow_req=%p, local_key=%llu", subflow_req, subflow_req->local_key); @@ -1007,7 +1043,7 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) return; } - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk, skb, &mp_opt); if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) return; @@ -1099,6 +1135,10 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) } mpext->data_len = mp_opt.data_len; mpext->use_map = 1; + mpext->csum_reqd = mp_opt.csum_reqd; + + if (mpext->csum_reqd) + mpext->csum = mp_opt.csum; } } @@ -1118,25 +1158,50 @@ static void mptcp_set_rwin(const struct tcp_sock *tp) WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); } +static u16 mptcp_make_csum(const struct mptcp_ext *mpext) +{ + struct csum_pseudo_header header; + __wsum csum; + + /* cfr RFC 8684 3.3.1.: + * the data sequence number used in the pseudo-header is + * always the 64-bit value, irrespective of what length is used in the + * DSS option itself. + */ + header.data_seq = cpu_to_be64(mpext->data_seq); + header.subflow_seq = htonl(mpext->subflow_seq); + header.data_len = htons(mpext->data_len); + header.csum = 0; + + csum = csum_partial(&header, sizeof(header), ~csum_unfold(mpext->csum)); + return (__force u16)csum_fold(csum); +} + void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, struct mptcp_out_options *opts) { if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions) { - u8 len; + u8 len, flag = MPTCP_CAP_HMAC_SHA256; - if (OPTION_MPTCP_MPC_SYN & opts->suboptions) + if (OPTION_MPTCP_MPC_SYN & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYN; - else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) + } else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYNACK; - else if (opts->ext_copy.data_len) + } else if (opts->ext_copy.data_len) { len = TCPOLEN_MPTCP_MPC_ACK_DATA; - else + if (opts->csum_reqd) + len += TCPOLEN_MPTCP_DSS_CHECKSUM; + } else { len = TCPOLEN_MPTCP_MPC_ACK; + } + + if (opts->csum_reqd) + flag |= MPTCP_CAP_CHECKSUM_REQD; *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len, MPTCP_SUPPORTED_VERSION, - MPTCP_CAP_HMAC_SHA256); + flag); if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions)) @@ -1152,8 +1217,13 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, if (!opts->ext_copy.data_len) goto mp_capable_done; - put_unaligned_be32(opts->ext_copy.data_len << 16 | - TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + if (opts->csum_reqd) { + put_unaligned_be32(opts->ext_copy.data_len << 16 | + mptcp_make_csum(&opts->ext_copy), ptr); + } else { + put_unaligned_be32(opts->ext_copy.data_len << 16 | + TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + } ptr += 1; } @@ -1305,6 +1375,9 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64; if (mpext->data_fin) flags |= MPTCP_DSS_DATA_FIN; + + if (opts->csum_reqd) + len += TCPOLEN_MPTCP_DSS_CHECKSUM; } *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags); @@ -1324,8 +1397,13 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, ptr += 2; put_unaligned_be32(mpext->subflow_seq, ptr); ptr += 1; - put_unaligned_be32(mpext->data_len << 16 | - TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + if (opts->csum_reqd) { + put_unaligned_be32(mpext->data_len << 16 | + mptcp_make_csum(mpext), ptr); + } else { + put_unaligned_be32(mpext->data_len << 16 | + TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + } } } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 9930950899900..42fc7187beeef 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1308,6 +1308,18 @@ static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk) return __mptcp_alloc_tx_skb(sk, ssk, sk->sk_allocation); } +/* note: this always recompute the csum on the whole skb, even + * if we just appended a single frag. More status info needed + */ +static void mptcp_update_data_checksum(struct sk_buff *skb, int added) +{ + struct mptcp_ext *mpext = mptcp_get_ext(skb); + __wsum csum = ~csum_unfold(mpext->csum); + int offset = skb->len - added; + + mpext->csum = csum_fold(csum_block_add(csum, skb_checksum(skb, offset, added, 0), offset)); +} + static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct mptcp_data_frag *dfrag, struct mptcp_sendmsg_info *info) @@ -1402,10 +1414,14 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, if (zero_window_probe) { mptcp_subflow_ctx(ssk)->rel_write_seq += ret; mpext->frozen = 1; - ret = 0; + if (READ_ONCE(msk->csum_enabled)) + mptcp_update_data_checksum(tail, ret); tcp_push_pending_frames(ssk); + return 0; } out: + if (READ_ONCE(msk->csum_enabled)) + mptcp_update_data_checksum(tail, ret); mptcp_subflow_ctx(ssk)->rel_write_seq += ret; return ret; } @@ -2359,8 +2375,8 @@ static void __mptcp_retrans(struct sock *sk) /* limit retransmission to the bytes already sent on some subflows */ info.sent = 0; - info.limit = dfrag->already_sent; - while (info.sent < dfrag->already_sent) { + info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent; + while (info.sent < info.limit) { if (!mptcp_alloc_tx_skb(sk, ssk)) break; @@ -2372,9 +2388,11 @@ static void __mptcp_retrans(struct sock *sk) copied += ret; info.sent += ret; } - if (copied) + if (copied) { + dfrag->already_sent = max(dfrag->already_sent, info.sent); tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); + } mptcp_set_timeout(sk, ssk); release_sock(ssk); @@ -2453,6 +2471,7 @@ static int __mptcp_init_sock(struct sock *sk) msk->ack_hint = NULL; msk->first = NULL; inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; + WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); mptcp_pm_data_init(msk); @@ -2793,6 +2812,8 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk->token = subflow_req->token; msk->subflow = NULL; WRITE_ONCE(msk->fully_established, false); + if (mp_opt->csum_reqd) + WRITE_ONCE(msk->csum_enabled, true); msk->write_seq = subflow_req->idsn + 1; msk->snd_nxt = msk->write_seq; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 89f6b73783d5c..16e50caf200e5 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -68,6 +68,8 @@ #define TCPOLEN_MPTCP_FASTCLOSE 12 #define TCPOLEN_MPTCP_RST 4 +#define TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM (TCPOLEN_MPTCP_DSS_CHECKSUM + TCPOLEN_MPTCP_MPC_ACK_DATA) + /* MPTCP MP_JOIN flags */ #define MPTCPOPT_BACKUP BIT(0) #define MPTCPOPT_HMAC_LEN 20 @@ -124,6 +126,7 @@ struct mptcp_options_received { u64 data_seq; u32 subflow_seq; u16 data_len; + __sum16 csum; u16 mp_capable : 1, mp_join : 1, fastclose : 1, @@ -133,6 +136,7 @@ struct mptcp_options_received { rm_addr : 1, mp_prio : 1, echo : 1, + csum_reqd : 1, backup : 1; u32 token; u32 nonce; @@ -234,6 +238,7 @@ struct mptcp_sock { bool snd_data_fin_enable; bool rcv_fastclose; bool use_64bit_ack; /* Set when we received a 64-bit DSN */ + bool csum_enabled; spinlock_t join_list_lock; struct sock *ack_hint; struct work_struct work; @@ -335,11 +340,19 @@ static inline struct mptcp_data_frag *mptcp_rtx_head(const struct sock *sk) return list_first_entry_or_null(&msk->rtx_queue, struct mptcp_data_frag, list); } +struct csum_pseudo_header { + __be64 data_seq; + __be32 subflow_seq; + __be16 data_len; + __sum16 csum; +}; + struct mptcp_subflow_request_sock { struct tcp_request_sock sk; u16 mp_capable : 1, mp_join : 1, - backup : 1; + backup : 1, + csum_reqd : 1; u8 local_id; u8 remote_id; u64 local_key; @@ -387,6 +400,8 @@ struct mptcp_subflow_context { u32 map_subflow_seq; u32 ssn_offset; u32 map_data_len; + __wsum map_data_csum; + u32 map_csum_len; u32 request_mptcp : 1, /* send MP_CAPABLE */ request_join : 1, /* send MP_JOIN */ request_bkup : 1, @@ -396,6 +411,8 @@ struct mptcp_subflow_context { pm_notified : 1, /* PM hook called for established status */ conn_finished : 1, map_valid : 1, + map_csum_reqd : 1, + map_data_fin : 1, mpc_map : 1, backup : 1, send_mp_prio : 1, @@ -525,6 +542,7 @@ static inline void mptcp_subflow_delegated_done(struct mptcp_subflow_context *su int mptcp_is_enabled(struct net *net); unsigned int mptcp_get_add_addr_timeout(struct net *net); +int mptcp_is_checksum_enabled(struct net *net); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, struct mptcp_options_received *mp_opt); bool mptcp_subflow_data_available(struct sock *sk); @@ -576,7 +594,8 @@ int __init mptcp_proto_v6_init(void); struct sock *mptcp_sk_clone(const struct sock *sk, const struct mptcp_options_received *mp_opt, struct request_sock *req); -void mptcp_get_options(const struct sk_buff *skb, +void mptcp_get_options(const struct sock *sk, + const struct sk_buff *skb, struct mptcp_options_received *mp_opt); void mptcp_finish_connect(struct sock *sk); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 33956337c46bc..6b1cd4257edf7 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -108,6 +108,7 @@ static void subflow_init_req(struct request_sock *req, const struct sock *sk_lis subflow_req->mp_capable = 0; subflow_req->mp_join = 0; + subflow_req->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk_listener)); subflow_req->msk = NULL; mptcp_token_init_request(req); } @@ -150,7 +151,7 @@ static int subflow_check_req(struct request_sock *req, return -EINVAL; #endif - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk_listener, skb, &mp_opt); if (mp_opt.mp_capable) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); @@ -247,7 +248,7 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req, int err; subflow_init_req(req, sk_listener); - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk_listener, skb, &mp_opt); if (mp_opt.mp_capable && mp_opt.mp_join) return -EINVAL; @@ -394,7 +395,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->ssn_offset = TCP_SKB_CB(skb)->seq; pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset); - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk, skb, &mp_opt); if (subflow->request_mptcp) { if (!mp_opt.mp_capable) { MPTCP_INC_STATS(sock_net(sk), @@ -404,6 +405,8 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) goto fallback; } + if (mp_opt.csum_reqd) + WRITE_ONCE(mptcp_sk(parent)->csum_enabled, true); subflow->mp_capable = 1; subflow->can_ack = 1; subflow->remote_key = mp_opt.sndr_key; @@ -638,7 +641,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, * reordered MPC will cause fallback, but we don't have other * options. */ - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk, skb, &mp_opt); if (!mp_opt.mp_capable) { fallback = true; goto create_child; @@ -648,7 +651,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, if (!new_msk) fallback = true; } else if (subflow_req->mp_join) { - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk, skb, &mp_opt); if (!mp_opt.mp_join || !subflow_hmac_valid(req, &mp_opt) || !mptcp_can_accept_new_subflow(subflow_req->msk)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); @@ -824,10 +827,92 @@ static bool validate_mapping(struct sock *ssk, struct sk_buff *skb) return true; } +static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff *skb, + bool csum_reqd) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct csum_pseudo_header header; + u32 offset, seq, delta; + __wsum csum; + int len; + + if (!csum_reqd) + return MAPPING_OK; + + /* mapping already validated on previous traversal */ + if (subflow->map_csum_len == subflow->map_data_len) + return MAPPING_OK; + + /* traverse the receive queue, ensuring it contains a full + * DSS mapping and accumulating the related csum. + * Preserve the accoumlate csum across multiple calls, to compute + * the csum only once + */ + delta = subflow->map_data_len - subflow->map_csum_len; + for (;;) { + seq = tcp_sk(ssk)->copied_seq + subflow->map_csum_len; + offset = seq - TCP_SKB_CB(skb)->seq; + + /* if the current skb has not been accounted yet, csum its contents + * up to the amount covered by the current DSS + */ + if (offset < skb->len) { + __wsum csum; + + len = min(skb->len - offset, delta); + csum = skb_checksum(skb, offset, len, 0); + subflow->map_data_csum = csum_block_add(subflow->map_data_csum, csum, + subflow->map_csum_len); + + delta -= len; + subflow->map_csum_len += len; + } + if (delta == 0) + break; + + if (skb_queue_is_last(&ssk->sk_receive_queue, skb)) { + /* if this subflow is closed, the partial mapping + * will be never completed; flush the pending skbs, so + * that subflow_sched_work_if_closed() can kick in + */ + if (unlikely(ssk->sk_state == TCP_CLOSE)) + while ((skb = skb_peek(&ssk->sk_receive_queue))) + sk_eat_skb(ssk, skb); + + /* not enough data to validate the csum */ + return MAPPING_EMPTY; + } + + /* the DSS mapping for next skbs will be validated later, + * when a get_mapping_status call will process such skb + */ + skb = skb->next; + } + + /* note that 'map_data_len' accounts only for the carried data, does + * not include the eventual seq increment due to the data fin, + * while the pseudo header requires the original DSS data len, + * including that + */ + header.data_seq = cpu_to_be64(subflow->map_seq); + header.subflow_seq = htonl(subflow->map_subflow_seq); + header.data_len = htons(subflow->map_data_len + subflow->map_data_fin); + header.csum = 0; + + csum = csum_partial(&header, sizeof(header), subflow->map_data_csum); + if (unlikely(csum_fold(csum))) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR); + return subflow->mp_join ? MAPPING_INVALID : MAPPING_DUMMY; + } + + return MAPPING_OK; +} + static enum mapping_status get_mapping_status(struct sock *ssk, struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + bool csum_reqd = READ_ONCE(msk->csum_enabled); struct mptcp_ext *mpext; struct sk_buff *skb; u16 data_len; @@ -920,9 +1005,10 @@ static enum mapping_status get_mapping_status(struct sock *ssk, /* Allow replacing only with an identical map */ if (subflow->map_seq == map_seq && subflow->map_subflow_seq == mpext->subflow_seq && - subflow->map_data_len == data_len) { + subflow->map_data_len == data_len && + subflow->map_csum_reqd == mpext->csum_reqd) { skb_ext_del(skb, SKB_EXT_MPTCP); - return MAPPING_OK; + goto validate_csum; } /* If this skb data are fully covered by the current mapping, @@ -934,17 +1020,27 @@ static enum mapping_status get_mapping_status(struct sock *ssk, } /* will validate the next map after consuming the current one */ - return MAPPING_OK; + goto validate_csum; } subflow->map_seq = map_seq; subflow->map_subflow_seq = mpext->subflow_seq; subflow->map_data_len = data_len; subflow->map_valid = 1; + subflow->map_data_fin = mpext->data_fin; subflow->mpc_map = mpext->mpc_map; - pr_debug("new map seq=%llu subflow_seq=%u data_len=%u", + subflow->map_csum_reqd = mpext->csum_reqd; + subflow->map_csum_len = 0; + subflow->map_data_csum = csum_unfold(mpext->csum); + + /* Cfr RFC 8684 Section 3.3.0 */ + if (unlikely(subflow->map_csum_reqd != csum_reqd)) + return MAPPING_INVALID; + + pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u", subflow->map_seq, subflow->map_subflow_seq, - subflow->map_data_len); + subflow->map_data_len, subflow->map_csum_reqd, + subflow->map_data_csum); validate_seq: /* we revalidate valid mapping on new skb, because we must ensure @@ -954,7 +1050,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk, return MAPPING_INVALID; skb_ext_del(skb, SKB_EXT_MPTCP); - return MAPPING_OK; + +validate_csum: + return validate_data_csum(ssk, skb, csum_reqd); } static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 9ca5f1ba461ec..69351c3eb68c5 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -3,7 +3,7 @@ time_start=$(date +%s) -optstring="S:R:d:e:l:r:h4cm:f:t" +optstring="S:R:d:e:l:r:h4cm:f:tC" ret=0 sin="" sout="" @@ -22,6 +22,7 @@ sndbuf=0 rcvbuf=0 options_log=true do_tcp=0 +checksum=false filesize=0 if [ $tc_loss -eq 100 ];then @@ -47,6 +48,7 @@ usage() { echo -e "\t-R: set rcvbuf value (default: use kernel default)" echo -e "\t-m: test mode (poll, sendfile; default: poll)" echo -e "\t-t: also run tests with TCP (use twice to non-fallback tcp)" + echo -e "\t-C: enable the MPTCP data checksum" } while getopts "$optstring" option;do @@ -104,6 +106,9 @@ while getopts "$optstring" option;do "t") do_tcp=$((do_tcp+1)) ;; + "C") + checksum=true + ;; "?") usage $0 exit 1 @@ -200,6 +205,12 @@ ip -net "$ns4" route add default via dead:beef:3::2 # use TCP syn cookies, even if no flooding was detected. ip netns exec "$ns2" sysctl -q net.ipv4.tcp_syncookies=2 +if $checksum; then + for i in "$ns1" "$ns2" "$ns3" "$ns4";do + ip netns exec $i sysctl -q net.mptcp.checksum_enabled=1 + done +fi + set_ethtool_flags() { local ns="$1" local dev="$2" diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index fd99485cf2a4a..523c7797f30ac 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -12,6 +12,7 @@ timeout_poll=30 timeout_test=$((timeout_poll * 2 + 1)) mptcp_connect="" capture=0 +checksum=0 do_all_tests=1 TEST_COUNT=0 @@ -49,6 +50,9 @@ init() ip netns exec $netns sysctl -q net.mptcp.enabled=1 ip netns exec $netns sysctl -q net.ipv4.conf.all.rp_filter=0 ip netns exec $netns sysctl -q net.ipv4.conf.default.rp_filter=0 + if [ $checksum -eq 1 ]; then + ip netns exec $netns sysctl -q net.mptcp.checksum_enabled=1 + fi done # ns1 ns2 @@ -124,6 +128,17 @@ reset_with_add_addr_timeout() -j DROP } +reset_with_checksum() +{ + local ns1_enable=$1 + local ns2_enable=$2 + + reset + + ip netns exec $ns1 sysctl -q net.mptcp.checksum_enabled=$ns1_enable + ip netns exec $ns2 sysctl -q net.mptcp.checksum_enabled=$ns2_enable +} + ip -Version > /dev/null 2>&1 if [ $? -ne 0 ];then echo "SKIP: Could not run test without ip tool" @@ -476,6 +491,45 @@ run_tests() fi } +chk_csum_nr() +{ + local msg=${1:-""} + local count + local dump_stats + + if [ ! -z "$msg" ]; then + printf "%02u" "$TEST_COUNT" + else + echo -n " " + fi + printf " %-36s %s" "$msg" "sum" + count=`ip netns exec $ns1 nstat -as | grep MPTcpExtDataCsumErr | awk '{print $2}'` + [ -z "$count" ] && count=0 + if [ "$count" != 0 ]; then + echo "[fail] got $count data checksum error[s] expected 0" + ret=1 + dump_stats=1 + else + echo -n "[ ok ]" + fi + echo -n " - csum " + count=`ip netns exec $ns2 nstat -as | grep MPTcpExtDataCsumErr | awk '{print $2}'` + [ -z "$count" ] && count=0 + if [ "$count" != 0 ]; then + echo "[fail] got $count data checksum error[s] expected 0" + ret=1 + dump_stats=1 + else + echo "[ ok ]" + fi + if [ "${dump_stats}" = 1 ]; then + echo Server ns stats + ip netns exec $ns1 nstat -as | grep MPTcp + echo Client ns stats + ip netns exec $ns2 nstat -as | grep MPTcp + fi +} + chk_join_nr() { local msg="$1" @@ -523,6 +577,9 @@ chk_join_nr() echo Client ns stats ip netns exec $ns2 nstat -as | grep MPTcp fi + if [ $checksum -eq 1 ]; then + chk_csum_nr + fi } chk_add_nr() @@ -1374,6 +1431,37 @@ syncookies_tests() chk_add_nr 1 1 } +checksum_tests() +{ + # checksum test 0 0 + reset_with_checksum 0 0 + ip netns exec $ns1 ./pm_nl_ctl limits 0 1 + ip netns exec $ns2 ./pm_nl_ctl limits 0 1 + run_tests $ns1 $ns2 10.0.1.1 + chk_csum_nr "checksum test 0 0" + + # checksum test 1 1 + reset_with_checksum 1 1 + ip netns exec $ns1 ./pm_nl_ctl limits 0 1 + ip netns exec $ns2 ./pm_nl_ctl limits 0 1 + run_tests $ns1 $ns2 10.0.1.1 + chk_csum_nr "checksum test 1 1" + + # checksum test 0 1 + reset_with_checksum 0 1 + ip netns exec $ns1 ./pm_nl_ctl limits 0 1 + ip netns exec $ns2 ./pm_nl_ctl limits 0 1 + run_tests $ns1 $ns2 10.0.1.1 + chk_csum_nr "checksum test 0 1" + + # checksum test 1 0 + reset_with_checksum 1 0 + ip netns exec $ns1 ./pm_nl_ctl limits 0 1 + ip netns exec $ns2 ./pm_nl_ctl limits 0 1 + run_tests $ns1 $ns2 10.0.1.1 + chk_csum_nr "checksum test 1 0" +} + all_tests() { subflows_tests @@ -1387,6 +1475,7 @@ all_tests() backup_tests add_addr_ports_tests syncookies_tests + checksum_tests } usage() @@ -1403,7 +1492,9 @@ usage() echo " -b backup_tests" echo " -p add_addr_ports_tests" echo " -k syncookies_tests" + echo " -S checksum_tests" echo " -c capture pcap files" + echo " -C enable data checksum" echo " -h help" } @@ -1418,13 +1509,16 @@ make_file "$sin" "server" 1 trap cleanup EXIT for arg in "$@"; do - # check for "capture" arg before launching tests + # check for "capture/checksum" args before launching tests if [[ "${arg}" =~ ^"-"[0-9a-zA-Z]*"c"[0-9a-zA-Z]*$ ]]; then capture=1 fi + if [[ "${arg}" =~ ^"-"[0-9a-zA-Z]*"C"[0-9a-zA-Z]*$ ]]; then + checksum=1 + fi - # exception for the capture option, the rest means: a part of the tests - if [ "${arg}" != "-c" ]; then + # exception for the capture/checksum options, the rest means: a part of the tests + if [ "${arg}" != "-c" ] && [ "${arg}" != "-C" ]; then do_all_tests=0 fi done @@ -1434,7 +1528,7 @@ if [ $do_all_tests -eq 1 ]; then exit $ret fi -while getopts 'fsltra64bpkch' opt; do +while getopts 'fsltra64bpkchCS' opt; do case $opt in f) subflows_tests @@ -1469,8 +1563,13 @@ while getopts 'fsltra64bpkch' opt; do k) syncookies_tests ;; + S) + checksum_tests + ;; c) ;; + C) + ;; h | *) usage ;;