From 8ce568ed06ce4ca38c0b67d8de9b8d75b731f90a Mon Sep 17 00:00:00 2001
From: Paolo Abeni
Date: Mon, 21 Jun 2021 15:54:33 -0700
Subject: [PATCH 1/6] mptcp: drop tx skb cache

The mentioned cache was introduced to reduce the number of skb
allocations in atomic context, but the required complexity is
excessive. This change removes the cache.

Signed-off-by: Paolo Abeni
Signed-off-by: Mat Martineau
Signed-off-by: David S. Miller
---
 net/mptcp/protocol.c | 89 ++------------------------------------------
 net/mptcp/protocol.h |  2 -
 2 files changed, 4 insertions(+), 87 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index b5f2f504b85bb..77c90d6f04df5 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -902,22 +902,14 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
 		df->data_seq + df->data_len == msk->write_seq;
 }
 
-static int mptcp_wmem_with_overhead(struct sock *sk, int size)
+static int mptcp_wmem_with_overhead(int size)
 {
-	struct mptcp_sock *msk = mptcp_sk(sk);
-	int ret, skbs;
-
-	ret = size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT);
-	skbs = (msk->tx_pending_data + size) / msk->size_goal_cache;
-	if (skbs < msk->skb_tx_cache.qlen)
-		return ret;
-
-	return ret + (skbs - msk->skb_tx_cache.qlen) * SKB_TRUESIZE(MAX_TCP_HEADER);
+	return size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT);
 }
 
 static void __mptcp_wmem_reserve(struct sock *sk, int size)
 {
-	int amount = mptcp_wmem_with_overhead(sk, size);
+	int amount = mptcp_wmem_with_overhead(size);
 	struct mptcp_sock *msk = mptcp_sk(sk);
 
 	WARN_ON_ONCE(msk->wmem_reserved);
@@ -1212,49 +1204,8 @@ static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp)
 	return NULL;
 }
 
-static bool mptcp_tx_cache_refill(struct sock *sk, int size,
-				  struct sk_buff_head *skbs, int *total_ts)
-{
-	struct mptcp_sock *msk = mptcp_sk(sk);
-	struct sk_buff *skb;
-	int space_needed;
-
-	if (unlikely(tcp_under_memory_pressure(sk))) {
-		mptcp_mem_reclaim_partial(sk);
-
-		/* under pressure pre-allocate at most a single skb */
-		if (msk->skb_tx_cache.qlen)
-			return true;
-		space_needed = msk->size_goal_cache;
-	} else {
-		space_needed = msk->tx_pending_data + size -
-			       msk->skb_tx_cache.qlen * msk->size_goal_cache;
-	}
-
-	while (space_needed > 0) {
-		skb = __mptcp_do_alloc_tx_skb(sk, sk->sk_allocation);
-		if (unlikely(!skb)) {
-			/* under memory pressure, try to pass the caller a
-			 * single skb to allow forward progress
-			 */
-			while (skbs->qlen > 1) {
-				skb = __skb_dequeue_tail(skbs);
-				*total_ts -= skb->truesize;
-				__kfree_skb(skb);
-			}
-			return skbs->qlen > 0;
-		}
-
-		*total_ts += skb->truesize;
-		__skb_queue_tail(skbs, skb);
-		space_needed -= msk->size_goal_cache;
-	}
-	return true;
-}
-
 static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
 {
-	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct sk_buff *skb;
 
 	if (ssk->sk_tx_skb_cache) {
@@ -1265,22 +1216,6 @@ static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
 		return true;
 	}
 
-	skb = skb_peek(&msk->skb_tx_cache);
-	if (skb) {
-		if (likely(sk_wmem_schedule(ssk, skb->truesize))) {
-			skb = __skb_dequeue(&msk->skb_tx_cache);
-			if (WARN_ON_ONCE(!skb))
-				return false;
-
-			mptcp_wmem_uncharge(sk, skb->truesize);
-			ssk->sk_tx_skb_cache = skb;
-			return true;
-		}
-
-		/* over memory limit, no point to try to allocate a new skb */
-		return false;
-	}
-
 	skb = __mptcp_do_alloc_tx_skb(sk, gfp);
 	if (!skb)
 		return false;
@@ -1296,7 +1231,6 @@ static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
 static bool mptcp_must_reclaim_memory(struct sock *sk, struct sock *ssk)
 {
 	return !ssk->sk_tx_skb_cache &&
-	       !skb_peek(&mptcp_sk(sk)->skb_tx_cache) &&
 	       tcp_under_memory_pressure(sk);
 }
 
@@ -1339,7 +1273,6 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 	/* compute send limit */
 	info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags);
 	avail_size = info->size_goal;
-	msk->size_goal_cache = info->size_goal;
 	skb = tcp_write_queue_tail(ssk);
 	if (skb) {
 		/* Limit the write to the size available in the
@@ -1688,7 +1621,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	while (msg_data_left(msg)) {
 		int total_ts, frag_truesize = 0;
 		struct mptcp_data_frag *dfrag;
-		struct sk_buff_head skbs;
 		bool dfrag_collapsed;
 		size_t psize, offset;
 
@@ -1721,16 +1653,10 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		psize = pfrag->size - offset;
 		psize = min_t(size_t, psize, msg_data_left(msg));
 		total_ts = psize + frag_truesize;
-		__skb_queue_head_init(&skbs);
-		if (!mptcp_tx_cache_refill(sk, psize, &skbs, &total_ts))
-			goto wait_for_memory;
 
-		if (!mptcp_wmem_alloc(sk, total_ts)) {
-			__skb_queue_purge(&skbs);
+		if (!mptcp_wmem_alloc(sk, total_ts))
 			goto wait_for_memory;
-		}
 
-		skb_queue_splice_tail(&skbs, &msk->skb_tx_cache);
 		if (copy_page_from_iter(dfrag->page, offset, psize,
 					&msg->msg_iter) != psize) {
 			mptcp_wmem_uncharge(sk, psize + frag_truesize);
@@ -2462,13 +2388,11 @@ static int __mptcp_init_sock(struct sock *sk)
 	INIT_LIST_HEAD(&msk->rtx_queue);
 	INIT_WORK(&msk->work, mptcp_worker);
 	__skb_queue_head_init(&msk->receive_queue);
-	__skb_queue_head_init(&msk->skb_tx_cache);
 	msk->out_of_order_queue = RB_ROOT;
 	msk->first_pending = NULL;
 	msk->wmem_reserved = 0;
 	msk->rmem_released = 0;
 	msk->tx_pending_data = 0;
-	msk->size_goal_cache = TCP_BASE_MSS;
 	msk->ack_hint = NULL;
 	msk->first = NULL;
 
@@ -2525,15 +2449,10 @@ static void __mptcp_clear_xmit(struct sock *sk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct mptcp_data_frag *dtmp, *dfrag;
-	struct sk_buff *skb;
 
 	WRITE_ONCE(msk->first_pending, NULL);
 	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
 		dfrag_clear(sk, dfrag);
-	while ((skb = __skb_dequeue(&msk->skb_tx_cache)) != NULL) {
-		sk->sk_forward_alloc += skb->truesize;
-		kfree_skb(skb);
-	}
 }
 
 static void mptcp_cancel_work(struct sock *sk)
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 160d716ebc2b7..160c2ab09f194 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -245,9 +245,7 @@ struct mptcp_sock {
 	struct sk_buff *ooo_last_skb;
 	struct rb_root out_of_order_queue;
 	struct sk_buff_head receive_queue;
-	struct sk_buff_head skb_tx_cache;	/* this is wmem accounted */
 	int tx_pending_data;
-	int size_goal_cache;
 	struct list_head conn_list;
 	struct list_head rtx_queue;
 	struct mptcp_data_frag *first_pending;
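A note on the simplified accounting above: with the tx cache gone,
mptcp_wmem_with_overhead() is pure arithmetic — the payload size plus one
struct mptcp_data_frag worth of metadata per page of payload. A minimal
user-space sketch of that computation (the page shift and the struct size
below are illustrative stand-ins, not the kernel's actual values):

    /* user-space sketch of the simplified overhead computation; the
     * two constants are assumptions standing in for kernel values
     */
    #include <stdio.h>

    #define PAGE_SHIFT 12   /* assume 4 KiB pages */
    #define FRAG_SIZE  56   /* stand-in for sizeof(struct mptcp_data_frag) */

    static int wmem_with_overhead(int size)
    {
            /* payload plus per-page metadata, as in the patched helper */
            return size + ((FRAG_SIZE * size) >> PAGE_SHIFT);
    }

    int main(void)
    {
            printf("%d\n", wmem_with_overhead(64 * 1024)); /* prints 66432 */
            return 0;
    }

With these stand-in values a 64 KiB send reserves 896 extra bytes for
fragment metadata. The deleted code additionally had to estimate how many
cached skbs a send would consume, which is exactly the complexity this
patch removes.
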
From 75e908c33615999abe1f3a8429d25dea30d28e4e Mon Sep 17 00:00:00 2001
From: Paolo Abeni
Date: Mon, 21 Jun 2021 15:54:34 -0700
Subject: [PATCH 2/6] mptcp: use fast lock for subflows when possible

There are a bunch of call sites where the ssk socket lock is acquired
using the full-blown version even though they are eligible for the fast
variant. Let's move to the latter.

Signed-off-by: Paolo Abeni
Signed-off-by: Mat Martineau
Signed-off-by: David S. Miller
---
 net/mptcp/pm_netlink.c | 10 ++++++----
 net/mptcp/protocol.c   | 15 +++++++++------
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 09722598994d2..d4732a4f223e3 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -540,6 +540,7 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk)
 	subflow = list_first_entry_or_null(&msk->conn_list, typeof(*subflow), node);
 	if (subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+		bool slow;
 
 		spin_unlock_bh(&msk->pm.lock);
 		pr_debug("send ack for %s%s%s",
@@ -547,9 +548,9 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk)
 			 mptcp_pm_should_add_signal_ipv6(msk) ? " [ipv6]" : "",
 			 mptcp_pm_should_add_signal_port(msk) ? " [port]" : "");
 
-		lock_sock(ssk);
+		slow = lock_sock_fast(ssk);
 		tcp_send_ack(ssk);
-		release_sock(ssk);
+		unlock_sock_fast(ssk, slow);
 		spin_lock_bh(&msk->pm.lock);
 	}
 }
@@ -566,6 +567,7 @@ int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 		struct sock *sk = (struct sock *)msk;
 		struct mptcp_addr_info local;
+		bool slow;
 
 		local_address((struct sock_common *)ssk, &local);
 		if (!addresses_equal(&local, addr, addr->port))
@@ -578,9 +580,9 @@ int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
 
 		spin_unlock_bh(&msk->pm.lock);
 		pr_debug("send ack for mp_prio");
-		lock_sock(ssk);
+		slow = lock_sock_fast(ssk);
 		tcp_send_ack(ssk);
-		release_sock(ssk);
+		unlock_sock_fast(ssk, slow);
 		spin_lock_bh(&msk->pm.lock);
 
 		return 0;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 77c90d6f04df5..c47ce074737dc 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -433,23 +433,25 @@ static void mptcp_send_ack(struct mptcp_sock *msk)
 
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+		bool slow;
 
-		lock_sock(ssk);
+		slow = lock_sock_fast(ssk);
 		if (tcp_can_send_ack(ssk))
 			tcp_send_ack(ssk);
-		release_sock(ssk);
+		unlock_sock_fast(ssk, slow);
 	}
 }
 
 static bool mptcp_subflow_cleanup_rbuf(struct sock *ssk)
 {
+	bool slow;
 	int ret;
 
-	lock_sock(ssk);
+	slow = lock_sock_fast(ssk);
 	ret = tcp_can_send_ack(ssk);
 	if (ret)
 		tcp_cleanup_rbuf(ssk, 1);
-	release_sock(ssk);
+	unlock_sock_fast(ssk, slow);
 	return ret;
 }
 
@@ -2252,13 +2254,14 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
 
 	list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
 		struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
+		bool slow;
 
-		lock_sock(tcp_sk);
+		slow = lock_sock_fast(tcp_sk);
 		if (tcp_sk->sk_state != TCP_CLOSE) {
 			tcp_send_active_reset(tcp_sk, GFP_ATOMIC);
 			tcp_set_state(tcp_sk, TCP_CLOSE);
 		}
-		release_sock(tcp_sk);
+		unlock_sock_fast(tcp_sk, slow);
 	}
 
 	inet_sk_state_store(sk, TCP_CLOSE);
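Every conversion above follows the same pattern; a sketch of it (kernel
context assumed — this is not a standalone program): lock_sock_fast()
takes only the socket spinlock and returns false when no user-space
caller owns the socket, and falls back to the full, sleepable
lock_sock() — returning true — when one does. That flag must be handed
back to unlock_sock_fast() so it releases the matching variant.

    /* sketch of the fast-lock pattern applied at each call site */
    static void send_ack_fast(struct sock *ssk)
    {
            bool slow = lock_sock_fast(ssk); /* true only on the slow path */

            if (tcp_can_send_ack(ssk))
                    tcp_send_ack(ssk);       /* short, non-sleeping work */
            unlock_sock_fast(ssk, slow);
    }

Because the fast path keeps the spinlock held, this is only valid for
short critical sections that cannot sleep — which holds for
tcp_send_ack() and tcp_cleanup_rbuf() here.
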
From 3c90e377a1e87a35a7f868ed1c53ea4d62379a8d Mon Sep 17 00:00:00 2001
From: Paolo Abeni
Date: Mon, 21 Jun 2021 15:54:35 -0700
Subject: [PATCH 3/6] mptcp: don't clear MPTCP_DATA_READY in sk_wait_event()

If we don't flush the receive queue entirely, we need to set the
MPTCP_DATA_READY bit again later. We can simply avoid clearing it
instead.

Signed-off-by: Paolo Abeni
Signed-off-by: Mat Martineau
Signed-off-by: David S. Miller
---
 net/mptcp/protocol.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index c47ce074737dc..3e088e9d20fd3 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1715,7 +1715,7 @@ static void mptcp_wait_data(struct sock *sk, long *timeo)
 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 
 	sk_wait_event(sk, timeo,
-		      test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait);
+		      test_bit(MPTCP_DATA_READY, &msk->flags), &wait);
 
 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 	remove_wait_queue(sk_sleep(sk), &wait);
@@ -2039,10 +2039,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		 */
 		if (unlikely(__mptcp_move_skbs(msk)))
 			set_bit(MPTCP_DATA_READY, &msk->flags);
-	} else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) {
-		/* data to read but mptcp_wait_data() cleared DATA_READY */
-		set_bit(MPTCP_DATA_READY, &msk->flags);
 	}
+
 out_err:
 	if (cmsg_flags && copied >= 0) {
 		if (cmsg_flags & MPTCP_CMSG_TS)
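The entire behavioural change is test_and_clear_bit() versus test_bit():
the former consumes the readiness flag as it reads it, the latter only
observes it, so the flag now stays set until the receive queue is
genuinely drained. A runnable user-space analogue (names illustrative;
C11 atomics stand in for the kernel's bitops):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define DATA_READY 1UL

    static atomic_ulong flags = DATA_READY;

    static bool test_and_clear(unsigned long bit)
    {
            return atomic_fetch_and(&flags, ~bit) & bit; /* clears as it reads */
    }

    static bool test(unsigned long bit)
    {
            return atomic_load(&flags) & bit;            /* read-only */
    }

    int main(void)
    {
            /* old behaviour: returns 1 but leaves the flag at 0 */
            printf("%d %lu\n", test_and_clear(DATA_READY), atomic_load(&flags));
            atomic_fetch_or(&flags, DATA_READY);
            /* new behaviour: returns 1 and the flag stays set */
            printf("%d %lu\n", test(DATA_READY), atomic_load(&flags));
            return 0;
    }
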
From 8cfc47fc2eb0fd2d6eaa9e4b23b4bf6ef1bfaeef Mon Sep 17 00:00:00 2001
From: Paolo Abeni
Date: Mon, 21 Jun 2021 15:54:36 -0700
Subject: [PATCH 4/6] mptcp: drop redundant test in move_skbs_to_msk()

Currently we check the msk state to avoid enqueuing new skbs at msk
shutdown time. Such a test is racy - as we can't acquire the msk socket
lock - and useless, as the caller has already checked the subflow
field 'disposable', covering the same scenario in a race-free manner -
read and updated under the ssk socket lock.

Signed-off-by: Paolo Abeni
Signed-off-by: Mat Martineau
Signed-off-by: David S. Miller
---
 net/mptcp/protocol.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 3e088e9d20fd3..cf75be02eb001 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -686,9 +686,6 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
 	struct sock *sk = (struct sock *)msk;
 	unsigned int moved = 0;
 
-	if (inet_sk_state_load(sk) == TCP_CLOSE)
-		return false;
-
 	__mptcp_move_skbs_from_subflow(msk, ssk, &moved);
 	__mptcp_ofo_queue(msk);
 	if (unlikely(ssk->sk_err)) {
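For reference, the deleted check annotated with the race it could not
close (kernel context assumed — an excerpt, not a standalone program):

    if (inet_sk_state_load(sk) == TCP_CLOSE) /* lockless read of sk_state */
            return false;
    /* nothing prevents the msk from moving to TCP_CLOSE right here, so
     * the check above was advisory at best; the subflow's 'disposable'
     * flag, read and written under the ssk lock the caller already
     * holds, is the race-free guard
     */
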
From 06285da96a1cdbad265a212f6729e19a515127a2 Mon Sep 17 00:00:00 2001
From: Paolo Abeni
Date: Mon, 21 Jun 2021 15:54:37 -0700
Subject: [PATCH 5/6] mptcp: add MIB counter for invalid mapping

Account these exceptional events for better introspection.

Signed-off-by: Paolo Abeni
Signed-off-by: Mat Martineau
Signed-off-by: David S. Miller
---
 net/mptcp/mib.c     | 1 +
 net/mptcp/mib.h     | 1 +
 net/mptcp/subflow.c | 4 +++-
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
index e7e60bc1fb96c..52ea2517e8560 100644
--- a/net/mptcp/mib.c
+++ b/net/mptcp/mib.c
@@ -25,6 +25,7 @@ static const struct snmp_mib mptcp_snmp_list[] = {
 	SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC),
 	SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH),
 	SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX),
+	SNMP_MIB_ITEM("DSSNoMatchTCP", MPTCP_MIB_DSSTCPMISMATCH),
 	SNMP_MIB_ITEM("DataCsumErr", MPTCP_MIB_DATACSUMERR),
 	SNMP_MIB_ITEM("OFOQueueTail", MPTCP_MIB_OFOQUEUETAIL),
 	SNMP_MIB_ITEM("OFOQueue", MPTCP_MIB_OFOQUEUE),
diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
index 92e56c0cfbdd8..193466c9b5494 100644
--- a/net/mptcp/mib.h
+++ b/net/mptcp/mib.h
@@ -18,6 +18,7 @@ enum linux_mptcp_mib_field {
 	MPTCP_MIB_JOINACKMAC,		/* HMAC was wrong on ACK + MP_JOIN */
 	MPTCP_MIB_DSSNOMATCH,		/* Received a new mapping that did not match the previous one */
 	MPTCP_MIB_INFINITEMAPRX,	/* Received an infinite mapping */
+	MPTCP_MIB_DSSTCPMISMATCH,	/* DSS-mapping did not map with TCP's sequence numbers */
 	MPTCP_MIB_DATACSUMERR,		/* The data checksum fail */
 	MPTCP_MIB_OFOQUEUETAIL,		/* Segments inserted into OoO queue tail */
 	MPTCP_MIB_OFOQUEUE,		/* Segments inserted into OoO queue */
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 8976ff586b875..585951e7e52fd 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -1046,8 +1046,10 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
 	/* we revalidate valid mapping on new skb, because we must ensure
 	 * the current skb is completely covered by the available mapping
 	 */
-	if (!validate_mapping(ssk, skb))
+	if (!validate_mapping(ssk, skb)) {
+		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSTCPMISMATCH);
 		return MAPPING_INVALID;
+	}
 
 	skb_ext_del(skb, SKB_EXT_MPTCP);
 	return MAPPING_OK;
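Once wired up, the counter is visible from user space next to the other
MPTCP MIB entries on the MPTcpExt rows of /proc/net/netstat (nstat
reports it as MPTcpExtDSSNoMatchTCP). A small self-contained reader to
eyeball those rows — assuming only a kernel that carries this patch:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char line[4096];
            FILE *f = fopen("/proc/net/netstat", "r");

            if (!f) {
                    perror("fopen");
                    return 1;
            }
            /* MPTcpExt appears twice: a row of names, a row of values */
            while (fgets(line, sizeof(line), f))
                    if (!strncmp(line, "MPTcpExt:", 9))
                            fputs(line, stdout);
            fclose(f);
            return 0;
    }

A non-zero DSSNoMatchTCP value means a peer sent DSS mappings that
disagreed with the TCP-level sequence space — exactly the
MAPPING_INVALID path instrumented above.
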
From a4debc4772f44737358ea4210f6fca1f19f5c783 Mon Sep 17 00:00:00 2001
From: Matthieu Baerts
Date: Mon, 21 Jun 2021 15:54:38 -0700
Subject: [PATCH 6/6] selftests: mptcp: display proper reason to abort tests

Without this modification, we were often displaying this error message:

  FAIL: Could not even run loopback test

But $ret could have been set to a non-zero value in many different
cases:

- net.mptcp.enabled=0 is not working as expected
- setsockopt(..., TCP_ULP, "mptcp", ...) is allowed
- pings between the different netns are failing
- tests between ns1 as a receiver and ns>1 are failing
- other tests not involving ns1 as a receiver are failing

So the failure was not necessarily in the loopback test. Now a clearer
message, including the time it took to run all the tests, is displayed.

Signed-off-by: Matthieu Baerts
Signed-off-by: Mat Martineau
Signed-off-by: David S. Miller
---
 .../selftests/net/mptcp/mptcp_connect.sh      | 52 +++++++++++++------
 1 file changed, 36 insertions(+), 16 deletions(-)

diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
index 2484fb6a9a8df..559173a8e387b 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
@@ -680,6 +680,25 @@ run_tests_peekmode()
 	run_tests_lo "$ns1" "$ns1" dead:beef:1::1 1 "-P ${peekmode}"
 }
 
+display_time()
+{
+	time_end=$(date +%s)
+	time_run=$((time_end-time_start))
+
+	echo "Time: ${time_run} seconds"
+}
+
+stop_if_error()
+{
+	local msg="$1"
+
+	if [ ${ret} -ne 0 ]; then
+		echo "FAIL: ${msg}" 1>&2
+		display_time
+		exit ${ret}
+	fi
+}
+
 make_file "$cin" "client"
 make_file "$sin" "server"
 
@@ -687,6 +706,8 @@ check_mptcp_disabled
 
 check_mptcp_ulp_setsockopt
 
+stop_if_error "The kernel configuration is not valid for MPTCP"
+
 echo "INFO: validating network environment with pings"
 for sender in "$ns1" "$ns2" "$ns3" "$ns4";do
 	do_ping "$ns1" $sender 10.0.1.1
@@ -706,6 +727,8 @@ for sender in "$ns1" "$ns2" "$ns3" "$ns4";do
 	do_ping "$ns4" $sender dead:beef:3::1
 done
 
+stop_if_error "Could not even run ping tests"
+
 [ -n "$tc_loss" ] && tc -net "$ns2" qdisc add dev ns2eth3 root netem loss random $tc_loss delay ${tc_delay}ms
 echo -n "INFO: Using loss of $tc_loss "
 test "$tc_delay" -gt 0 && echo -n "delay $tc_delay ms "
@@ -733,18 +756,13 @@ echo "on ns3eth4"
 
 tc -net "$ns3" qdisc add dev ns3eth4 root netem delay ${reorder_delay}ms $tc_reorder
 
-for sender in $ns1 $ns2 $ns3 $ns4;do
-	run_tests_lo "$ns1" "$sender" 10.0.1.1 1
-	if [ $ret -ne 0 ] ;then
-		echo "FAIL: Could not even run loopback test" 1>&2
-		exit $ret
-	fi
-	run_tests_lo "$ns1" $sender dead:beef:1::1 1
-	if [ $ret -ne 0 ] ;then
-		echo "FAIL: Could not even run loopback v6 test" 2>&1
-		exit $ret
-	fi
+run_tests_lo "$ns1" "$ns1" 10.0.1.1 1
+stop_if_error "Could not even run loopback test"
 
+run_tests_lo "$ns1" "$ns1" dead:beef:1::1 1
+stop_if_error "Could not even run loopback v6 test"
+
+for sender in $ns1 $ns2 $ns3 $ns4;do
 	# ns1<->ns2 is not subject to reordering/tc delays. Use it to test
 	# mptcp syncookie support.
 	if [ $sender = $ns1 ]; then
@@ -753,6 +771,9 @@ for sender in $ns1 $ns2 $ns3 $ns4;do
 		ip netns exec "$ns2" sysctl -q net.ipv4.tcp_syncookies=1
 	fi
 
+	run_tests "$ns1" $sender 10.0.1.1
+	run_tests "$ns1" $sender dead:beef:1::1
+
 	run_tests "$ns2" $sender 10.0.1.2
 	run_tests "$ns2" $sender dead:beef:1::2
 	run_tests "$ns2" $sender 10.0.2.1
@@ -765,14 +786,13 @@ for sender in $ns1 $ns2 $ns3 $ns4;do
 
 	run_tests "$ns4" $sender 10.0.3.1
 	run_tests "$ns4" $sender dead:beef:3::1
+
+	stop_if_error "Tests with $sender as a sender have failed"
 done
 
 run_tests_peekmode "saveWithPeek"
 run_tests_peekmode "saveAfterPeek"
+stop_if_error "Tests with peek mode have failed"
 
-time_end=$(date +%s)
-time_run=$((time_end-time_start))
-
-echo "Time: ${time_run} seconds"
-
+display_time
 exit $ret