Skip to content

Commit

Permalink
Merge branch 'mptcp-miscellaneous-mptcp-fixes'
Browse files Browse the repository at this point in the history
Mat Martineau says:

====================
mptcp: Miscellaneous MPTCP fixes

This is a collection of small fixup and minor enhancement patches that
have accumulated in the MPTCP tree while net-next was closed. These are
prerequisites for larger changes we have queued up.

Patch 1 refines receive buffer autotuning.

Patches 2 and 4 are some minor locking and refactoring changes.

Patch 3 improves GRO and RX coalescing with MPTCP skbs.

Patches 5-7 add a sysctl for tuning ADD_ADDR retransmission timeout,
corresponding test code, and documentation.

v2: Add sysctl documentation and fix signoff tags.
====================

Link: https://lore.kernel.org/r/20201103190509.27416-1-mathew.j.martineau@linux.intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
  • Loading branch information
Jakub Kicinski committed Nov 5, 2020
2 parents 82728b9 + 8d014ea commit ae23b55
Showing 11 changed files with 199 additions and 47 deletions.
1 change: 1 addition & 0 deletions Documentation/networking/index.rst
Original file line number Diff line number Diff line change
@@ -70,6 +70,7 @@ Contents:
lapb-module
mac80211-injection
mpls-sysctl
mptcp-sysctl
multiqueue
netconsole
netdev-features
26 changes: 26 additions & 0 deletions Documentation/networking/mptcp-sysctl.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
.. SPDX-License-Identifier: GPL-2.0
=====================
MPTCP Sysfs variables
=====================

/proc/sys/net/mptcp/* Variables
===============================

enabled - INTEGER
Control whether MPTCP sockets can be created.

MPTCP sockets can be created if the value is nonzero. This is
a per-namespace sysctl.

Default: 1

add_addr_timeout - INTEGER (seconds)
Set the timeout after which an ADD_ADDR control message will be
resent to an MPTCP peer that has not acknowledged a previous
ADD_ADDR message.

The default value matches TCP_RTO_MAX. This is a per-namespace
sysctl.

Default: 120
1 change: 1 addition & 0 deletions MAINTAINERS
Original file line number Diff line number Diff line change
@@ -12265,6 +12265,7 @@ L: mptcp@lists.01.org
S: Maintained
W: https://github.com/multipath-tcp/mptcp_net-next/wiki
B: https://github.com/multipath-tcp/mptcp_net-next/issues
F: Documentation/networking/mptcp-sysctl.rst
F: include/net/mptcp.h
F: include/uapi/linux/mptcp.h
F: net/mptcp/
21 changes: 20 additions & 1 deletion include/net/mptcp.h
Original file line number Diff line number Diff line change
@@ -29,7 +29,8 @@ struct mptcp_ext {
use_ack:1,
ack64:1,
mpc_map:1,
__unused:2;
frozen:1,
__unused:1;
/* one byte hole */
};

@@ -106,6 +107,19 @@ static inline void mptcp_skb_ext_move(struct sk_buff *to,
from->active_extensions = 0;
}

static inline void mptcp_skb_ext_copy(struct sk_buff *to,
struct sk_buff *from)
{
struct mptcp_ext *from_ext;

from_ext = skb_ext_find(from, SKB_EXT_MPTCP);
if (!from_ext)
return;

from_ext->frozen = 1;
skb_ext_copy(to, from);
}

static inline bool mptcp_ext_matches(const struct mptcp_ext *to_ext,
const struct mptcp_ext *from_ext)
{
@@ -193,6 +207,11 @@ static inline void mptcp_skb_ext_move(struct sk_buff *to,
{
}

static inline void mptcp_skb_ext_copy(struct sk_buff *to,
struct sk_buff *from)
{
}

static inline bool mptcp_skb_can_collapse(const struct sk_buff *to,
const struct sk_buff *from)
{
3 changes: 3 additions & 0 deletions net/ipv4/tcp_output.c
Original file line number Diff line number Diff line change
@@ -1569,6 +1569,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
if (!buff)
return -ENOMEM; /* We'll just try again later. */
skb_copy_decrypted(buff, skb);
mptcp_skb_ext_copy(buff, skb);

sk_wmem_queued_add(sk, buff->truesize);
sk_mem_charge(sk, buff->truesize);
@@ -2123,6 +2124,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
if (unlikely(!buff))
return -ENOMEM;
skb_copy_decrypted(buff, skb);
mptcp_skb_ext_copy(buff, skb);

sk_wmem_queued_add(sk, buff->truesize);
sk_mem_charge(sk, buff->truesize);
@@ -2393,6 +2395,7 @@ static int tcp_mtu_probe(struct sock *sk)

skb = tcp_send_head(sk);
skb_copy_decrypted(nskb, skb);
mptcp_skb_ext_copy(nskb, skb);

TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
14 changes: 14 additions & 0 deletions net/mptcp/ctrl.c
Original file line number Diff line number Diff line change
@@ -18,6 +18,7 @@ struct mptcp_pernet {
struct ctl_table_header *ctl_table_hdr;

int mptcp_enabled;
unsigned int add_addr_timeout;
};

static struct mptcp_pernet *mptcp_get_pernet(struct net *net)
@@ -30,6 +31,11 @@ int mptcp_is_enabled(struct net *net)
return mptcp_get_pernet(net)->mptcp_enabled;
}

unsigned int mptcp_get_add_addr_timeout(struct net *net)
{
return mptcp_get_pernet(net)->add_addr_timeout;
}

static struct ctl_table mptcp_sysctl_table[] = {
{
.procname = "enabled",
@@ -40,12 +46,19 @@ static struct ctl_table mptcp_sysctl_table[] = {
*/
.proc_handler = proc_dointvec,
},
{
.procname = "add_addr_timeout",
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{}
};

static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
{
pernet->mptcp_enabled = 1;
pernet->add_addr_timeout = TCP_RTO_MAX;
}

static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
@@ -61,6 +74,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
}

table[0].data = &pernet->mptcp_enabled;
table[1].data = &pernet->add_addr_timeout;

hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table);
if (!hdr)
8 changes: 6 additions & 2 deletions net/mptcp/pm_netlink.c
Original file line number Diff line number Diff line change
@@ -206,6 +206,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer)
struct mptcp_pm_add_entry *entry = from_timer(entry, timer, add_timer);
struct mptcp_sock *msk = entry->sock;
struct sock *sk = (struct sock *)msk;
struct net *net = sock_net(sk);

pr_debug("msk=%p", msk);

@@ -232,7 +233,8 @@ static void mptcp_pm_add_timer(struct timer_list *timer)
}

if (entry->retrans_times < ADD_ADDR_RETRANS_MAX)
sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX);
sk_reset_timer(sk, timer,
jiffies + mptcp_get_add_addr_timeout(net));

spin_unlock_bh(&msk->pm.lock);

@@ -264,6 +266,7 @@ static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
{
struct mptcp_pm_add_entry *add_entry = NULL;
struct sock *sk = (struct sock *)msk;
struct net *net = sock_net(sk);

if (lookup_anno_list_by_saddr(msk, &entry->addr))
return false;
@@ -279,7 +282,8 @@ static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
add_entry->retrans_times = 0;

timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0);
sk_reset_timer(sk, &add_entry->add_timer, jiffies + TCP_RTO_MAX);
sk_reset_timer(sk, &add_entry->add_timer,
jiffies + mptcp_get_add_addr_timeout(net));

return true;
}
67 changes: 47 additions & 20 deletions net/mptcp/protocol.c
Original file line number Diff line number Diff line change
@@ -466,6 +466,18 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
struct tcp_sock *tp;
u32 old_copied_seq;
bool done = false;
int sk_rbuf;

sk_rbuf = READ_ONCE(sk->sk_rcvbuf);

if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);

if (unlikely(ssk_rbuf > sk_rbuf)) {
WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf);
sk_rbuf = ssk_rbuf;
}
}

pr_debug("msk=%p ssk=%p", msk, ssk);
tp = tcp_sk(ssk);
@@ -528,7 +540,7 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
WRITE_ONCE(tp->copied_seq, seq);
more_data_avail = mptcp_subflow_data_available(ssk);

if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf)) {
if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) {
done = true;
break;
}
@@ -622,6 +634,7 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct mptcp_sock *msk = mptcp_sk(sk);
int sk_rbuf, ssk_rbuf;
bool wake;

/* move_skbs_to_msk below can legitly clear the data_avail flag,
@@ -632,12 +645,16 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
if (wake)
set_bit(MPTCP_DATA_READY, &msk->flags);

if (atomic_read(&sk->sk_rmem_alloc) < READ_ONCE(sk->sk_rcvbuf) &&
move_skbs_to_msk(msk, ssk))
ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
if (unlikely(ssk_rbuf > sk_rbuf))
sk_rbuf = ssk_rbuf;

/* over limit? can't append more skbs to msk */
if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf)
goto wake;

/* don't schedule if mptcp sk is (still) over limit */
if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf))
if (move_skbs_to_msk(msk, ssk))
goto wake;

/* mptcp socket is owned, release_cb should retry */
@@ -754,8 +771,11 @@ static bool mptcp_skb_can_collapse_to(u64 write_seq,
if (!tcp_skb_can_collapse_to(skb))
return false;

/* can collapse only if MPTCP level sequence is in order */
return mpext && mpext->data_seq + mpext->data_len == write_seq;
/* can collapse only if MPTCP level sequence is in order and this
* mapping has not been xmitted yet
*/
return mpext && mpext->data_seq + mpext->data_len == write_seq &&
!mpext->frozen;
}

static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
@@ -833,19 +853,25 @@ static void mptcp_clean_una(struct sock *sk)
}

out:
if (cleaned) {
if (cleaned)
sk_mem_reclaim_partial(sk);
}

/* Only wake up writers if a subflow is ready */
if (mptcp_is_writeable(msk)) {
set_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags);
smp_mb__after_atomic();
static void mptcp_clean_una_wakeup(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);

/* set SEND_SPACE before sk_stream_write_space clears
* NOSPACE
*/
sk_stream_write_space(sk);
}
mptcp_clean_una(sk);

/* Only wake up writers if a subflow is ready */
if (mptcp_is_writeable(msk)) {
set_bit(MPTCP_SEND_SPACE, &msk->flags);
smp_mb__after_atomic();

/* set SEND_SPACE before sk_stream_write_space clears
* NOSPACE
*/
sk_stream_write_space(sk);
}
}

@@ -1476,13 +1502,14 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
__mptcp_flush_join_list(msk);
do {
struct sock *ssk = mptcp_subflow_recv_lookup(msk);
bool slowpath;

if (!ssk)
break;

lock_sock(ssk);
slowpath = lock_sock_fast(ssk);
done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
release_sock(ssk);
unlock_sock_fast(ssk, slowpath);
} while (!done);

if (mptcp_ofo_queue(msk) || moved > 0) {
@@ -1748,7 +1775,7 @@ static void mptcp_worker(struct work_struct *work)
long timeo = 0;

lock_sock(sk);
mptcp_clean_una(sk);
mptcp_clean_una_wakeup(sk);
mptcp_check_data_fin_ack(sk);
__mptcp_flush_join_list(msk);
if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
1 change: 1 addition & 0 deletions net/mptcp/protocol.h
Original file line number Diff line number Diff line change
@@ -362,6 +362,7 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow)
}

int mptcp_is_enabled(struct net *net);
unsigned int mptcp_get_add_addr_timeout(struct net *net);
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
struct mptcp_options_received *mp_opt);
bool mptcp_subflow_data_available(struct sock *sk);
10 changes: 10 additions & 0 deletions tools/testing/selftests/net/mptcp/config
Original file line number Diff line number Diff line change
@@ -5,3 +5,13 @@ CONFIG_INET_DIAG=m
CONFIG_INET_MPTCP_DIAG=m
CONFIG_VETH=y
CONFIG_NET_SCH_NETEM=m
CONFIG_NETFILTER=y
CONFIG_NETFILTER_ADVANCED=y
CONFIG_NETFILTER_NETLINK=m
CONFIG_NF_TABLES=m
CONFIG_NFT_COUNTER=m
CONFIG_NFT_COMPAT=m
CONFIG_NETFILTER_XTABLES=m
CONFIG_NETFILTER_XT_MATCH_BPF=m
CONFIG_NF_TABLES_IPV4=y
CONFIG_NF_TABLES_IPV6=y
94 changes: 70 additions & 24 deletions tools/testing/selftests/net/mptcp/mptcp_join.sh
Original file line number Diff line number Diff line change
@@ -13,6 +13,24 @@ capture=0

TEST_COUNT=0

# generated using "nfbpf_compile '(ip && (ip[54] & 0xf0) == 0x30) ||
# (ip6 && (ip6[74] & 0xf0) == 0x30)'"
CBPF_MPTCP_SUBOPTION_ADD_ADDR="14,
48 0 0 0,
84 0 0 240,
21 0 3 64,
48 0 0 54,
84 0 0 240,
21 6 7 48,
48 0 0 0,
84 0 0 240,
21 0 4 96,
48 0 0 74,
84 0 0 240,
21 0 1 48,
6 0 0 65535,
6 0 0 0"

init()
{
capout=$(mktemp)
@@ -82,6 +100,26 @@ reset_with_cookies()
done
}

reset_with_add_addr_timeout()
{
local ip="${1:-4}"
local tables

tables="iptables"
if [ $ip -eq 6 ]; then
tables="ip6tables"
fi

reset

ip netns exec $ns1 sysctl -q net.mptcp.add_addr_timeout=1
ip netns exec $ns2 $tables -A OUTPUT -p tcp \
-m tcp --tcp-option 30 \
-m bpf --bytecode \
"$CBPF_MPTCP_SUBOPTION_ADD_ADDR" \
-j DROP
}

for arg in "$@"; do
if [ "$arg" = "-c" ]; then
capture=1
@@ -94,6 +132,17 @@ if [ $? -ne 0 ];then
exit $ksft_skip
fi

iptables -V > /dev/null 2>&1
if [ $? -ne 0 ];then
echo "SKIP: Could not run all tests without iptables tool"
exit $ksft_skip
fi

ip6tables -V > /dev/null 2>&1
if [ $? -ne 0 ];then
echo "SKIP: Could not run all tests without ip6tables tool"
exit $ksft_skip
fi

check_transfer()
{
@@ -135,6 +184,7 @@ do_transfer()
connect_addr="$5"
rm_nr_ns1="$6"
rm_nr_ns2="$7"
speed="$8"

port=$((10000+$TEST_COUNT))
TEST_COUNT=$((TEST_COUNT+1))
@@ -159,7 +209,7 @@ do_transfer()
sleep 1
fi

if [[ $rm_nr_ns1 -eq 0 && $rm_nr_ns2 -eq 0 ]]; then
if [ $speed = "fast" ]; then
mptcp_connect="./mptcp_connect -j"
else
mptcp_connect="./mptcp_connect -r"
@@ -250,26 +300,13 @@ run_tests()
listener_ns="$1"
connector_ns="$2"
connect_addr="$3"
rm_nr_ns1="${4:-0}"
rm_nr_ns2="${5:-0}"
speed="${6:-fast}"
lret=0

do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} 0 0
lret=$?
if [ $lret -ne 0 ]; then
ret=$lret
return
fi
}

run_remove_tests()
{
listener_ns="$1"
connector_ns="$2"
connect_addr="$3"
rm_nr_ns1="$4"
rm_nr_ns2="$5"
lret=0

do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} ${rm_nr_ns1} ${rm_nr_ns2}
do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} \
${rm_nr_ns1} ${rm_nr_ns2} ${speed}
lret=$?
if [ $lret -ne 0 ]; then
ret=$lret
@@ -491,12 +528,21 @@ run_tests $ns1 $ns2 10.0.1.1
chk_join_nr "multiple subflows and signal" 3 3 3
chk_add_nr 1 1

# add_addr timeout
reset_with_add_addr_timeout
ip netns exec $ns1 ./pm_nl_ctl limits 0 1
ip netns exec $ns2 ./pm_nl_ctl limits 1 1
ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
run_tests $ns1 $ns2 10.0.1.1 0 0 slow
chk_join_nr "signal address, ADD_ADDR timeout" 1 1 1
chk_add_nr 4 0

# single subflow, remove
reset
ip netns exec $ns1 ./pm_nl_ctl limits 0 1
ip netns exec $ns2 ./pm_nl_ctl limits 0 1
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
run_remove_tests $ns1 $ns2 10.0.1.1 0 1
run_tests $ns1 $ns2 10.0.1.1 0 1 slow
chk_join_nr "remove single subflow" 1 1 1
chk_rm_nr 1 1

@@ -506,7 +552,7 @@ ip netns exec $ns1 ./pm_nl_ctl limits 0 2
ip netns exec $ns2 ./pm_nl_ctl limits 0 2
ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
run_remove_tests $ns1 $ns2 10.0.1.1 0 2
run_tests $ns1 $ns2 10.0.1.1 0 2 slow
chk_join_nr "remove multiple subflows" 2 2 2
chk_rm_nr 2 2

@@ -515,7 +561,7 @@ reset
ip netns exec $ns1 ./pm_nl_ctl limits 0 1
ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
ip netns exec $ns2 ./pm_nl_ctl limits 1 1
run_remove_tests $ns1 $ns2 10.0.1.1 1 0
run_tests $ns1 $ns2 10.0.1.1 1 0 slow
chk_join_nr "remove single address" 1 1 1
chk_add_nr 1 1
chk_rm_nr 0 0
@@ -526,7 +572,7 @@ ip netns exec $ns1 ./pm_nl_ctl limits 0 2
ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
ip netns exec $ns2 ./pm_nl_ctl limits 1 2
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
run_remove_tests $ns1 $ns2 10.0.1.1 1 1
run_tests $ns1 $ns2 10.0.1.1 1 1 slow
chk_join_nr "remove subflow and signal" 2 2 2
chk_add_nr 1 1
chk_rm_nr 1 1
@@ -538,7 +584,7 @@ ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
ip netns exec $ns2 ./pm_nl_ctl limits 1 3
ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow
run_remove_tests $ns1 $ns2 10.0.1.1 1 2
run_tests $ns1 $ns2 10.0.1.1 1 2 slow
chk_join_nr "remove subflows and signal" 3 3 3
chk_add_nr 1 1
chk_rm_nr 2 2

0 comments on commit ae23b55

Please sign in to comment.