Skip to content

Commit

Permalink
Merge branch 'mptcp-improve-backup-subflows'
Browse files Browse the repository at this point in the history
Mat Martineau says:

====================
mptcp: Improve use of backup subflows

Multipath TCP combines multiple TCP subflows in to one stream, and the
MPTCP-level socket must decide which subflow to use when sending (or
resending) chunks of data. The choice of the "best" subflow to transmit
on can vary depending on the priority (normal or backup) for each
subflow and how well the subflow is performing.

In order to improve MPTCP performance when some subflows are failing,
this patch set changes how backup subflows are utilized and introduces
tracking of "stale" subflows that are still connected but not making
progress.

Patch 1 adjusts MPTCP-level retransmit timeouts to use data from all
subflows.

Patch 2 makes MPTCP-level retransmissions less aggressive to avoid
resending data that's still queued at the TCP level.

Patch 3 changes the way pending data is handled when subflows are
closed. Unacked MPTCP-level data still in the subflow tx queue is
immediately moved to another subflow for transmission instead of waiting
for MPTCP-level timeouts to trigger retransmission.

Patch 4 has some sysctl code cleanup.

Patches 5 and 6 add tracking of "stale" subflows, so only underlying TCP
subflow connections that appear to be making progress are considered
when selecting a subflow to (re)transmit data. How fast a subflow goes
stale is configurable with a per-namespace sysctl. Related MIBS are
added too.

Patch 7 makes sure the backup flag is always correctly recorded when the
MP_JOIN SYN/ACK is received for an added subflow.

Patch 8 adds more test cases for backup subflows and stale subflows.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Aug 14, 2021
2 parents e5f3155 + 7d1e6f1 commit 38e3bfa
Show file tree
Hide file tree
Showing 11 changed files with 464 additions and 88 deletions.
12 changes: 12 additions & 0 deletions Documentation/networking/mptcp-sysctl.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,15 @@ allow_join_initial_addr_port - BOOLEAN
This is a per-namespace sysctl.

Default: 1

stale_loss_cnt - INTEGER
The number of MPTCP-level retransmission intervals with no traffic and
pending outstanding data on a given subflow required to declare it stale.
The packet scheduler ignores stale subflows.
A low stale_loss_cnt value allows for fast active-backup switch-over,
an high value maximize links utilization on edge scenarios e.g. lossy
link with high BER or peer pausing the data processing.

This is a per-namespace sysctl.

Default: 4
26 changes: 20 additions & 6 deletions net/mptcp/ctrl.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,43 +21,50 @@ struct mptcp_pernet {
struct ctl_table_header *ctl_table_hdr;
#endif

u8 mptcp_enabled;
unsigned int add_addr_timeout;
unsigned int stale_loss_cnt;
u8 mptcp_enabled;
u8 checksum_enabled;
u8 allow_join_initial_addr_port;
};

static struct mptcp_pernet *mptcp_get_pernet(struct net *net)
static struct mptcp_pernet *mptcp_get_pernet(const struct net *net)
{
return net_generic(net, mptcp_pernet_id);
}

int mptcp_is_enabled(struct net *net)
int mptcp_is_enabled(const struct net *net)
{
return mptcp_get_pernet(net)->mptcp_enabled;
}

unsigned int mptcp_get_add_addr_timeout(struct net *net)
unsigned int mptcp_get_add_addr_timeout(const struct net *net)
{
return mptcp_get_pernet(net)->add_addr_timeout;
}

int mptcp_is_checksum_enabled(struct net *net)
int mptcp_is_checksum_enabled(const struct net *net)
{
return mptcp_get_pernet(net)->checksum_enabled;
}

int mptcp_allow_join_id0(struct net *net)
int mptcp_allow_join_id0(const struct net *net)
{
return mptcp_get_pernet(net)->allow_join_initial_addr_port;
}

unsigned int mptcp_stale_loss_cnt(const struct net *net)
{
return mptcp_get_pernet(net)->stale_loss_cnt;
}

static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
{
pernet->mptcp_enabled = 1;
pernet->add_addr_timeout = TCP_RTO_MAX;
pernet->checksum_enabled = 0;
pernet->allow_join_initial_addr_port = 1;
pernet->stale_loss_cnt = 4;
}

#ifdef CONFIG_SYSCTL
Expand Down Expand Up @@ -95,6 +102,12 @@ static struct ctl_table mptcp_sysctl_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE
},
{
.procname = "stale_loss_cnt",
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_douintvec_minmax,
},
{}
};

Expand All @@ -114,6 +127,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
table[1].data = &pernet->add_addr_timeout;
table[2].data = &pernet->checksum_enabled;
table[3].data = &pernet->allow_join_initial_addr_port;
table[4].data = &pernet->stale_loss_cnt;

hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table);
if (!hdr)
Expand Down
2 changes: 2 additions & 0 deletions net/mptcp/mib.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("MPPrioTx", MPTCP_MIB_MPPRIOTX),
SNMP_MIB_ITEM("MPPrioRx", MPTCP_MIB_MPPRIORX),
SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED),
SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE),
SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER),
SNMP_MIB_SENTINEL
};

Expand Down
2 changes: 2 additions & 0 deletions net/mptcp/mib.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ enum linux_mptcp_mib_field {
MPTCP_MIB_MPPRIOTX, /* Transmit a MP_PRIO */
MPTCP_MIB_MPPRIORX, /* Received a MP_PRIO */
MPTCP_MIB_RCVPRUNED, /* Incoming packet dropped due to memory limit */
MPTCP_MIB_SUBFLOWSTALE, /* Subflows entered 'stale' status */
MPTCP_MIB_SUBFLOWRECOVER, /* Subflows returned to active status after being stale */
__MPTCP_MIB_MAX
};

Expand Down
8 changes: 5 additions & 3 deletions net/mptcp/options.c
Original file line number Diff line number Diff line change
Expand Up @@ -975,9 +975,11 @@ static void ack_update_msk(struct mptcp_sock *msk,
old_snd_una = msk->snd_una;
new_snd_una = mptcp_expand_seq(old_snd_una, mp_opt->data_ack, mp_opt->ack64);

/* ACK for data not even sent yet? Ignore. */
if (after64(new_snd_una, snd_nxt))
new_snd_una = old_snd_una;
/* ACK for data not even sent yet and even above recovery bound? Ignore.*/
if (unlikely(after64(new_snd_una, snd_nxt))) {
if (!msk->recovery || after64(new_snd_una, msk->recovery_snd_nxt))
new_snd_una = old_snd_una;
}

new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd;

Expand Down
21 changes: 21 additions & 0 deletions net/mptcp/pm.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
#include <net/mptcp.h>
#include "protocol.h"

#include "mib.h"

/* path manager command handlers */

int mptcp_pm_announce_addr(struct mptcp_sock *msk,
Expand Down Expand Up @@ -308,6 +310,25 @@ int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
return mptcp_pm_nl_get_local_id(msk, skc);
}

void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
u32 rcv_tstamp = READ_ONCE(tcp_sk(ssk)->rcv_tstamp);

/* keep track of rtx periods with no progress */
if (!subflow->stale_count) {
subflow->stale_rcv_tstamp = rcv_tstamp;
subflow->stale_count++;
} else if (subflow->stale_rcv_tstamp == rcv_tstamp) {
if (subflow->stale_count < U8_MAX)
subflow->stale_count++;
mptcp_pm_nl_subflow_chk_stale(msk, ssk);
} else {
subflow->stale_count = 0;
mptcp_subflow_set_active(subflow);
}
}

void mptcp_pm_data_init(struct mptcp_sock *msk)
{
msk->pm.add_addr_signaled = 0;
Expand Down
39 changes: 39 additions & 0 deletions net/mptcp/pm_netlink.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ struct pm_nl_pernet {
spinlock_t lock;
struct list_head local_addr_list;
unsigned int addrs;
unsigned int stale_loss_cnt;
unsigned int add_addr_signal_max;
unsigned int add_addr_accept_max;
unsigned int local_addr_max;
Expand Down Expand Up @@ -899,6 +900,43 @@ static const struct nla_policy mptcp_pm_policy[MPTCP_PM_ATTR_MAX + 1] = {
[MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, },
};

void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)
{
struct mptcp_subflow_context *iter, *subflow = mptcp_subflow_ctx(ssk);
struct sock *sk = (struct sock *)msk;
unsigned int active_max_loss_cnt;
struct net *net = sock_net(sk);
unsigned int stale_loss_cnt;
bool slow;

stale_loss_cnt = mptcp_stale_loss_cnt(net);
if (subflow->stale || !stale_loss_cnt || subflow->stale_count <= stale_loss_cnt)
return;

/* look for another available subflow not in loss state */
active_max_loss_cnt = max_t(int, stale_loss_cnt - 1, 1);
mptcp_for_each_subflow(msk, iter) {
if (iter != subflow && mptcp_subflow_active(iter) &&
iter->stale_count < active_max_loss_cnt) {
/* we have some alternatives, try to mark this subflow as idle ...*/
slow = lock_sock_fast(ssk);
if (!tcp_rtx_and_write_queues_empty(ssk)) {
subflow->stale = 1;
__mptcp_retransmit_pending_data(sk);
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_SUBFLOWSTALE);
}
unlock_sock_fast(ssk, slow);

/* always try to push the pending data regarless of re-injections:
* we can possibly use backup subflows now, and subflow selection
* is cheap under the msk socket lock
*/
__mptcp_push_pending(sk, 0);
return;
}
}
}

static int mptcp_pm_family_to_addr(int family)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
Expand Down Expand Up @@ -1922,6 +1960,7 @@ static int __net_init pm_nl_init_net(struct net *net)

INIT_LIST_HEAD_RCU(&pernet->local_addr_list);
pernet->next_id = 1;
pernet->stale_loss_cnt = 4;
spin_lock_init(&pernet->lock);

/* No need to initialize other pernet fields, the struct is zeroed at
Expand Down
Loading

0 comments on commit 38e3bfa

Please sign in to comment.