Skip to content

Commit

Permalink
Merge branch 'mptcp-path-manager-mode-selection'
Browse files Browse the repository at this point in the history
Mat Martineau says:

====================
mptcp: Path manager mode selection

MPTCP already has an in-kernel path manager (PM) to add and remove TCP
subflows associated with a given MPTCP connection. This in-kernel PM has
been designed to handle typical server-side use cases, but is not very
flexible or configurable for client devices that may have more
complicated policies to implement.

This patch series from the MPTCP tree is the first step toward adding a
generic-netlink-based API for MPTCP path management, which a privileged
userspace daemon will be able to use to control subflow
establishment. These patches add a per-namespace sysctl to select the
default PM type (in-kernel or userspace) for new MPTCP sockets. New
self-tests confirm expected behavior when userspace PM is selected but
there is no daemon available to handle existing MPTCP PM events.

Subsequent patch series (already staged in the MPTCP tree) will add the
generic netlink path management API.
====================

Link: https://lore.kernel.org/r/20220427225002.231996-1-mathew.j.martineau@linux.intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
  • Loading branch information
Jakub Kicinski committed Apr 30, 2022
2 parents a41c653 + 5ac1d2d commit 4994d4f
Show file tree
Hide file tree
Showing 6 changed files with 167 additions and 34 deletions.
18 changes: 18 additions & 0 deletions Documentation/networking/mptcp-sysctl.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,24 @@ allow_join_initial_addr_port - BOOLEAN

Default: 1

pm_type - INTEGER

Set the default path manager type to use for each new MPTCP
socket. In-kernel path management will control subflow
connections and address advertisements according to
per-namespace values configured over the MPTCP netlink
API. Userspace path management puts per-MPTCP-connection subflow
connection decisions and address advertisements under control of
a privileged userspace program, at the cost of more netlink
traffic to propagate all of the related events and commands.

This is a per-namespace sysctl.

* 0 - In-kernel path manager
* 1 - Userspace path manager

Default: 0

stale_loss_cnt - INTEGER
The number of MPTCP-level retransmission intervals with no traffic and
pending outstanding data on a given subflow required to declare it stale.
Expand Down
21 changes: 21 additions & 0 deletions net/mptcp/ctrl.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
#define MPTCP_SYSCTL_PATH "net/mptcp"

static int mptcp_pernet_id;

#ifdef CONFIG_SYSCTL
static int mptcp_pm_type_max = __MPTCP_PM_TYPE_MAX;
#endif

struct mptcp_pernet {
#ifdef CONFIG_SYSCTL
struct ctl_table_header *ctl_table_hdr;
Expand All @@ -26,6 +31,7 @@ struct mptcp_pernet {
u8 mptcp_enabled;
u8 checksum_enabled;
u8 allow_join_initial_addr_port;
u8 pm_type;
};

static struct mptcp_pernet *mptcp_get_pernet(const struct net *net)
Expand Down Expand Up @@ -58,13 +64,19 @@ unsigned int mptcp_stale_loss_cnt(const struct net *net)
return mptcp_get_pernet(net)->stale_loss_cnt;
}

int mptcp_get_pm_type(const struct net *net)
{
return mptcp_get_pernet(net)->pm_type;
}

static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
{
pernet->mptcp_enabled = 1;
pernet->add_addr_timeout = TCP_RTO_MAX;
pernet->checksum_enabled = 0;
pernet->allow_join_initial_addr_port = 1;
pernet->stale_loss_cnt = 4;
pernet->pm_type = MPTCP_PM_TYPE_KERNEL;
}

#ifdef CONFIG_SYSCTL
Expand Down Expand Up @@ -108,6 +120,14 @@ static struct ctl_table mptcp_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_douintvec_minmax,
},
{
.procname = "pm_type",
.maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = &mptcp_pm_type_max
},
{}
};

Expand All @@ -128,6 +148,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
table[2].data = &pernet->checksum_enabled;
table[3].data = &pernet->allow_join_initial_addr_port;
table[4].data = &pernet->stale_loss_cnt;
table[5].data = &pernet->pm_type;

hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table);
if (!hdr)
Expand Down
50 changes: 35 additions & 15 deletions net/mptcp/pm.c
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk,

spin_lock_bh(&pm->lock);

if (!READ_ONCE(pm->accept_addr)) {
if (!READ_ONCE(pm->accept_addr) || mptcp_pm_is_userspace(msk)) {
mptcp_pm_announce_addr(msk, addr, true);
mptcp_pm_add_addr_send_ack(msk);
} else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) {
Expand Down Expand Up @@ -415,21 +415,41 @@ void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)

void mptcp_pm_data_reset(struct mptcp_sock *msk)
{
msk->pm.add_addr_signaled = 0;
msk->pm.add_addr_accepted = 0;
msk->pm.local_addr_used = 0;
msk->pm.subflows = 0;
msk->pm.rm_list_tx.nr = 0;
msk->pm.rm_list_rx.nr = 0;
WRITE_ONCE(msk->pm.work_pending, false);
WRITE_ONCE(msk->pm.addr_signal, 0);
WRITE_ONCE(msk->pm.accept_addr, false);
WRITE_ONCE(msk->pm.accept_subflow, false);
WRITE_ONCE(msk->pm.remote_deny_join_id0, false);
msk->pm.status = 0;
bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
u8 pm_type = mptcp_get_pm_type(sock_net((struct sock *)msk));
struct mptcp_pm_data *pm = &msk->pm;

pm->add_addr_signaled = 0;
pm->add_addr_accepted = 0;
pm->local_addr_used = 0;
pm->subflows = 0;
pm->rm_list_tx.nr = 0;
pm->rm_list_rx.nr = 0;
WRITE_ONCE(pm->pm_type, pm_type);

if (pm_type == MPTCP_PM_TYPE_KERNEL) {
bool subflows_allowed = !!mptcp_pm_get_subflows_max(msk);

/* pm->work_pending must be only be set to 'true' when
* pm->pm_type is set to MPTCP_PM_TYPE_KERNEL
*/
WRITE_ONCE(pm->work_pending,
(!!mptcp_pm_get_local_addr_max(msk) &&
subflows_allowed) ||
!!mptcp_pm_get_add_addr_signal_max(msk));
WRITE_ONCE(pm->accept_addr,
!!mptcp_pm_get_add_addr_accept_max(msk) &&
subflows_allowed);
WRITE_ONCE(pm->accept_subflow, subflows_allowed);
} else {
WRITE_ONCE(pm->work_pending, 0);
WRITE_ONCE(pm->accept_addr, 0);
WRITE_ONCE(pm->accept_subflow, 0);
}

mptcp_pm_nl_data_init(msk);
WRITE_ONCE(pm->addr_signal, 0);
WRITE_ONCE(pm->remote_deny_join_id0, false);
pm->status = 0;
bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
}

void mptcp_pm_data_init(struct mptcp_sock *msk)
Expand Down
30 changes: 12 additions & 18 deletions net/mptcp/pm_netlink.c
Original file line number Diff line number Diff line change
Expand Up @@ -1061,18 +1061,6 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
return ret;
}

void mptcp_pm_nl_data_init(struct mptcp_sock *msk)
{
struct mptcp_pm_data *pm = &msk->pm;
bool subflows;

subflows = !!mptcp_pm_get_subflows_max(msk);
WRITE_ONCE(pm->work_pending, (!!mptcp_pm_get_local_addr_max(msk) && subflows) ||
!!mptcp_pm_get_add_addr_signal_max(msk));
WRITE_ONCE(pm->accept_addr, !!mptcp_pm_get_add_addr_accept_max(msk) && subflows);
WRITE_ONCE(pm->accept_subflow, subflows);
}

#define MPTCP_PM_CMD_GRP_OFFSET 0
#define MPTCP_PM_EV_GRP_OFFSET 1

Expand Down Expand Up @@ -1232,7 +1220,8 @@ static int mptcp_nl_add_subflow_or_signal_addr(struct net *net)
while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
struct sock *sk = (struct sock *)msk;

if (!READ_ONCE(msk->fully_established))
if (!READ_ONCE(msk->fully_established) ||
mptcp_pm_is_userspace(msk))
goto next;

lock_sock(sk);
Expand Down Expand Up @@ -1375,6 +1364,9 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net,
struct sock *sk = (struct sock *)msk;
bool remove_subflow;

if (mptcp_pm_is_userspace(msk))
goto next;

if (list_empty(&msk->conn_list)) {
mptcp_pm_remove_anno_addr(msk, addr, false);
goto next;
Expand Down Expand Up @@ -1409,7 +1401,7 @@ static int mptcp_nl_remove_id_zero_address(struct net *net,
struct sock *sk = (struct sock *)msk;
struct mptcp_addr_info msk_local;

if (list_empty(&msk->conn_list))
if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk))
goto next;

local_address((struct sock_common *)msk, &msk_local);
Expand Down Expand Up @@ -1516,9 +1508,11 @@ static void mptcp_nl_remove_addrs_list(struct net *net,
while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
struct sock *sk = (struct sock *)msk;

lock_sock(sk);
mptcp_pm_remove_addrs_and_subflows(msk, rm_list);
release_sock(sk);
if (!mptcp_pm_is_userspace(msk)) {
lock_sock(sk);
mptcp_pm_remove_addrs_and_subflows(msk, rm_list);
release_sock(sk);
}

sock_put(sk);
cond_resched();
Expand Down Expand Up @@ -1791,7 +1785,7 @@ static int mptcp_nl_set_flags(struct net *net,
while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
struct sock *sk = (struct sock *)msk;

if (list_empty(&msk->conn_list))
if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk))
goto next;

lock_sock(sk);
Expand Down
16 changes: 15 additions & 1 deletion net/mptcp/protocol.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,14 @@ enum mptcp_pm_status {
*/
};

enum mptcp_pm_type {
MPTCP_PM_TYPE_KERNEL = 0,
MPTCP_PM_TYPE_USERSPACE,

__MPTCP_PM_TYPE_NR,
__MPTCP_PM_TYPE_MAX = __MPTCP_PM_TYPE_NR - 1,
};

/* Status bits below MPTCP_PM_ALREADY_ESTABLISHED need pm worker actions */
#define MPTCP_PM_WORK_MASK ((1 << MPTCP_PM_ALREADY_ESTABLISHED) - 1)

Expand Down Expand Up @@ -212,6 +220,7 @@ struct mptcp_pm_data {
u8 add_addr_signaled;
u8 add_addr_accepted;
u8 local_addr_used;
u8 pm_type;
u8 subflows;
u8 status;
DECLARE_BITMAP(id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
Expand Down Expand Up @@ -576,6 +585,7 @@ unsigned int mptcp_get_add_addr_timeout(const struct net *net);
int mptcp_is_checksum_enabled(const struct net *net);
int mptcp_allow_join_id0(const struct net *net);
unsigned int mptcp_stale_loss_cnt(const struct net *net);
int mptcp_get_pm_type(const struct net *net);
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
struct mptcp_options_received *mp_opt);
bool __mptcp_retransmit_pending_data(struct sock *sk);
Expand Down Expand Up @@ -796,6 +806,11 @@ static inline bool mptcp_pm_should_rm_signal(struct mptcp_sock *msk)
return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_RM_ADDR_SIGNAL);
}

static inline bool mptcp_pm_is_userspace(const struct mptcp_sock *msk)
{
return READ_ONCE(msk->pm.pm_type) == MPTCP_PM_TYPE_USERSPACE;
}

static inline unsigned int mptcp_add_addr_len(int family, bool echo, bool port)
{
u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE;
Expand Down Expand Up @@ -828,7 +843,6 @@ bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);

void __init mptcp_pm_nl_init(void);
void mptcp_pm_nl_data_init(struct mptcp_sock *msk);
void mptcp_pm_nl_work(struct mptcp_sock *msk);
void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk,
const struct mptcp_rm_list *rm_list);
Expand Down
66 changes: 66 additions & 0 deletions tools/testing/selftests/net/mptcp/mptcp_join.sh
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ init_partial()
ip netns add $netns || exit $ksft_skip
ip -net $netns link set lo up
ip netns exec $netns sysctl -q net.mptcp.enabled=1
ip netns exec $netns sysctl -q net.mptcp.pm_type=0
ip netns exec $netns sysctl -q net.ipv4.conf.all.rp_filter=0
ip netns exec $netns sysctl -q net.ipv4.conf.default.rp_filter=0
if [ $checksum -eq 1 ]; then
Expand Down Expand Up @@ -1611,6 +1612,13 @@ wait_attempt_fail()
return 1
}

set_userspace_pm()
{
local ns=$1

ip netns exec $ns sysctl -q net.mptcp.pm_type=1
}

subflows_tests()
{
if reset "no JOIN"; then
Expand Down Expand Up @@ -2698,6 +2706,63 @@ fail_tests()
fi
}

userspace_tests()
{
# userspace pm type prevents add_addr
if reset "userspace pm type prevents add_addr"; then
set_userspace_pm $ns1
pm_nl_set_limits $ns1 0 2
pm_nl_set_limits $ns2 0 2
pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 0 0 0
chk_add_nr 0 0
fi

# userspace pm type rejects join
if reset "userspace pm type rejects join"; then
set_userspace_pm $ns1
pm_nl_set_limits $ns1 1 1
pm_nl_set_limits $ns2 1 1
pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 1 1 0
fi

# userspace pm type does not send join
if reset "userspace pm type does not send join"; then
set_userspace_pm $ns2
pm_nl_set_limits $ns1 1 1
pm_nl_set_limits $ns2 1 1
pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 0 0 0
fi

# userspace pm type prevents mp_prio
if reset "userspace pm type prevents mp_prio"; then
set_userspace_pm $ns1
pm_nl_set_limits $ns1 1 1
pm_nl_set_limits $ns2 1 1
pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow backup
chk_join_nr 1 1 0
chk_prio_nr 0 0
fi

# userspace pm type prevents rm_addr
if reset "userspace pm type prevents rm_addr"; then
set_userspace_pm $ns1
set_userspace_pm $ns2
pm_nl_set_limits $ns1 0 1
pm_nl_set_limits $ns2 0 1
pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
run_tests $ns1 $ns2 10.0.1.1 0 0 -1 slow
chk_join_nr 0 0 0
chk_rm_nr 0 0
fi
}

implicit_tests()
{
# userspace pm type prevents add_addr
Expand Down Expand Up @@ -2767,6 +2832,7 @@ all_tests_sorted=(
m@fullmesh_tests
z@fastclose_tests
F@fail_tests
u@userspace_tests
I@implicit_tests
)

Expand Down

0 comments on commit 4994d4f

Please sign in to comment.