Merge branch 'mptcp-re-enable-sndbuf-autotune'
Paolo Abeni says:

====================
mptcp: re-enable sndbuf autotune

The send buffer autotuning was unintentionally disabled as a
side effect of the recent workqueue removal refactor. These
patches re-enable it, with some extra care: with autotuning
enabled and a large send buffer we need a more accurate
packet scheduler to use the available subflow bandwidth
efficiently, especially when the subflows have different
capacities.

The first patch cleans up subflow socket handling, making
the actual re-enable (patch 2) simpler.

Patches 3 and 4 improve the packet scheduler, to better cope
with non-trivial scenarios and large send buffers.

Finally, patch 5 adds and uses some infrastructure to avoid
workqueue usage for the packet scheduler operations introduced
by the previous patches.
====================

Link: https://lore.kernel.org/r/cover.1611153172.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Jakub Kicinski committed Jan 23, 2021
2 parents e26ca4b + b19bc29 commit 07fe179
Showing 3 changed files with 258 additions and 63 deletions.
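
As a rough illustration of what the re-enabled autotune amounts to: the MPTCP-level sk_sndbuf is grown to match the largest subflow send buffer unless the application pinned it via SO_SNDBUF, the same check that used to live inline in mptcp_push_pending() (visible among the removed lines below) and that the series moves into mptcp_propagate_sndbuf(), a helper called in this diff but defined outside it. The sketch below is a hypothetical user-space model of that check, not the kernel code.

/* Illustrative user-space model of the sndbuf propagation check; the
 * struct and names are hypothetical, the logic mirrors the inline
 * autotune code removed from mptcp_push_pending() in this commit.
 */
#include <stdbool.h>
#include <stdio.h>

#define MODEL_SNDBUF_LOCK 0x1	/* application called SO_SNDBUF */

struct model_sock {
	unsigned int sk_userlocks;
	int sk_sndbuf;
};

/* Grow the parent (MPTCP-level) send buffer to match a subflow, unless
 * the application locked the value explicitly.
 */
static bool model_propagate_sndbuf(struct model_sock *msk,
				   const struct model_sock *ssk)
{
	if (msk->sk_userlocks & MODEL_SNDBUF_LOCK)
		return false;
	if (ssk->sk_sndbuf <= msk->sk_sndbuf)
		return false;
	msk->sk_sndbuf = ssk->sk_sndbuf;
	return true;
}

int main(void)
{
	struct model_sock msk = { .sk_userlocks = 0, .sk_sndbuf = 16384 };
	struct model_sock ssk = { .sk_userlocks = 0, .sk_sndbuf = 87380 };

	if (model_propagate_sndbuf(&msk, &ssk))
		printf("msk sndbuf grown to %d\n", msk.sk_sndbuf);
	return 0;
}
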
179 changes: 117 additions & 62 deletions net/mptcp/protocol.c
@@ -45,6 +45,9 @@ static struct percpu_counter mptcp_sockets_allocated;
static void __mptcp_destroy_sock(struct sock *sk);
static void __mptcp_check_send_data_fin(struct sock *sk);

DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
static struct net_device mptcp_napi_dev;

/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
* completed yet or has failed, return the subflow socket.
* Otherwise return NULL.
@@ -114,11 +117,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk)
list_add(&subflow->node, &msk->conn_list);
sock_hold(ssock->sk);
subflow->request_mptcp = 1;

/* accept() will wait on first subflow sk_wq, and we always wakes up
* via msk->sk_socket
*/
RCU_INIT_POINTER(msk->first->sk_wq, &sk->sk_socket->wq);
mptcp_sock_graft(msk->first, sk->sk_socket);

return 0;
}
@@ -734,10 +733,14 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)

void __mptcp_flush_join_list(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;

if (likely(list_empty(&msk->join_list)))
return;

spin_lock_bh(&msk->join_list_lock);
list_for_each_entry(subflow, &msk->join_list, node)
mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow));
list_splice_tail_init(&msk->join_list, &msk->conn_list);
spin_unlock_bh(&msk->join_list_lock);
}
@@ -1037,13 +1040,6 @@ static void __mptcp_clean_una(struct sock *sk)
__mptcp_update_wmem(sk);
sk_mem_reclaim_partial(sk);
}

if (sk_stream_is_writeable(sk)) {
/* pairs with memory barrier in mptcp_poll */
smp_mb();
if (test_and_clear_bit(MPTCP_NOSPACE, &msk->flags))
sk_stream_write_space(sk);
}
}

if (snd_una == READ_ONCE(msk->snd_nxt)) {
@@ -1362,8 +1358,7 @@ struct subflow_send_info {
u64 ratio;
};

static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
u32 *sndbuf)
static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
struct subflow_send_info send_info[2];
struct mptcp_subflow_context *subflow;
@@ -1374,24 +1369,17 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,

sock_owned_by_me((struct sock *)msk);

*sndbuf = 0;
if (__mptcp_check_fallback(msk)) {
if (!msk->first)
return NULL;
*sndbuf = msk->first->sk_sndbuf;
return sk_stream_memory_free(msk->first) ? msk->first : NULL;
}

/* re-use last subflow, if the burst allow that */
if (msk->last_snd && msk->snd_burst > 0 &&
sk_stream_memory_free(msk->last_snd) &&
mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
mptcp_for_each_subflow(msk, subflow) {
ssk = mptcp_subflow_tcp_sock(subflow);
*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
}
mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd)))
return msk->last_snd;
}

/* pick the subflow with the lower wmem/wspace ratio */
for (i = 0; i < 2; ++i) {
@@ -1404,8 +1392,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
continue;

nr_active += !subflow->backup;
*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
if (!sk_stream_memory_free(subflow->tcp_sock))
if (!sk_stream_memory_free(subflow->tcp_sock) || !tcp_sk(ssk)->snd_wnd)
continue;

pace = READ_ONCE(ssk->sk_pacing_rate);
@@ -1431,9 +1418,10 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
if (send_info[0].ssk) {
msk->last_snd = send_info[0].ssk;
msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
sk_stream_wspace(msk->last_snd));
tcp_sk(msk->last_snd)->snd_wnd);
return msk->last_snd;
}

return NULL;
}
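
/* An illustrative user-space model (not part of this commit) of the
 * subflow selection above: unusable subflows (no free memory, zero peer
 * window) are skipped, and among the rest the one with the lowest
 * queued-bytes/pacing-rate ratio wins, preferring non-backup subflows.
 * The ratio computation itself sits in lines collapsed from this diff;
 * the linger-time formula below, (wmem << 32) / pacing_rate, is assumed
 * from the surrounding code.  All names here are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

struct model_subflow {
	const char *name;
	int backup;		/* MP_JOIN backup flag */
	uint64_t wmem_queued;	/* bytes already queued on the subflow */
	uint64_t pacing_rate;	/* bytes per second */
	uint32_t snd_wnd;	/* peer receive window */
	int memory_free;	/* sk_stream_memory_free() outcome */
};

static struct model_subflow *pick_subflow(struct model_subflow *sf, int n)
{
	/* slot 0: non-backup subflows, slot 1: backup subflows */
	struct model_subflow *best[2] = { NULL, NULL };
	uint64_t best_ratio[2] = { UINT64_MAX, UINT64_MAX };
	int nr_active = 0, i;

	for (i = 0; i < n; i++) {
		uint64_t ratio;

		nr_active += !sf[i].backup;
		if (!sf[i].memory_free || !sf[i].snd_wnd || !sf[i].pacing_rate)
			continue;

		ratio = (sf[i].wmem_queued << 32) / sf[i].pacing_rate;
		if (ratio < best_ratio[sf[i].backup]) {
			best_ratio[sf[i].backup] = ratio;
			best[sf[i].backup] = &sf[i];
		}
	}

	/* fall back to a backup subflow only when no regular one is active */
	if (!nr_active)
		return best[1];
	return best[0];
}

int main(void)
{
	struct model_subflow sf[] = {
		{ "wifi", 0, 64 * 1024, 2 * 1000 * 1000, 65535, 1 },
		{ "lte",  0, 16 * 1024, 8 * 1000 * 1000, 65535, 1 },
	};
	struct model_subflow *best = pick_subflow(sf, 2);

	printf("scheduler picks: %s\n", best ? best->name : "none");
	return 0;
}
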

@@ -1454,7 +1442,6 @@ static void mptcp_push_pending(struct sock *sk, unsigned int flags)
};
struct mptcp_data_frag *dfrag;
int len, copied = 0;
u32 sndbuf;

while ((dfrag = mptcp_send_head(sk))) {
info.sent = dfrag->already_sent;
@@ -1465,12 +1452,7 @@ static void mptcp_push_pending(struct sock *sk, unsigned int flags)

prev_ssk = ssk;
__mptcp_flush_join_list(msk);
ssk = mptcp_subflow_get_send(msk, &sndbuf);

/* do auto tuning */
if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
sndbuf > READ_ONCE(sk->sk_sndbuf))
WRITE_ONCE(sk->sk_sndbuf, sndbuf);
ssk = mptcp_subflow_get_send(msk);

/* try to keep the subflow socket lock across
* consecutive xmit on the same socket
@@ -1527,7 +1509,9 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_sendmsg_info info;
struct mptcp_data_frag *dfrag;
struct sock *xmit_ssk;
int len, copied = 0;
bool first = true;

info.flags = 0;
while ((dfrag = mptcp_send_head(sk))) {
@@ -1537,10 +1521,17 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
while (len > 0) {
int ret = 0;

/* do auto tuning */
if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
ssk->sk_sndbuf > READ_ONCE(sk->sk_sndbuf))
WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf);
/* the caller already invoked the packet scheduler,
* check for a different subflow usage only after
* spooling the first chunk of data
*/
xmit_ssk = first ? ssk : mptcp_subflow_get_send(mptcp_sk(sk));
if (!xmit_ssk)
goto out;
if (xmit_ssk != ssk) {
mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk));
goto out;
}

if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) {
__mptcp_update_wmem(sk);
@@ -1560,6 +1551,7 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
msk->tx_pending_data -= ret;
copied += ret;
len -= ret;
first = false;
}
WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
}
@@ -1579,6 +1571,15 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
}
}

static void mptcp_set_nospace(struct sock *sk)
{
/* enable autotune */
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

/* will be cleared on avail space */
set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags);
}

static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1680,7 +1681,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
continue;

wait_for_memory:
set_bit(MPTCP_NOSPACE, &msk->flags);
mptcp_set_nospace(sk);
mptcp_push_pending(sk, msg->msg_flags);
ret = sk_stream_wait_memory(sk, &timeo);
if (ret)
@@ -2116,21 +2117,15 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
struct mptcp_subflow_context *subflow)
{
bool dispose_socket = false;
struct socket *sock;

list_del(&subflow->node);

lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);

/* if we are invoked by the msk cleanup code, the subflow is
* already orphaned
*/
sock = ssk->sk_socket;
if (sock) {
dispose_socket = sock != sk->sk_socket;
if (ssk->sk_socket)
sock_orphan(ssk);
}

subflow->disposable = 1;

@@ -2148,8 +2143,6 @@ void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
__sock_put(ssk);
}
release_sock(ssk);
if (dispose_socket)
iput(SOCK_INODE(sock));

sock_put(ssk);
}
@@ -2536,6 +2529,12 @@ static void __mptcp_destroy_sock(struct sock *sk)

pr_debug("msk=%p", msk);

/* dispose the ancillatory tcp socket, if any */
if (msk->subflow) {
iput(SOCK_INODE(msk->subflow));
msk->subflow = NULL;
}

/* be sure to always acquire the join list lock, to sync vs
* mptcp_finish_join().
*/
@@ -2586,20 +2585,10 @@ static void mptcp_close(struct sock *sk, long timeout)
inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32;
list_for_each_entry(subflow, &mptcp_sk(sk)->conn_list, node) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
bool slow, dispose_socket;
struct socket *sock;
bool slow = lock_sock_fast(ssk);

slow = lock_sock_fast(ssk);
sock = ssk->sk_socket;
dispose_socket = sock && sock != sk->sk_socket;
sock_orphan(ssk);
unlock_sock_fast(ssk, slow);

/* for the outgoing subflows we additionally need to free
* the associated socket
*/
if (dispose_socket)
iput(SOCK_INODE(sock));
}
sock_orphan(sk);

@@ -2928,10 +2917,16 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)
if (!mptcp_send_head(sk))
return;

if (!sock_owned_by_user(sk))
__mptcp_subflow_push_pending(sk, ssk);
else
if (!sock_owned_by_user(sk)) {
struct sock *xmit_ssk = mptcp_subflow_get_send(mptcp_sk(sk));

if (xmit_ssk == ssk)
__mptcp_subflow_push_pending(sk, ssk);
else if (xmit_ssk)
mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk));
} else {
set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
}
}
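
/* An illustrative user-space model (not part of this commit) of the
 * three-way decision taken by __mptcp_check_push() above: push inline
 * when the scheduler picks the subflow that just freed space, hand the
 * work to another subflow's delegated action (drained by
 * mptcp_napi_poll() further below) when it picks a different one, or
 * defer everything to the release callback when userspace owns the msk
 * socket.  The enum and names are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

enum push_action {
	PUSH_INLINE,	/* scheduler picked the current subflow: send now */
	PUSH_DELEGATE,	/* another subflow must send: queue a delegated action */
	PUSH_DEFER,	/* msk owned by user: let the release callback handle it */
	PUSH_NONE,	/* no subflow can send right now */
};

static enum push_action check_push(bool owned_by_user, int current_ssk,
				   int scheduler_pick /* -1: none */)
{
	if (owned_by_user)
		return PUSH_DEFER;
	if (scheduler_pick < 0)
		return PUSH_NONE;
	return scheduler_pick == current_ssk ? PUSH_INLINE : PUSH_DELEGATE;
}

int main(void)
{
	printf("%d\n", check_push(false, 1, 1));	/* PUSH_INLINE */
	printf("%d\n", check_push(false, 1, 2));	/* PUSH_DELEGATE */
	printf("%d\n", check_push(true, 1, 2));		/* PUSH_DEFER */
	return 0;
}
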

#define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED)
@@ -2979,6 +2974,20 @@ static void mptcp_release_cb(struct sock *sk)
}
}

void mptcp_subflow_process_delegated(struct sock *ssk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct sock *sk = subflow->conn;

mptcp_data_lock(sk);
if (!sock_owned_by_user(sk))
__mptcp_subflow_push_pending(sk, ssk);
else
set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
mptcp_data_unlock(sk);
mptcp_subflow_delegated_done(subflow);
}

static int mptcp_hash(struct sock *sk)
{
/* should never be called,
@@ -3041,7 +3050,7 @@ void mptcp_finish_connect(struct sock *ssk)
mptcp_rcv_space_init(msk, ssk);
}

static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
void mptcp_sock_graft(struct sock *sk, struct socket *parent)
{
write_lock_bh(&sk->sk_callback_lock);
rcu_assign_pointer(sk->sk_wq, &parent->wq);
@@ -3284,6 +3293,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,

mptcp_copy_inaddrs(newsk, msk->first);
mptcp_rcv_space_init(msk, msk->first);
mptcp_propagate_sndbuf(newsk, msk->first);

/* set ssk->sk_socket of accept()ed flows to mptcp socket.
* This is needed so NOSPACE flag can be set from tcp stack.
@@ -3324,7 +3334,7 @@ static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
if (sk_stream_is_writeable(sk))
return EPOLLOUT | EPOLLWRNORM;

set_bit(MPTCP_NOSPACE, &msk->flags);
mptcp_set_nospace(sk);
smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
if (sk_stream_is_writeable(sk))
return EPOLLOUT | EPOLLWRNORM;
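
/* An illustrative user-space model (not part of this commit) of the
 * nospace handling: mptcp_set_nospace() raises SOCK_NOSPACE (so the TCP
 * write-space path fires) plus MPTCP_NOSPACE, and mptcp_check_writeable()
 * above re-tests writeability after the barrier so a wakeup racing with
 * the flag set is not lost.  C11 atomics stand in for the kernel bitops;
 * names are hypothetical.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define MODEL_NOSPACE (1u << 0)

struct model_msk {
	atomic_uint flags;
	atomic_int free_space;	/* bytes the write path may still queue */
};

/* poll() side: report writeable only when safe, otherwise arm NOSPACE */
static bool model_check_writeable(struct model_msk *msk)
{
	if (atomic_load(&msk->free_space) > 0)
		return true;

	atomic_fetch_or(&msk->flags, MODEL_NOSPACE);	/* full barrier */
	/* re-check: space may have been freed between the first test and
	 * the flag becoming visible to the write-space side
	 */
	return atomic_load(&msk->free_space) > 0;
}

/* write-space side: clear NOSPACE and wake writers once space appears */
static void model_write_space(struct model_msk *msk)
{
	unsigned int old = atomic_fetch_and(&msk->flags, ~MODEL_NOSPACE);

	if (old & MODEL_NOSPACE)
		printf("waking up writers\n");
}

int main(void)
{
	struct model_msk msk = { 0, 0 };

	printf("writeable: %d\n", model_check_writeable(&msk));
	atomic_store(&msk.free_space, 4096);
	model_write_space(&msk);
	printf("writeable: %d\n", model_check_writeable(&msk));
	return 0;
}
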
@@ -3388,13 +3398,58 @@ static struct inet_protosw mptcp_protosw = {
.flags = INET_PROTOSW_ICSK,
};

static int mptcp_napi_poll(struct napi_struct *napi, int budget)
{
struct mptcp_delegated_action *delegated;
struct mptcp_subflow_context *subflow;
int work_done = 0;

delegated = container_of(napi, struct mptcp_delegated_action, napi);
while ((subflow = mptcp_subflow_delegated_next(delegated)) != NULL) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

bh_lock_sock_nested(ssk);
if (!sock_owned_by_user(ssk) &&
mptcp_subflow_has_delegated_action(subflow))
mptcp_subflow_process_delegated(ssk);
/* ... elsewhere tcp_release_cb_override already processed
* the action or will do at next release_sock().
* In both case must dequeue the subflow here - on the same
* CPU that scheduled it.
*/
bh_unlock_sock(ssk);
sock_put(ssk);

if (++work_done == budget)
return budget;
}

/* always provide a 0 'work_done' argument, so that napi_complete_done
* will not try accessing the NULL napi->dev ptr
*/
napi_complete_done(napi, 0);
return work_done;
}
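
/* An illustrative single-threaded model (not part of this commit) of the
 * delegated-action mechanism drained by mptcp_napi_poll() above: subflows
 * needing a transmit attempt are queued at most once and later processed
 * with a NAPI-style budget, replacing the old workqueue usage.  No locking
 * or per-CPU handling here; names are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

struct model_subflow {
	struct model_subflow *next;
	bool queued;		/* mirrors the "already scheduled" test */
	int id;
};

static struct model_subflow *delegated_head;

/* producer side: schedule a transmit attempt on @sf, at most once */
static void model_delegate(struct model_subflow *sf)
{
	if (sf->queued)
		return;
	sf->queued = true;
	sf->next = delegated_head;
	delegated_head = sf;
}

/* consumer side: budget-limited drain, as a NAPI poll handler would do */
static int model_poll(int budget)
{
	int work_done = 0;

	while (delegated_head && work_done < budget) {
		struct model_subflow *sf = delegated_head;

		delegated_head = sf->next;
		sf->queued = false;
		printf("pushing pending data on subflow %d\n", sf->id);
		work_done++;
	}
	return work_done;
}

int main(void)
{
	struct model_subflow a = { .id = 1 }, b = { .id = 2 };

	model_delegate(&a);
	model_delegate(&b);
	model_delegate(&a);	/* no-op: already queued */
	printf("processed %d actions\n", model_poll(64));
	return 0;
}
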

void __init mptcp_proto_init(void)
{
struct mptcp_delegated_action *delegated;
int cpu;

mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;

if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
panic("Failed to allocate MPTCP pcpu counter\n");

init_dummy_netdev(&mptcp_napi_dev);
for_each_possible_cpu(cpu) {
delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu);
INIT_LIST_HEAD(&delegated->head);
netif_tx_napi_add(&mptcp_napi_dev, &delegated->napi, mptcp_napi_poll,
NAPI_POLL_WEIGHT);
napi_enable(&delegated->napi);
}

mptcp_subflow_init();
mptcp_pm_init();
mptcp_token_init();
