diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 4470501374bf9..20aa4175b9f8e 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -391,6 +391,9 @@ static int smc_connect_rdma(struct smc_sock *smc) sock_hold(&smc->sk); /* sock put in passive closing */ + if (smc->use_fallback) + goto out_connected; + if (!tcp_sk(smc->clcsock->sk)->syn_smc) { /* peer has not signalled SMC-capability */ smc->use_fallback = true; @@ -790,6 +793,9 @@ static void smc_listen_work(struct work_struct *work) int rc = 0; u8 ibport; + if (new_smc->use_fallback) + goto out_connected; + /* check if peer is smc capable */ if (!tcp_sk(newclcsock->sk)->syn_smc) { new_smc->use_fallback = true; @@ -968,7 +974,7 @@ static void smc_tcp_listen_work(struct work_struct *work) continue; new_smc->listen_smc = lsmc; - new_smc->use_fallback = false; /* assume rdma capability first*/ + new_smc->use_fallback = lsmc->use_fallback; sock_hold(lsk); /* sock_put in smc_listen_work */ INIT_WORK(&new_smc->smc_listen_work, smc_listen_work); smc_copy_sock_settings_to_smc(new_smc); @@ -1004,7 +1010,8 @@ static int smc_listen(struct socket *sock, int backlog) * them to the clc socket -- copy smc socket options to clc socket */ smc_copy_sock_settings_to_clc(smc); - tcp_sk(smc->clcsock->sk)->syn_smc = 1; + if (!smc->use_fallback) + tcp_sk(smc->clcsock->sk)->syn_smc = 1; rc = kernel_listen(smc->clcsock, backlog); if (rc) @@ -1037,6 +1044,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, if (lsmc->sk.sk_state != SMC_LISTEN) { rc = -EINVAL; + release_sock(sk); goto out; } @@ -1064,9 +1072,29 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, if (!rc) rc = sock_error(nsk); + release_sock(sk); + if (rc) + goto out; + + if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) { + /* wait till data arrives on the socket */ + timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept * + MSEC_PER_SEC); + if (smc_sk(nsk)->use_fallback) { + struct sock *clcsk = smc_sk(nsk)->clcsock->sk; + + lock_sock(clcsk); + if (skb_queue_empty(&clcsk->sk_receive_queue)) + sk_wait_data(clcsk, &timeo, NULL); + release_sock(clcsk); + } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) { + lock_sock(nsk); + smc_rx_wait_data(smc_sk(nsk), &timeo); + release_sock(nsk); + } + } out: - release_sock(sk); sock_put(sk); /* sock_hold above */ return rc; } @@ -1097,6 +1125,16 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) (sk->sk_state != SMC_APPCLOSEWAIT1) && (sk->sk_state != SMC_INIT)) goto out; + + if (msg->msg_flags & MSG_FASTOPEN) { + if (sk->sk_state == SMC_INIT) { + smc->use_fallback = true; + } else { + rc = -EINVAL; + goto out; + } + } + if (smc->use_fallback) rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len); else @@ -1274,14 +1312,64 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; struct smc_sock *smc; + int val, rc; smc = smc_sk(sk); /* generic setsockopts reaching us here always apply to the * CLC socket */ - return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname, - optval, optlen); + rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname, + optval, optlen); + if (smc->clcsock->sk->sk_err) { + sk->sk_err = smc->clcsock->sk->sk_err; + sk->sk_error_report(sk); + } + if (rc) + return rc; + + if (optlen < sizeof(int)) + return rc; + get_user(val, (int __user *)optval); + + lock_sock(sk); + switch (optname) { + case TCP_ULP: + case TCP_FASTOPEN: + case TCP_FASTOPEN_CONNECT: + case TCP_FASTOPEN_KEY: + case TCP_FASTOPEN_NO_COOKIE: + /* option not supported by SMC */ + if (sk->sk_state == SMC_INIT) { + smc->use_fallback = true; + } else { + if (!smc->use_fallback) + rc = -EINVAL; + } + break; + case TCP_NODELAY: + if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) { + if (val) + mod_delayed_work(system_wq, &smc->conn.tx_work, + 0); + } + break; + case TCP_CORK: + if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) { + if (!val) + mod_delayed_work(system_wq, &smc->conn.tx_work, + 0); + } + break; + case TCP_DEFER_ACCEPT: + smc->sockopt_defer_accept = val; + break; + default: + break; + } + release_sock(sk); + + return rc; } static int smc_getsockopt(struct socket *sock, int level, int optname, diff --git a/net/smc/smc.h b/net/smc/smc.h index e4829a2f46baf..2405e889b93d1 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -180,6 +180,10 @@ struct smc_sock { /* smc sock container */ struct list_head accept_q; /* sockets to be accepted */ spinlock_t accept_q_lock; /* protects accept_q */ bool use_fallback; /* fallback to tcp */ + int sockopt_defer_accept; + /* sockopt TCP_DEFER_ACCEPT + * value + */ u8 wait_close_tx_prepared : 1; /* shutdown wr or close * started, waiting for unsent diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index b42395d24cba5..42ad57365eca5 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -82,7 +82,7 @@ static inline void smc_cdc_add_pending_send(struct smc_connection *conn, sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE, "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)"); BUILD_BUG_ON_MSG( - offsetof(struct smc_cdc_msg, reserved) > SMC_WR_TX_SIZE, + sizeof(struct smc_cdc_msg) != SMC_WR_TX_SIZE, "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()"); BUILD_BUG_ON_MSG( sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE, diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index ab240b37ad114..d2012fd221001 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -48,7 +48,7 @@ struct smc_cdc_msg { struct smc_cdc_producer_flags prod_flags; struct smc_cdc_conn_state_flags conn_state_flags; u8 reserved[18]; -} __aligned(8); +} __packed; /* format defined in RFC7609 */ static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn) { diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index eff4e0d0bb310..af851d8df1f8e 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -51,7 +51,7 @@ static void smc_rx_data_ready(struct sock *sk) * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown. * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted). */ -static int smc_rx_wait_data(struct smc_sock *smc, long *timeo) +int smc_rx_wait_data(struct smc_sock *smc, long *timeo) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct smc_connection *conn = &smc->conn; diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h index 3a32b59bf06c5..0b75a6b470e61 100644 --- a/net/smc/smc_rx.h +++ b/net/smc/smc_rx.h @@ -20,5 +20,6 @@ void smc_rx_init(struct smc_sock *smc); int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, int flags); +int smc_rx_wait_data(struct smc_sock *smc, long *timeo); #endif /* SMC_RX_H */ diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 72f004c9c9b13..58dfe0bd9d607 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -19,6 +19,7 @@ #include #include +#include #include "smc.h" #include "smc_wr.h" @@ -26,6 +27,7 @@ #include "smc_tx.h" #define SMC_TX_WORK_DELAY HZ +#define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */ /***************************** sndbuf producer *******************************/ @@ -115,6 +117,13 @@ static int smc_tx_wait_memory(struct smc_sock *smc, int flags) return rc; } +static bool smc_tx_is_corked(struct smc_sock *smc) +{ + struct tcp_sock *tp = tcp_sk(smc->clcsock->sk); + + return (tp->nonagle & TCP_NAGLE_CORK) ? true : false; +} + /* sndbuf producer: main API called by socket layer. * called under sock lock. */ @@ -209,7 +218,16 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) /* since we just produced more new data into sndbuf, * trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ - smc_tx_sndbuf_nonempty(conn); + if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) && + (atomic_read(&conn->sndbuf_space) > + (conn->sndbuf_size >> 1))) + /* for a corked socket defer the RDMA writes if there + * is still sufficient sndbuf_space available + */ + schedule_delayed_work(&conn->tx_work, + SMC_TX_CORK_DELAY); + else + smc_tx_sndbuf_nonempty(conn); } /* while (msg_data_left(msg)) */ return send_done; @@ -409,8 +427,8 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) } rc = 0; if (conn->alert_token_local) /* connection healthy */ - schedule_delayed_work(&conn->tx_work, - SMC_TX_WORK_DELAY); + mod_delayed_work(system_wq, &conn->tx_work, + SMC_TX_WORK_DELAY); } goto out_unlock; }