Skip to content

Commit

Permalink
Merge branch 'rds'
Browse files Browse the repository at this point in the history
Sowmini Varadhan says:

====================
net/rds: RDS-TCP robustness fixes

This patch-set contains bug fixes for state-recovery at the RDS
layer when the underlying transport is TCP and the TCP state at one
of the endpoints is reset

V2 changes: DaveM comments to reduce memory footprint, follow
            NFS/RPC model where possible. Added test-case #3

Without the changes in this set, when one of the endpoints is reset,
the existing code does not correctly clean up RDS socket state for stale
connections, resulting in some unstable, timing-dependant behavior on
the wire, including an infinite exchange of 3WHs back-and-forth, and a
resulting potential to never converge RDS state.

Test cases used to verify the changes in this set are:

1. Start rds client/server applications on two participating nodes,
   node1 and node2. After at least one packet has been sent (to establish
   the TCP connection), restart the rds_tcp module on the client, and
   now resend packets. Tcpdump should show server sending a FIN for the
   "old" client port, and clean connection establishment/exchange for
   the new client port.

2. At the end of step 1, restart rds srever on node2, and start client on
   node1, make sure using tcpdump, 'netstat -an|grep 16385' that
   packets flow correctly.

3. start RDS client/server application on two participating nodes, and
   repeat steps 1 and 2, but this time, simulate node failure by doing
   "ifconfig <intf> down", so no FIN is sent.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed May 9, 2015
2 parents e16e888 + c82ac7e commit d3196a2
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 2 deletions.
17 changes: 15 additions & 2 deletions net/rds/connection.c
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,10 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
struct rds_transport *loop_trans;
unsigned long flags;
int ret;
struct rds_transport *otrans = trans;

if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
goto new_conn;
rcu_read_lock();
conn = rds_conn_lookup(head, laddr, faddr, trans);
if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
Expand All @@ -142,6 +145,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
if (conn)
goto out;

new_conn:
conn = kmem_cache_zalloc(rds_conn_slab, gfp);
if (!conn) {
conn = ERR_PTR(-ENOMEM);
Expand Down Expand Up @@ -230,13 +234,22 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
/* Creating normal conn */
struct rds_connection *found;

found = rds_conn_lookup(head, laddr, faddr, trans);
if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
found = NULL;
else
found = rds_conn_lookup(head, laddr, faddr, trans);
if (found) {
trans->conn_free(conn->c_transport_data);
kmem_cache_free(rds_conn_slab, conn);
conn = found;
} else {
hlist_add_head_rcu(&conn->c_hash_node, head);
if ((is_outgoing && otrans->t_type == RDS_TRANS_TCP) ||
(otrans->t_type != RDS_TRANS_TCP)) {
/* Only the active side should be added to
* reconnect list for TCP.
*/
hlist_add_head_rcu(&conn->c_hash_node, head);
}
rds_cong_add_conn(conn);
rds_conn_count++;
}
Expand Down
1 change: 1 addition & 0 deletions net/rds/tcp_connect.c
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ void rds_tcp_state_change(struct sock *sk)
case TCP_ESTABLISHED:
rds_connect_complete(conn);
break;
case TCP_CLOSE_WAIT:
case TCP_CLOSE:
rds_conn_drop(conn);
default:
Expand Down
46 changes: 46 additions & 0 deletions net/rds/tcp_listen.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,45 @@ static void rds_tcp_accept_worker(struct work_struct *work);
static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker);
static struct socket *rds_tcp_listen_sock;

static int rds_tcp_keepalive(struct socket *sock)
{
/* values below based on xs_udp_default_timeout */
int keepidle = 5; /* send a probe 'keepidle' secs after last data */
int keepcnt = 5; /* number of unack'ed probes before declaring dead */
int keepalive = 1;
int ret = 0;

ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
(char *)&keepalive, sizeof(keepalive));
if (ret < 0)
goto bail;

ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT,
(char *)&keepcnt, sizeof(keepcnt));
if (ret < 0)
goto bail;

ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE,
(char *)&keepidle, sizeof(keepidle));
if (ret < 0)
goto bail;

/* KEEPINTVL is the interval between successive probes. We follow
* the model in xs_tcp_finish_connecting() and re-use keepidle.
*/
ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL,
(char *)&keepidle, sizeof(keepidle));
bail:
return ret;
}

static int rds_tcp_accept_one(struct socket *sock)
{
struct socket *new_sock = NULL;
struct rds_connection *conn;
int ret;
struct inet_sock *inet;
struct rds_tcp_connection *rs_tcp;

ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
sock->sk->sk_protocol, &new_sock);
Expand All @@ -63,6 +96,10 @@ static int rds_tcp_accept_one(struct socket *sock)
if (ret < 0)
goto out;

ret = rds_tcp_keepalive(new_sock);
if (ret < 0)
goto out;

rds_tcp_tune(new_sock);

inet = inet_sk(new_sock->sk);
Expand All @@ -77,6 +114,15 @@ static int rds_tcp_accept_one(struct socket *sock)
ret = PTR_ERR(conn);
goto out;
}
/* An incoming SYN request came in, and TCP just accepted it.
* We always create a new conn for listen side of TCP, and do not
* add it to the c_hash_list.
*
* If the client reboots, this conn will need to be cleaned up.
* rds_tcp_state_change() will do that cleanup
*/
rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data;
WARN_ON(!rs_tcp || rs_tcp->t_sock);

/*
* see the comment above rds_queue_delayed_reconnect()
Expand Down

0 comments on commit d3196a2

Please sign in to comment.