From 2d746c93b6e55d34a98c8983b30d991707a2059b Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 30 Nov 2017 11:11:27 -0800 Subject: [PATCH 1/3] rds: tcp: remove redundant function rds_tcp_conn_paths_destroy() A side-effect of Commit c14b0366813a ("rds: tcp: set linger to 1 when unloading a rds-tcp") is that we always send a RST on the tcp connection for rds_conn_destroy(), so rds_tcp_conn_paths_destroy() is not needed any more and is removed in this patch. Signed-off-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/tcp.c | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 6b7ee71f40c63..222cc530e5b5d 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -495,27 +495,6 @@ static struct pernet_operations rds_tcp_net_ops = { .size = sizeof(struct rds_tcp_net), }; -/* explicitly send a RST on each socket, thereby releasing any socket refcnts - * that may otherwise hold up netns deletion. - */ -static void rds_tcp_conn_paths_destroy(struct rds_connection *conn) -{ - struct rds_conn_path *cp; - struct rds_tcp_connection *tc; - int i; - struct sock *sk; - - for (i = 0; i < RDS_MPATH_WORKERS; i++) { - cp = &conn->c_path[i]; - tc = cp->cp_transport_data; - if (!tc->t_sock) - continue; - sk = tc->t_sock->sk; - sk->sk_prot->disconnect(sk, 0); - tcp_done(sk); - } -} - static void rds_tcp_kill_sock(struct net *net) { struct rds_tcp_connection *tc, *_tc; @@ -535,10 +514,8 @@ static void rds_tcp_kill_sock(struct net *net) list_move_tail(&tc->t_tcp_node, &tmp_list); } spin_unlock_irq(&rds_tcp_conn_lock); - list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) { - rds_tcp_conn_paths_destroy(tc->t_cpath->cp_conn); + list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) rds_conn_destroy(tc->t_cpath->cp_conn); - } } void *rds_tcp_listen_sock_def_readable(struct net *net) From 681648e67d43cf269c5590ecf021ed481f4551fc Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 30 Nov 2017 11:11:28 -0800 Subject: [PATCH 2/3] rds: tcp: correctly sequence cleanup on netns deletion. Commit 8edc3affc077 ("rds: tcp: Take explicit refcounts on struct net") introduces a regression in rds-tcp netns cleanup. The cleanup_net(), (and thus rds_tcp_dev_event notification) is only called from put_net() when all netns refcounts go to 0, but this cannot happen if the rds_connection itself is holding a c_net ref that it expects to release in rds_tcp_kill_sock. Instead, the rds_tcp_kill_sock callback should make sure to tear down state carefully, ensuring that the socket teardown is only done after all data-structures and workqs that depend on it are quiesced. The original motivation for commit 8edc3affc077 ("rds: tcp: Take explicit refcounts on struct net") was to resolve a race condition reported by syzkaller where workqs for tx/rx/connect were triggered after the namespace was deleted. Those worker threads should have been cancelled/flushed before socket tear-down and indeed, rds_conn_path_destroy() does try to sequence this by doing /* cancel cp_send_w */ /* cancel cp_recv_w */ /* flush cp_down_w */ /* free data structures */ Here the "flush cp_down_w" will trigger rds_conn_shutdown and thus invoke rds_tcp_conn_path_shutdown() to close the tcp socket, so that we ought to have satisfied the requirement that "socket-close is done after all other dependent state is quiesced". However, rds_conn_shutdown has a bug in that it *always* triggers the reconnect workq (and if connection is successful, we always restart tx/rx workqs so with the right timing, we risk the race conditions reported by syzkaller). Netns deletion is like module teardown- no need to restart a reconnect in this case. We can use the c_destroy_in_prog bit to avoid restarting the reconnect. Fixes: 8edc3affc077 ("rds: tcp: Take explicit refcounts on struct net") Signed-off-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/connection.c | 3 ++- net/rds/rds.h | 6 +++--- net/rds/tcp.c | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/net/rds/connection.c b/net/rds/connection.c index 7ee2d5d68b781..9efc82c665b5a 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -366,6 +366,8 @@ void rds_conn_shutdown(struct rds_conn_path *cp) * to the conn hash, so we never trigger a reconnect on this * conn - the reconnect is always triggered by the active peer. */ cancel_delayed_work_sync(&cp->cp_conn_w); + if (conn->c_destroy_in_prog) + return; rcu_read_lock(); if (!hlist_unhashed(&conn->c_hash_node)) { rcu_read_unlock(); @@ -445,7 +447,6 @@ void rds_conn_destroy(struct rds_connection *conn) */ rds_cong_remove_conn(conn); - put_net(conn->c_net); kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); diff --git a/net/rds/rds.h b/net/rds/rds.h index c349c71babff4..d09f6c1facb4a 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -150,7 +150,7 @@ struct rds_connection { /* Protocol version */ unsigned int c_version; - struct net *c_net; + possible_net_t c_net; struct list_head c_map_item; unsigned long c_map_queued; @@ -165,13 +165,13 @@ struct rds_connection { static inline struct net *rds_conn_net(struct rds_connection *conn) { - return conn->c_net; + return read_pnet(&conn->c_net); } static inline void rds_conn_net_set(struct rds_connection *conn, struct net *net) { - conn->c_net = get_net(net); + write_pnet(&conn->c_net, net); } #define RDS_FLAG_CONG_BITMAP 0x01 diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 222cc530e5b5d..f580f72ae69ec 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -506,7 +506,7 @@ static void rds_tcp_kill_sock(struct net *net) rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w); spin_lock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { - struct net *c_net = tc->t_cpath->cp_conn->c_net; + struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); if (net != c_net || !tc->t_sock) continue; @@ -563,7 +563,7 @@ static void rds_tcp_sysctl_reset(struct net *net) spin_lock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { - struct net *c_net = tc->t_cpath->cp_conn->c_net; + struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); if (net != c_net || !tc->t_sock) continue; From f10b4cff98c6977668434fbf5dd58695eeca2897 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 30 Nov 2017 11:11:29 -0800 Subject: [PATCH 3/3] rds: tcp: atomically purge entries from rds_tcp_conn_list during netns delete The rds_tcp_kill_sock() function parses the rds_tcp_conn_list to find the rds_connection entries marked for deletion as part of the netns deletion under the protection of the rds_tcp_conn_lock. Since the rds_tcp_conn_list tracks rds_tcp_connections (which have a 1:1 mapping with rds_conn_path), multiple tc entries in the rds_tcp_conn_list will map to a single rds_connection, and will be deleted as part of the rds_conn_destroy() operation that is done outside the rds_tcp_conn_lock. The rds_tcp_conn_list traversal done under the protection of rds_tcp_conn_lock should not leave any doomed tc entries in the list after the rds_tcp_conn_lock is released, else another concurrently executiong netns delete (for a differnt netns) thread may trip on these entries. Reported-by: syzbot Signed-off-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/tcp.c | 9 +++++++-- net/rds/tcp.h | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/net/rds/tcp.c b/net/rds/tcp.c index f580f72ae69ec..39f502d47969d 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -306,7 +306,8 @@ static void rds_tcp_conn_free(void *arg) rdsdebug("freeing tc %p\n", tc); spin_lock_irqsave(&rds_tcp_conn_lock, flags); - list_del(&tc->t_tcp_node); + if (!tc->t_tcp_node_detached) + list_del(&tc->t_tcp_node); spin_unlock_irqrestore(&rds_tcp_conn_lock, flags); kmem_cache_free(rds_tcp_conn_slab, tc); @@ -510,8 +511,12 @@ static void rds_tcp_kill_sock(struct net *net) if (net != c_net || !tc->t_sock) continue; - if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn)) + if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn)) { list_move_tail(&tc->t_tcp_node, &tmp_list); + } else { + list_del(&tc->t_tcp_node); + tc->t_tcp_node_detached = true; + } } spin_unlock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 1aafbf7c30114..e7858ee8ed8b7 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -12,6 +12,7 @@ struct rds_tcp_incoming { struct rds_tcp_connection { struct list_head t_tcp_node; + bool t_tcp_node_detached; struct rds_conn_path *t_cpath; /* t_conn_path_lock synchronizes the connection establishment between * rds_tcp_accept_one and rds_tcp_conn_path_connect