Skip to content

Commit

Permalink
Merge branch 'rds-tcp-netns'
Browse files Browse the repository at this point in the history
Sowmini Varadhan says:

====================
RDS-TCP: Network namespace support

This patch series contains the set of changes to correctly set up
the infra for PF_RDS sockets that use TCP as the transport in multiple
network namespaces.

Patch 1 in the series is the minimal set of changes to allow
a single instance of RDS-TCP to run in any (i.e init_net or other) net
namespace.  The changes in this patch set ensure that the execution of
'modprobe [-r] rds_tcp' sets up the kernel TCP sockets
relative to the current netns, so that RDS applications can send/recv
packets from that netns, and the netns can later be deleted cleanly.

Patch 2 of the series further allows multiple RDS-TCP instances,
one per network namespace. The changes in this patch allow dynamic
creation/tear-down of RDS-TCP client and server sockets across all
current and future namespaces.

v2 changes from RFC sent out earlier:
    David Ahern comments in patch 1, net_device notifier in patch 2,
    patch 3 broken off and submitted separately.
v3: Cong Wang review comments.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Aug 7, 2015
2 parents 1ebd08a + 467fa15 commit e03c512
Show file tree
Hide file tree
Showing 13 changed files with 214 additions and 70 deletions.
3 changes: 2 additions & 1 deletion net/rds/bind.c
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
ret = 0;
goto out;
}
trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
trans = rds_trans_get_preferred(sock_net(sock->sk),
sin->sin_addr.s_addr);
if (!trans) {
ret = -EADDRNOTAVAIL;
rds_remove_bound(rs);
Expand Down
16 changes: 10 additions & 6 deletions net/rds/connection.c
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,8 @@ static void rds_conn_reset(struct rds_connection *conn)
* For now they are not garbage collected once they're created. They
* are torn down as the module is removed, if ever.
*/
static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
static struct rds_connection *__rds_conn_create(struct net *net,
__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp,
int is_outgoing)
{
Expand Down Expand Up @@ -157,6 +158,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
conn->c_faddr = faddr;
spin_lock_init(&conn->c_lock);
conn->c_next_tx_seq = 1;
rds_conn_net_set(conn, net);

init_waitqueue_head(&conn->c_waitq);
INIT_LIST_HEAD(&conn->c_send_queue);
Expand All @@ -174,7 +176,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
* can bind to the destination address then we'd rather the messages
* flow through loopback rather than either transport.
*/
loop_trans = rds_trans_get_preferred(faddr);
loop_trans = rds_trans_get_preferred(net, faddr);
if (loop_trans) {
rds_trans_put(loop_trans);
conn->c_loopback = 1;
Expand Down Expand Up @@ -260,17 +262,19 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
return conn;
}

struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
struct rds_connection *rds_conn_create(struct net *net,
__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp)
{
return __rds_conn_create(laddr, faddr, trans, gfp, 0);
return __rds_conn_create(net, laddr, faddr, trans, gfp, 0);
}
EXPORT_SYMBOL_GPL(rds_conn_create);

struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
struct rds_connection *rds_conn_create_outgoing(struct net *net,
__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp)
{
return __rds_conn_create(laddr, faddr, trans, gfp, 1);
return __rds_conn_create(net, laddr, faddr, trans, gfp, 1);
}
EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);

Expand Down
2 changes: 1 addition & 1 deletion net/rds/ib.c
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,7 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len,
* allowed to influence which paths have priority. We could call userspace
* asserting this policy "routing".
*/
static int rds_ib_laddr_check(__be32 addr)
static int rds_ib_laddr_check(struct net *net, __be32 addr)
{
int ret;
struct rdma_cm_id *cm_id;
Expand Down
5 changes: 3 additions & 2 deletions net/rds/ib_cm.c
Original file line number Diff line number Diff line change
Expand Up @@ -448,8 +448,9 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
(unsigned long long)be64_to_cpu(lguid),
(unsigned long long)be64_to_cpu(fguid));

conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport,
GFP_KERNEL);
/* RDS/IB is not currently netns aware, thus init_net */
conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr,
&rds_ib_transport, GFP_KERNEL);
if (IS_ERR(conn)) {
rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
conn = NULL;
Expand Down
2 changes: 1 addition & 1 deletion net/rds/iw.c
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ static void rds_iw_ic_info(struct socket *sock, unsigned int len,
* allowed to influence which paths have priority. We could call userspace
* asserting this policy "routing".
*/
static int rds_iw_laddr_check(__be32 addr)
static int rds_iw_laddr_check(struct net *net, __be32 addr)
{
int ret;
struct rdma_cm_id *cm_id;
Expand Down
5 changes: 3 additions & 2 deletions net/rds/iw_cm.c
Original file line number Diff line number Diff line change
Expand Up @@ -398,8 +398,9 @@ int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
&dp->dp_saddr, &dp->dp_daddr,
RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version));

conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_iw_transport,
GFP_KERNEL);
/* RDS/IW is not currently netns aware, thus init_net */
conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr,
&rds_iw_transport, GFP_KERNEL);
if (IS_ERR(conn)) {
rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
conn = NULL;
Expand Down
23 changes: 19 additions & 4 deletions net/rds/rds.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,21 @@ struct rds_connection {

/* Protocol version */
unsigned int c_version;
possible_net_t c_net;
};

/* Return the network namespace stored in @conn's c_net field. */
static inline struct net *rds_conn_net(struct rds_connection *conn)
{
	struct net *net = read_pnet(&conn->c_net);

	return net;
}

static inline
void rds_conn_net_set(struct rds_connection *conn, struct net *net)
{
write_pnet(&conn->c_net, net);
}

#define RDS_FLAG_CONG_BITMAP 0x01
#define RDS_FLAG_ACK_REQUIRED 0x02
#define RDS_FLAG_RETRANSMITTED 0x04
Expand Down Expand Up @@ -417,7 +430,7 @@ struct rds_transport {
unsigned int t_prefer_loopback:1;
unsigned int t_type;

int (*laddr_check)(__be32 addr);
int (*laddr_check)(struct net *net, __be32 addr);
int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
void (*conn_free)(void *data);
int (*conn_connect)(struct rds_connection *conn);
Expand Down Expand Up @@ -608,9 +621,11 @@ struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
/* conn.c */
int rds_conn_init(void);
void rds_conn_exit(void);
struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
struct rds_connection *rds_conn_create(struct net *net,
__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp);
struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
struct rds_connection *rds_conn_create_outgoing(struct net *net,
__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp);
void rds_conn_shutdown(struct rds_connection *conn);
void rds_conn_destroy(struct rds_connection *conn);
Expand Down Expand Up @@ -795,7 +810,7 @@ void rds_connect_complete(struct rds_connection *conn);
/* transport.c */
int rds_trans_register(struct rds_transport *trans);
void rds_trans_unregister(struct rds_transport *trans);
struct rds_transport *rds_trans_get_preferred(__be32 addr);
struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr);
void rds_trans_put(struct rds_transport *trans);
unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail);
Expand Down
3 changes: 2 additions & 1 deletion net/rds/send.c
Original file line number Diff line number Diff line change
Expand Up @@ -1023,7 +1023,8 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
conn = rs->rs_conn;
else {
conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr,
conn = rds_conn_create_outgoing(sock_net(sock->sk),
rs->rs_bound_addr, daddr,
rs->rs_transport,
sock->sk->sk_allocation);
if (IS_ERR(conn)) {
Expand Down
165 changes: 147 additions & 18 deletions net/rds/tcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@
#include <linux/in.h>
#include <linux/module.h>
#include <net/tcp.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/tcp.h>

#include "rds.h"
#include "tcp.h"
Expand Down Expand Up @@ -189,9 +192,9 @@ static void rds_tcp_tc_info(struct socket *sock, unsigned int len,
spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
}

static int rds_tcp_laddr_check(__be32 addr)
static int rds_tcp_laddr_check(struct net *net, __be32 addr)
{
if (inet_addr_type(&init_net, addr) == RTN_LOCAL)
if (inet_addr_type(net, addr) == RTN_LOCAL)
return 0;
return -EADDRNOTAVAIL;
}
Expand Down Expand Up @@ -250,16 +253,7 @@ static void rds_tcp_destroy_conns(void)
}
}

static void rds_tcp_exit(void)
{
rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
rds_tcp_listen_stop();
rds_tcp_destroy_conns();
rds_trans_unregister(&rds_tcp_transport);
rds_tcp_recv_exit();
kmem_cache_destroy(rds_tcp_conn_slab);
}
module_exit(rds_tcp_exit);
static void rds_tcp_exit(void);

struct rds_transport rds_tcp_transport = {
.laddr_check = rds_tcp_laddr_check,
Expand All @@ -281,6 +275,136 @@ struct rds_transport rds_tcp_transport = {
.t_prefer_loopback = 1,
};

/* Id handed to the pernet core through rds_tcp_net_ops.id; used with
 * net_generic() to look up a namespace's struct rds_tcp_net.
 */
static int rds_tcp_netid;

/* per-network namespace private data for this module */
struct rds_tcp_net {
/* listen socket of this namespace; set by rds_tcp_init_net() and reset to
 * NULL once the socket has been stopped
 */
struct socket *rds_tcp_listen_sock;
/* work item queued by rds_tcp_accept_work(); runs rds_tcp_accept_worker() */
struct work_struct rds_tcp_accept_w;
};

/* Work handler for rds_tcp_accept_w.
 *
 * Calls rds_tcp_accept_one() on the owning namespace's listen socket for
 * as long as it returns 0, yielding the CPU between calls.
 */
static void rds_tcp_accept_worker(struct work_struct *work)
{
	struct rds_tcp_net *rtn;

	rtn = container_of(work, struct rds_tcp_net, rds_tcp_accept_w);

	for (;;) {
		if (rds_tcp_accept_one(rtn->rds_tcp_listen_sock) != 0)
			break;
		cond_resched();
	}
}

/* Queue, on rds_wq, the accept work of the network namespace @sk belongs to. */
void rds_tcp_accept_work(struct sock *sk)
{
	struct rds_tcp_net *rtn;

	rtn = net_generic(sock_net(sk), rds_tcp_netid);
	queue_work(rds_wq, &rtn->rds_tcp_accept_w);
}

/* Per-namespace init hook (rds_tcp_net_ops.init): create the RDS-TCP
 * listen socket for @net and prepare the accept work item.
 *
 * Returns 0 on success, or -EAFNOSUPPORT if the listen socket could not
 * be set up.
 */
static __net_init int rds_tcp_init_net(struct net *net)
{
	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);

	/* Initialise the accept work before the listen socket exists.
	 * rds_tcp_accept_work() queues this work for any sock in the netns;
	 * its caller is outside this file, but presumably it can fire as
	 * soon as the socket is listening, so the work_struct must already
	 * be valid by then.
	 */
	INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker);

	rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net);
	if (!rtn->rds_tcp_listen_sock) {
		pr_warn("could not set up listen sock\n");
		return -EAFNOSUPPORT;
	}
	return 0;
}

/* Per-namespace exit hook (rds_tcp_net_ops.exit).
 *
 * Two paths lead here:
 *  - netns deletion: the rds_tcp_kill_sock() device notifier has already
 *    stopped the listen socket and cleared the pointer, so nothing is
 *    left to do;
 *  - module unload, i.e. rds_tcp_exit() -> unregister_pernet_subsys():
 *    the listen socket is still present and must be cleaned up here.
 */
static void __net_exit rds_tcp_exit_net(struct net *net)
{
	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
	struct socket *lsock = rtn->rds_tcp_listen_sock;

	if (!lsock)
		return;

	rds_tcp_listen_stop(lsock);
	rtn->rds_tcp_listen_sock = NULL;
	flush_work(&rtn->rds_tcp_accept_w);
}

/* Per-network-namespace hooks for rds-tcp. Registered in rds_tcp_init()
 * and unregistered in rds_tcp_exit() via (un)register_pernet_subsys().
 * .id/.size describe the per-namespace struct rds_tcp_net that the
 * init/exit hooks fetch with net_generic().
 */
static struct pernet_operations rds_tcp_net_ops = {
.init = rds_tcp_init_net,
.exit = rds_tcp_exit_net,
.id = &rds_tcp_netid,
.size = sizeof(struct rds_tcp_net),
};

/* Tear down all rds-tcp sockets that belong to @net.
 *
 * Called from rds_tcp_dev_event() when the namespace's loopback device
 * reports NETDEV_UNREGISTER_FINAL. Stops the listen socket, then
 * disconnects and destroys every connection on rds_tcp_conn_list whose
 * namespace is @net and that currently has a TCP socket.
 */
static void rds_tcp_kill_sock(struct net *net)
{
struct rds_tcp_connection *tc, *_tc;
struct sock *sk;
LIST_HEAD(tmp_list);
struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);

/* Stop listening first and clear the pointer so rds_tcp_exit_net()
 * sees there is nothing left to stop; then wait for the accept work.
 */
rds_tcp_listen_stop(rtn->rds_tcp_listen_sock);
rtn->rds_tcp_listen_sock = NULL;
flush_work(&rtn->rds_tcp_accept_w);
/* Under the list lock, only move matching entries onto a private list;
 * the actual teardown happens after the lock is dropped.
 */
spin_lock_irq(&rds_tcp_conn_lock);
list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
struct net *c_net = read_pnet(&tc->conn->c_net);

/* skip connections of other namespaces and those without a socket */
if (net != c_net || !tc->t_sock)
continue;
list_move_tail(&tc->t_tcp_node, &tmp_list);
}
spin_unlock_irq(&rds_tcp_conn_lock);
/* Disconnect each collected TCP socket, then destroy the RDS
 * connection(s) that were using it.
 */
list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) {
sk = tc->t_sock->sk;
sk->sk_prot->disconnect(sk, 0);
tcp_done(sk);
/* the passive peer connection, if any, is destroyed before conn itself */
if (tc->conn->c_passive)
rds_conn_destroy(tc->conn->c_passive);
rds_conn_destroy(tc->conn);
}
}

/* Netdevice notifier callback for rds-tcp.
 *
 * rds-tcp registers as a pernet subsys, so its ->exit hook is only
 * invoked after network activity has quiesced. All sockets must be
 * cleaned up for that to happen, so the unregistration of the
 * per-namespace loopback device is used as the trigger to start the
 * cleanup.
 *
 * Always returns NOTIFY_DONE.
 */
static int rds_tcp_dev_event(struct notifier_block *this,
			     unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event != NETDEV_UNREGISTER_FINAL)
		return NOTIFY_DONE;
	if (dev->ifindex != LOOPBACK_IFINDEX)
		return NOTIFY_DONE;

	rds_tcp_kill_sock(dev_net(dev));
	return NOTIFY_DONE;
}

/* Netdevice notifier block; registered in rds_tcp_init() and unregistered
 * in rds_tcp_exit(). Its callback starts per-namespace socket cleanup.
 */
static struct notifier_block rds_tcp_dev_notifier = {
.notifier_call = rds_tcp_dev_event,
.priority = -10, /* must be called after other network notifiers */
};

/* Module exit: undo what rds_tcp_init() set up. */
static void rds_tcp_exit(void)
{
rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
/* leads to rds_tcp_exit_net(), which stops any remaining listen socket */
unregister_pernet_subsys(&rds_tcp_net_ops);
/* a failure here is only reported; teardown continues regardless */
if (unregister_netdevice_notifier(&rds_tcp_dev_notifier))
pr_warn("could not unregister rds_tcp_dev_notifier\n");
rds_tcp_destroy_conns();
rds_trans_unregister(&rds_tcp_transport);
rds_tcp_recv_exit();
kmem_cache_destroy(rds_tcp_conn_slab);
}
module_exit(rds_tcp_exit);

static int rds_tcp_init(void)
{
int ret;
Expand All @@ -293,6 +417,16 @@ static int rds_tcp_init(void)
goto out;
}

ret = register_netdevice_notifier(&rds_tcp_dev_notifier);
if (ret) {
pr_warn("could not register rds_tcp_dev_notifier\n");
goto out;
}

ret = register_pernet_subsys(&rds_tcp_net_ops);
if (ret)
goto out_slab;

ret = rds_tcp_recv_init();
if (ret)
goto out_slab;
Expand All @@ -301,19 +435,14 @@ static int rds_tcp_init(void)
if (ret)
goto out_recv;

ret = rds_tcp_listen_init();
if (ret)
goto out_register;

rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);

goto out;

out_register:
rds_trans_unregister(&rds_tcp_transport);
out_recv:
rds_tcp_recv_exit();
out_slab:
unregister_pernet_subsys(&rds_tcp_net_ops);
kmem_cache_destroy(rds_tcp_conn_slab);
out:
return ret;
Expand Down
Loading

0 comments on commit e03c512

Please sign in to comment.