From cdc306a5c9cd3607db5d018c6320cdd923c04373 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Sat, 13 Oct 2018 20:34:42 +0800 Subject: [PATCH 1/5] rds: make v3.1 as compat version Mark RDSv3.1 as compat version and add v4.1 version macro's. Subsequent patches enable TOS(Type of Service) feature which is tied with v4.1 for RDMA transport. Reviewed-by: Sowmini Varadhan Signed-off-by: Santosh Shilimkar [yanjun.zhu@oracle.com: Adapted original patch with ipv6 changes] Signed-off-by: Zhu Yanjun --- net/rds/connection.c | 1 + net/rds/ib_cm.c | 40 +++++++++++++++++++++++----------------- net/rds/rds.h | 4 ++++ net/rds/threads.c | 1 + 4 files changed, 29 insertions(+), 17 deletions(-) diff --git a/net/rds/connection.c b/net/rds/connection.c index 3bd2f4a5a30d9..1ab14b68ecc8c 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -139,6 +139,7 @@ static void __rds_conn_path_init(struct rds_connection *conn, atomic_set(&cp->cp_state, RDS_CONN_DOWN); cp->cp_send_gen = 0; cp->cp_reconnect_jiffies = 0; + cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION; INIT_DELAYED_WORK(&cp->cp_send_w, rds_send_worker); INIT_DELAYED_WORK(&cp->cp_recv_w, rds_recv_worker); INIT_DELAYED_WORK(&cp->cp_conn_w, rds_connect_worker); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index bfbb31f0c7fd9..0eeae0910f06f 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -133,23 +133,24 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even rds_ib_set_flow_control(conn, be32_to_cpu(credit)); } - if (conn->c_version < RDS_PROTOCOL(3, 1)) { - pr_notice("RDS/IB: Connection <%pI6c,%pI6c> version %u.%u no longer supported\n", - &conn->c_laddr, &conn->c_faddr, - RDS_PROTOCOL_MAJOR(conn->c_version), - RDS_PROTOCOL_MINOR(conn->c_version)); - set_bit(RDS_DESTROY_PENDING, &conn->c_path[0].cp_flags); - rds_conn_destroy(conn); - return; - } else { - pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c> version %u.%u%s\n", - ic->i_active_side ? "Active" : "Passive", - &conn->c_laddr, &conn->c_faddr, - RDS_PROTOCOL_MAJOR(conn->c_version), - RDS_PROTOCOL_MINOR(conn->c_version), - ic->i_flowctl ? ", flow control" : ""); + if (conn->c_version < RDS_PROTOCOL_VERSION) { + if (conn->c_version != RDS_PROTOCOL_COMPAT_VERSION) { + pr_notice("RDS/IB: Connection <%pI6c,%pI6c> version %u.%u no longer supported\n", + &conn->c_laddr, &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version)); + rds_conn_destroy(conn); + return; + } } + pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c> version %u.%u%s\n", + ic->i_active_side ? "Active" : "Passive", + &conn->c_laddr, &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version), + ic->i_flowctl ? ", flow control" : ""); + atomic_set(&ic->i_cq_quiesce, 0); /* Init rings and fill recv. this needs to wait until protocol @@ -184,6 +185,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even NULL); } + conn->c_proposed_version = conn->c_version; rds_connect_complete(conn); } @@ -667,6 +669,9 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6) version = RDS_PROTOCOL_3_0; while ((common >>= 1) != 0) version++; + } else if (RDS_PROTOCOL_COMPAT_VERSION == + RDS_PROTOCOL(major, minor)) { + version = RDS_PROTOCOL_COMPAT_VERSION; } else { if (isv6) printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI6c using incompatible protocol version %u.%u\n", @@ -861,7 +866,7 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6) /* If the peer doesn't do protocol negotiation, we must * default to RDSv3.0 */ - rds_ib_set_protocol(conn, RDS_PROTOCOL_3_0); + rds_ib_set_protocol(conn, RDS_PROTOCOL_VERSION); ic->i_flowctl = rds_ib_sysctl_flow_control; /* advertise flow control */ ret = rds_ib_setup_qp(conn); @@ -870,7 +875,8 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6) goto out; } - rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION, + rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, + conn->c_proposed_version, UINT_MAX, UINT_MAX, isv6); ret = rdma_connect(cm_id, &conn_param); if (ret) diff --git a/net/rds/rds.h b/net/rds/rds.h index 4ffe100ff5e69..660023f08553b 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -19,10 +19,13 @@ */ #define RDS_PROTOCOL_3_0 0x0300 #define RDS_PROTOCOL_3_1 0x0301 +#define RDS_PROTOCOL_4_0 0x0400 +#define RDS_PROTOCOL_4_1 0x0401 #define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_1 #define RDS_PROTOCOL_MAJOR(v) ((v) >> 8) #define RDS_PROTOCOL_MINOR(v) ((v) & 255) #define RDS_PROTOCOL(maj, min) (((maj) << 8) | min) +#define RDS_PROTOCOL_COMPAT_VERSION RDS_PROTOCOL_3_1 /* The following ports, 16385, 18634, 18635, are registered with IANA as * the ports to be used for RDS over TCP and UDP. Currently, only RDS over @@ -151,6 +154,7 @@ struct rds_connection { struct rds_cong_map *c_fcong; /* Protocol version */ + unsigned int c_proposed_version; unsigned int c_version; possible_net_t c_net; diff --git a/net/rds/threads.c b/net/rds/threads.c index e64f9e4c3cdaf..32dc50f0a3031 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -93,6 +93,7 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr) queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); } rcu_read_unlock(); + cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION; } EXPORT_SYMBOL_GPL(rds_connect_path_complete); From d021fabf525ffdaeb4e6f1cf50e1ba325ca5273b Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Tue, 23 Oct 2018 23:09:00 -0400 Subject: [PATCH 2/5] rds: rdma: add consumer reject For legacy protocol version incompatibility with non linux RDS, consumer reject reason being used to convey it to peer. But the choice of reject reason value as '1' was really poor. Anyway for interoperability reasons with shipping products, it needs to be supported. For any future versions, properly encoded reject reason should to be used. Reviewed-by: Sowmini Varadhan Signed-off-by: Santosh Shilimkar [yanjun.zhu@oracle.com: Adapted original patch with ipv6 changes] Signed-off-by: Zhu Yanjun --- net/rds/ib_cm.c | 6 ++++-- net/rds/rdma_transport.c | 12 ++++++++++++ net/rds/rdma_transport.h | 6 ++++++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 0eeae0910f06f..a1c3ad380ec81 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -734,8 +734,10 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, /* Check whether the remote protocol version matches ours. */ version = rds_ib_protocol_compatible(event, isv6); - if (!version) + if (!version) { + err = RDS_RDMA_REJ_INCOMPAT; goto out; + } dp = event->param.conn.private_data; if (isv6) { @@ -851,7 +853,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, if (conn) mutex_unlock(&conn->c_cm_lock); if (err) - rdma_reject(cm_id, NULL, 0); + rdma_reject(cm_id, &err, sizeof(int)); return destroy; } diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 6b0f57c83a2ad..63cbc6b8560ca 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -51,6 +51,8 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id, struct rds_connection *conn = cm_id->context; struct rds_transport *trans; int ret = 0; + int *err; + u8 len; rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id, event->event, rdma_event_msg(event->event)); @@ -106,8 +108,18 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id, break; case RDMA_CM_EVENT_REJECTED: + if (!conn) + break; + err = (int *)rdma_consumer_reject_data(cm_id, event, &len); + if (!err || (err && ((*err) == RDS_RDMA_REJ_INCOMPAT))) { + pr_warn("RDS/RDMA: conn <%pI6c, %pI6c> rejected, dropping connection\n", + &conn->c_laddr, &conn->c_faddr); + conn->c_proposed_version = RDS_PROTOCOL_COMPAT_VERSION; + rds_conn_drop(conn); + } rdsdebug("Connection rejected: %s\n", rdma_reject_msg(cm_id, event->status)); + break; /* FALLTHROUGH */ case RDMA_CM_EVENT_ADDR_ERROR: case RDMA_CM_EVENT_ROUTE_ERROR: diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h index 200d3134aaae1..bfafd4a6d8271 100644 --- a/net/rds/rdma_transport.h +++ b/net/rds/rdma_transport.h @@ -11,6 +11,12 @@ #define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000 +/* Below reject reason is for legacy interoperability issue with non-linux + * RDS endpoints where older version incompatibility is conveyed via value 1. + * For future version(s), proper encoded reject reason should be be used. + */ +#define RDS_RDMA_REJ_INCOMPAT 1 + int rds_rdma_conn_connect(struct rds_connection *conn); int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event); From 3eb450367d0823226515ee24712ed08eccb33eb9 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Tue, 23 Oct 2018 23:21:14 -0400 Subject: [PATCH 3/5] rds: add type of service(tos) infrastructure RDS Service type (TOS) is user-defined and needs to be configured via RDS IOCTL interface. It must be set before initiating any traffic and once set the TOS can not be changed. All out-going traffic from the socket will be associated with its TOS. Reviewed-by: Sowmini Varadhan Signed-off-by: Santosh Shilimkar [yanjun.zhu@oracle.com: Adapted original patch with ipv6 changes] Signed-off-by: Zhu Yanjun --- include/uapi/linux/rds.h | 11 +++++++++++ net/rds/af_rds.c | 35 ++++++++++++++++++++++++++++++++++- net/rds/connection.c | 20 +++++++++++--------- net/rds/ib.c | 1 + net/rds/ib_cm.c | 2 +- net/rds/rdma_transport.c | 1 + net/rds/rds.h | 9 +++++++-- net/rds/recv.c | 1 + net/rds/send.c | 6 +++--- net/rds/tcp.c | 1 + net/rds/tcp_listen.c | 2 +- 11 files changed, 72 insertions(+), 17 deletions(-) diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h index 8b73cb603c5f3..5d0f76c780e5f 100644 --- a/include/uapi/linux/rds.h +++ b/include/uapi/linux/rds.h @@ -69,6 +69,12 @@ #define RDS_TRANS_COUNT 3 #define RDS_TRANS_NONE (~0) +/* IOCTLS commands for SOL_RDS */ +#define SIOCRDSSETTOS (SIOCPROTOPRIVATE) +#define SIOCRDSGETTOS (SIOCPROTOPRIVATE + 1) + +typedef __u8 rds_tos_t; + /* * Control message types for SOL_RDS. * @@ -149,6 +155,7 @@ struct rds_info_connection { __be32 faddr; __u8 transport[TRANSNAMSIZ]; /* null term ascii */ __u8 flags; + __u8 tos; } __attribute__((packed)); struct rds6_info_connection { @@ -171,6 +178,7 @@ struct rds_info_message { __be16 lport; __be16 fport; __u8 flags; + __u8 tos; } __attribute__((packed)); struct rds6_info_message { @@ -214,6 +222,7 @@ struct rds_info_tcp_socket { __u32 last_sent_nxt; __u32 last_expected_una; __u32 last_seen_una; + __u8 tos; } __attribute__((packed)); struct rds6_info_tcp_socket { @@ -240,6 +249,7 @@ struct rds_info_rdma_connection { __u32 max_send_sge; __u32 rdma_mr_max; __u32 rdma_mr_size; + __u8 tos; }; struct rds6_info_rdma_connection { @@ -253,6 +263,7 @@ struct rds6_info_rdma_connection { __u32 max_send_sge; __u32 rdma_mr_max; __u32 rdma_mr_size; + __u8 tos; }; /* RDS message Receive Path Latency points */ diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 65571a6273c32..9045158580370 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -254,7 +254,38 @@ static __poll_t rds_poll(struct file *file, struct socket *sock, static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { - return -ENOIOCTLCMD; + struct rds_sock *rs = rds_sk_to_rs(sock->sk); + rds_tos_t tos; + + switch (cmd) { + case SIOCRDSSETTOS: + if (get_user(tos, (rds_tos_t __user *)arg)) + return -EFAULT; + + if (rs->rs_transport && + rs->rs_transport->t_type == RDS_TRANS_TCP) + tos = 0; + + spin_lock_bh(&rds_sock_lock); + if (rs->rs_tos || rs->rs_conn) { + spin_unlock_bh(&rds_sock_lock); + return -EINVAL; + } + rs->rs_tos = tos; + spin_unlock_bh(&rds_sock_lock); + break; + case SIOCRDSGETTOS: + spin_lock_bh(&rds_sock_lock); + tos = rs->rs_tos; + spin_unlock_bh(&rds_sock_lock); + if (put_user(tos, (rds_tos_t __user *)arg)) + return -EFAULT; + break; + default: + return -ENOIOCTLCMD; + } + + return 0; } static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, @@ -650,6 +681,8 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol) spin_lock_init(&rs->rs_rdma_lock); rs->rs_rdma_keys = RB_ROOT; rs->rs_rx_traces = 0; + rs->rs_tos = 0; + rs->rs_conn = NULL; spin_lock_bh(&rds_sock_lock); list_add_tail(&rs->rs_item, &rds_sock_list); diff --git a/net/rds/connection.c b/net/rds/connection.c index 1ab14b68ecc8c..7ea134f9a825e 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -84,7 +84,7 @@ static struct rds_connection *rds_conn_lookup(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, - int dev_if) + u8 tos, int dev_if) { struct rds_connection *conn, *ret = NULL; @@ -92,6 +92,7 @@ static struct rds_connection *rds_conn_lookup(struct net *net, if (ipv6_addr_equal(&conn->c_faddr, faddr) && ipv6_addr_equal(&conn->c_laddr, laddr) && conn->c_trans == trans && + conn->c_tos == tos && net == rds_conn_net(conn) && conn->c_dev_if == dev_if) { ret = conn; @@ -160,7 +161,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, - gfp_t gfp, + gfp_t gfp, u8 tos, int is_outgoing, int dev_if) { @@ -172,7 +173,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); rcu_read_lock(); - conn = rds_conn_lookup(net, head, laddr, faddr, trans, dev_if); + conn = rds_conn_lookup(net, head, laddr, faddr, trans, tos, dev_if); if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && @@ -206,6 +207,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, conn->c_isv6 = !ipv6_addr_v4mapped(laddr); conn->c_faddr = *faddr; conn->c_dev_if = dev_if; + conn->c_tos = tos; #if IS_ENABLED(CONFIG_IPV6) /* If the local address is link local, set c_bound_if to be the @@ -298,7 +300,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, struct rds_connection *found; found = rds_conn_lookup(net, head, laddr, faddr, trans, - dev_if); + tos, dev_if); if (found) { struct rds_conn_path *cp; int i; @@ -333,10 +335,10 @@ static struct rds_connection *__rds_conn_create(struct net *net, struct rds_connection *rds_conn_create(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, - struct rds_transport *trans, gfp_t gfp, - int dev_if) + struct rds_transport *trans, u8 tos, + gfp_t gfp, int dev_if) { - return __rds_conn_create(net, laddr, faddr, trans, gfp, 0, dev_if); + return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 0, dev_if); } EXPORT_SYMBOL_GPL(rds_conn_create); @@ -344,9 +346,9 @@ struct rds_connection *rds_conn_create_outgoing(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, - gfp_t gfp, int dev_if) + u8 tos, gfp_t gfp, int dev_if) { - return __rds_conn_create(net, laddr, faddr, trans, gfp, 1, dev_if); + return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 1, dev_if); } EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); diff --git a/net/rds/ib.c b/net/rds/ib.c index 9d7b7586f2406..21b6588b71caa 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -301,6 +301,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, iinfo->src_addr = conn->c_laddr.s6_addr32[3]; iinfo->dst_addr = conn->c_faddr.s6_addr32[3]; + iinfo->tos = conn->c_tos; memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index a1c3ad380ec81..70518e329a9e5 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -786,7 +786,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, /* RDS/IB is not currently netns aware, thus init_net */ conn = rds_conn_create(&init_net, daddr6, saddr6, - &rds_ib_transport, GFP_KERNEL, ifindex); + &rds_ib_transport, 0, GFP_KERNEL, ifindex); if (IS_ERR(conn)) { rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); conn = NULL; diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 63cbc6b8560ca..e37f91537d297 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -115,6 +115,7 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id, pr_warn("RDS/RDMA: conn <%pI6c, %pI6c> rejected, dropping connection\n", &conn->c_laddr, &conn->c_faddr); conn->c_proposed_version = RDS_PROTOCOL_COMPAT_VERSION; + conn->c_tos = 0; rds_conn_drop(conn); } rdsdebug("Connection rejected: %s\n", diff --git a/net/rds/rds.h b/net/rds/rds.h index 660023f08553b..7e52b92092d71 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -158,6 +158,9 @@ struct rds_connection { unsigned int c_version; possible_net_t c_net; + /* TOS */ + u8 c_tos; + struct list_head c_map_item; unsigned long c_map_queued; @@ -652,6 +655,7 @@ struct rds_sock { u8 rs_rx_traces; u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX]; struct rds_msg_zcopy_queue rs_zcookie_queue; + u8 rs_tos; }; static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) @@ -760,13 +764,14 @@ void rds_conn_exit(void); struct rds_connection *rds_conn_create(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, - struct rds_transport *trans, gfp_t gfp, + struct rds_transport *trans, + u8 tos, gfp_t gfp, int dev_if); struct rds_connection *rds_conn_create_outgoing(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, - gfp_t gfp, int dev_if); + u8 tos, gfp_t gfp, int dev_if); void rds_conn_shutdown(struct rds_conn_path *cpath); void rds_conn_destroy(struct rds_connection *conn); void rds_conn_drop(struct rds_connection *conn); diff --git a/net/rds/recv.c b/net/rds/recv.c index 6bb6b16ca270a..853de48760880 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -782,6 +782,7 @@ void rds_inc_info_copy(struct rds_incoming *inc, minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence); minfo.len = be32_to_cpu(inc->i_hdr.h_len); + minfo.tos = inc->i_conn->c_tos; if (flip) { minfo.laddr = daddr; diff --git a/net/rds/send.c b/net/rds/send.c index fd8b687d5c05e..c555e121b908b 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1277,12 +1277,12 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) /* rds_conn_create has a spinlock that runs with IRQ off. * Caching the conn in the socket helps a lot. */ - if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr)) + if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr)) { conn = rs->rs_conn; - else { + } else { conn = rds_conn_create_outgoing(sock_net(sock->sk), &rs->rs_bound_addr, &daddr, - rs->rs_transport, + rs->rs_transport, 0, sock->sk->sk_allocation, scope_id); if (IS_ERR(conn)) { diff --git a/net/rds/tcp.c b/net/rds/tcp.c index c16f0a362c32c..eb6851952cbf2 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -267,6 +267,7 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len, tsinfo.last_sent_nxt = tc->t_last_sent_nxt; tsinfo.last_expected_una = tc->t_last_expected_una; tsinfo.last_seen_una = tc->t_last_seen_una; + tsinfo.tos = tc->t_cpath->cp_conn->c_tos; rds_info_copy(iter, &tsinfo, sizeof(tsinfo)); } diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index c12203f646da9..810a3a49e9474 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -200,7 +200,7 @@ int rds_tcp_accept_one(struct socket *sock) conn = rds_conn_create(sock_net(sock->sk), my_addr, peer_addr, - &rds_tcp_transport, GFP_KERNEL, dev_if); + &rds_tcp_transport, 0, GFP_KERNEL, dev_if); if (IS_ERR(conn)) { ret = PTR_ERR(conn); From 56dc8bce9f761cf61258e25d96dec4072273d8db Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Sat, 13 Oct 2018 21:36:49 +0800 Subject: [PATCH 4/5] rds: add transport specific tos_map hook RDMA transport maps user tos to underline virtual lanes(VL) for IB or DSCP values. RDMA CM transport abstract thats for RDS. TCP transport makes use of default priority 0 and maps all user tos values to it. Reviewed-by: Sowmini Varadhan Signed-off-by: Santosh Shilimkar [yanjun.zhu@oracle.com: Adapted original patch with ipv6 changes] Signed-off-by: Zhu Yanjun --- net/rds/af_rds.c | 10 ++++++---- net/rds/ib.c | 10 ++++++++++ net/rds/rds.h | 1 + net/rds/tcp.c | 7 +++++++ 4 files changed, 24 insertions(+), 4 deletions(-) diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 9045158580370..d6cc97fbbbb02 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -255,16 +255,18 @@ static __poll_t rds_poll(struct file *file, struct socket *sock, static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct rds_sock *rs = rds_sk_to_rs(sock->sk); - rds_tos_t tos; + rds_tos_t utos, tos = 0; switch (cmd) { case SIOCRDSSETTOS: - if (get_user(tos, (rds_tos_t __user *)arg)) + if (get_user(utos, (rds_tos_t __user *)arg)) return -EFAULT; if (rs->rs_transport && - rs->rs_transport->t_type == RDS_TRANS_TCP) - tos = 0; + rs->rs_transport->get_tos_map) + tos = rs->rs_transport->get_tos_map(utos); + else + return -ENOIOCTLCMD; spin_lock_bh(&rds_sock_lock); if (rs->rs_tos || rs->rs_conn) { diff --git a/net/rds/ib.c b/net/rds/ib.c index 21b6588b71caa..2da9b75bad16a 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -515,6 +515,15 @@ void rds_ib_exit(void) rds_ib_mr_exit(); } +static u8 rds_ib_get_tos_map(u8 tos) +{ + /* 1:1 user to transport map for RDMA transport. + * In future, if custom map is desired, hook can export + * user configurable map. + */ + return tos; +} + struct rds_transport rds_ib_transport = { .laddr_check = rds_ib_laddr_check, .xmit_path_complete = rds_ib_xmit_path_complete, @@ -537,6 +546,7 @@ struct rds_transport rds_ib_transport = { .sync_mr = rds_ib_sync_mr, .free_mr = rds_ib_free_mr, .flush_mrs = rds_ib_flush_mrs, + .get_tos_map = rds_ib_get_tos_map, .t_owner = THIS_MODULE, .t_name = "infiniband", .t_unloading = rds_ib_is_unloading, diff --git a/net/rds/rds.h b/net/rds/rds.h index 7e52b92092d71..0d8f67cadd747 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -574,6 +574,7 @@ struct rds_transport { void (*free_mr)(void *trans_private, int invalidate); void (*flush_mrs)(void); bool (*t_unloading)(struct rds_connection *conn); + u8 (*get_tos_map)(u8 tos); }; /* Bind hash table key length. It is the sum of the size of a struct diff --git a/net/rds/tcp.c b/net/rds/tcp.c index eb6851952cbf2..fd26941746074 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -453,6 +453,12 @@ static void rds_tcp_destroy_conns(void) static void rds_tcp_exit(void); +static u8 rds_tcp_get_tos_map(u8 tos) +{ + /* all user tos mapped to default 0 for TCP transport */ + return 0; +} + struct rds_transport rds_tcp_transport = { .laddr_check = rds_tcp_laddr_check, .xmit_path_prepare = rds_tcp_xmit_path_prepare, @@ -467,6 +473,7 @@ struct rds_transport rds_tcp_transport = { .inc_free = rds_tcp_inc_free, .stats_info_copy = rds_tcp_stats_info_copy, .exit = rds_tcp_exit, + .get_tos_map = rds_tcp_get_tos_map, .t_owner = THIS_MODULE, .t_name = "tcp", .t_type = RDS_TRANS_TCP, From fd261ce6a30e01ad67c416e2c67e263024b3a6f9 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Sat, 13 Oct 2018 22:13:23 +0800 Subject: [PATCH 5/5] rds: rdma: update rdma transport for tos For RDMA transports, RDS TOS is an extension of IB QoS(Annex A13) to provide clients the ability to segregate traffic flows for different type of data. RDMA CM abstract it for ULPs using rdma_set_service_type(). Internally, each traffic flow is represented by a connection with all of its independent resources like that of a normal connection, and is differentiated by service type. In other words, there can be multiple qp connections between an IP pair and each supports a unique service type. The feature has been added from RDSv4.1 onwards and supports rolling upgrades. RDMA connection metadata also carries the tos information to set up SL on end to end context. The original code was developed by Bang Nguyen in downstream kernel back in 2.6.32 kernel days and it has evolved over period of time. Reviewed-by: Sowmini Varadhan Signed-off-by: Santosh Shilimkar [yanjun.zhu@oracle.com: Adapted original patch with ipv6 changes] Signed-off-by: Zhu Yanjun --- net/rds/ib.h | 4 +++- net/rds/ib_cm.c | 32 +++++++++++++++++--------------- net/rds/ib_recv.c | 4 ++-- net/rds/ib_send.c | 5 +++-- net/rds/rdma_transport.c | 1 + net/rds/send.c | 5 +++-- 6 files changed, 29 insertions(+), 22 deletions(-) diff --git a/net/rds/ib.h b/net/rds/ib.h index 71ff356ee7020..752f92235a380 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -67,7 +67,9 @@ struct rds_ib_conn_priv_cmn { u8 ricpc_protocol_major; u8 ricpc_protocol_minor; __be16 ricpc_protocol_minor_mask; /* bitmask */ - __be32 ricpc_reserved1; + u8 ricpc_dp_toss; + u8 ripc_reserved1; + __be16 ripc_reserved2; __be64 ricpc_ack_seq; __be32 ricpc_credit; /* non-zero enables flow ctl */ }; diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 70518e329a9e5..66c6eb56072b7 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -144,9 +144,9 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even } } - pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c> version %u.%u%s\n", + pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c,%d> version %u.%u%s\n", ic->i_active_side ? "Active" : "Passive", - &conn->c_laddr, &conn->c_faddr, + &conn->c_laddr, &conn->c_faddr, conn->c_tos, RDS_PROTOCOL_MAJOR(conn->c_version), RDS_PROTOCOL_MINOR(conn->c_version), ic->i_flowctl ? ", flow control" : ""); @@ -222,6 +222,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); dp->ricp_v6.dp_ack_seq = cpu_to_be64(rds_ib_piggyb_ack(ic)); + dp->ricp_v6.dp_cmn.ricpc_dp_toss = conn->c_tos; conn_param->private_data = &dp->ricp_v6; conn_param->private_data_len = sizeof(dp->ricp_v6); @@ -236,6 +237,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); dp->ricp_v4.dp_ack_seq = cpu_to_be64(rds_ib_piggyb_ack(ic)); + dp->ricp_v4.dp_cmn.ricpc_dp_toss = conn->c_tos; conn_param->private_data = &dp->ricp_v4; conn_param->private_data_len = sizeof(dp->ricp_v4); @@ -391,10 +393,9 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data) rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); break; default: - rdsdebug("Fatal QP Event %u (%s) " - "- connection %pI6c->%pI6c, reconnecting\n", - event->event, ib_event_msg(event->event), - &conn->c_laddr, &conn->c_faddr); + rdsdebug("Fatal QP Event %u (%s) - connection %pI6c->%pI6c, reconnecting\n", + event->event, ib_event_msg(event->event), + &conn->c_laddr, &conn->c_faddr); rds_conn_drop(conn); break; } @@ -662,11 +663,11 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6) /* Even if len is crap *now* I still want to check it. -ASG */ if (event->param.conn.private_data_len < data_len || major == 0) - return RDS_PROTOCOL_3_0; + return RDS_PROTOCOL_4_0; common = be16_to_cpu(mask) & RDS_IB_SUPPORTED_PROTOCOLS; - if (major == 3 && common) { - version = RDS_PROTOCOL_3_0; + if (major == 4 && common) { + version = RDS_PROTOCOL_4_0; while ((common >>= 1) != 0) version++; } else if (RDS_PROTOCOL_COMPAT_VERSION == @@ -778,15 +779,16 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, daddr6 = &d_mapped_addr; } - rdsdebug("saddr %pI6c daddr %pI6c RDSv%u.%u lguid 0x%llx fguid " - "0x%llx\n", saddr6, daddr6, - RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version), + rdsdebug("saddr %pI6c daddr %pI6c RDSv%u.%u lguid 0x%llx fguid 0x%llx, tos:%d\n", + saddr6, daddr6, RDS_PROTOCOL_MAJOR(version), + RDS_PROTOCOL_MINOR(version), (unsigned long long)be64_to_cpu(lguid), - (unsigned long long)be64_to_cpu(fguid)); + (unsigned long long)be64_to_cpu(fguid), dp_cmn->ricpc_dp_toss); /* RDS/IB is not currently netns aware, thus init_net */ conn = rds_conn_create(&init_net, daddr6, saddr6, - &rds_ib_transport, 0, GFP_KERNEL, ifindex); + &rds_ib_transport, dp_cmn->ricpc_dp_toss, + GFP_KERNEL, ifindex); if (IS_ERR(conn)) { rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); conn = NULL; @@ -868,7 +870,7 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6) /* If the peer doesn't do protocol negotiation, we must * default to RDSv3.0 */ - rds_ib_set_protocol(conn, RDS_PROTOCOL_VERSION); + rds_ib_set_protocol(conn, RDS_PROTOCOL_4_1); ic->i_flowctl = rds_ib_sysctl_flow_control; /* advertise flow control */ ret = rds_ib_setup_qp(conn); diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 2f16146e4ec94..d395eec98959e 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -986,9 +986,9 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, } else { /* We expect errors as the qp is drained during shutdown */ if (rds_conn_up(conn) || rds_conn_connecting(conn)) - rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n", + rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c, %d> had status %u (%s), disconnecting and reconnecting\n", &conn->c_laddr, &conn->c_faddr, - wc->status, + conn->c_tos, wc->status, ib_wc_status_msg(wc->status)); } diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 4e0c36acf8660..09c46f2e97fad 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -305,8 +305,9 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) /* We expect errors as the qp is drained during shutdown */ if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) { - rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n", - &conn->c_laddr, &conn->c_faddr, wc->status, + rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c,%d> had status %u (%s), disconnecting and reconnecting\n", + &conn->c_laddr, &conn->c_faddr, + conn->c_tos, wc->status, ib_wc_status_msg(wc->status)); } } diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index e37f91537d297..46bce83890660 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -83,6 +83,7 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id, break; case RDMA_CM_EVENT_ADDR_RESOLVED: + rdma_set_service_type(cm_id, conn->c_tos); /* XXX do we need to clean up if this fails? */ ret = rdma_resolve_route(cm_id, RDS_RDMA_RESOLVE_TIMEOUT_MS); diff --git a/net/rds/send.c b/net/rds/send.c index c555e121b908b..166dd578c1cc9 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1277,12 +1277,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) /* rds_conn_create has a spinlock that runs with IRQ off. * Caching the conn in the socket helps a lot. */ - if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr)) { + if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr) && + rs->rs_tos == rs->rs_conn->c_tos) { conn = rs->rs_conn; } else { conn = rds_conn_create_outgoing(sock_net(sock->sk), &rs->rs_bound_addr, &daddr, - rs->rs_transport, 0, + rs->rs_transport, rs->rs_tos, sock->sk->sk_allocation, scope_id); if (IS_ERR(conn)) {