Skip to content

Commit

Permalink
Merge branch 'for_net-next-5.1/rds-tos-v4' of git://git.kernel.org/pu…
Browse files Browse the repository at this point in the history
…b/scm/linux/kernel/git/ssantosh/linux

Santosh Shilimkar says:

====================
rds: add tos support

RDS applications make use of tos to classify database traffic.
This feature has been used in shipping products since 2.6.32-based
kernels. It is tied to the RDS v4.1 protocol version, and compatibility
is negotiated as part of connection setup.

The patchset keeps full backward compatibility by using the existing
connection negotiation scheme. Currently the feature is exploited only by
the RDMA transport; for the TCP transport, user tos values are all mapped
to the same default class (0).

For RDMA transports, the RDMA CM service type API is used to
set up different SLs (service levels), and the IB fabric is configured
for tos mapping using the Subnet Manager (SL to VL mappings).
Similarly, for a RoCE fabric, the user priority is mapped to different
DSCP code points, which are associated with different switch queues
in the fabric.

The original code was developed by Bang Nguyen in the downstream kernel back in
the 2.6.32 kernel days, and it has evolved significantly over time.

Thanks to Yanjun for testing various host combinations such as
v3.1<->v4.1, v4.1<->v3.1, v4.1 upstream to shipping v4.1, etc.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Feb 7, 2019
2 parents e90b1fd + fd261ce commit 042a419
Show file tree
Hide file tree
Showing 16 changed files with 166 additions and 52 deletions.
11 changes: 11 additions & 0 deletions include/uapi/linux/rds.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@
#define RDS_TRANS_COUNT 3
#define RDS_TRANS_NONE (~0)

/* IOCTLS commands for SOL_RDS */
#define SIOCRDSSETTOS (SIOCPROTOPRIVATE)
#define SIOCRDSGETTOS (SIOCPROTOPRIVATE + 1)

typedef __u8 rds_tos_t;

/*
* Control message types for SOL_RDS.
*
Expand Down Expand Up @@ -149,6 +155,7 @@ struct rds_info_connection {
__be32 faddr;
__u8 transport[TRANSNAMSIZ]; /* null term ascii */
__u8 flags;
__u8 tos;
} __attribute__((packed));

struct rds6_info_connection {
Expand All @@ -171,6 +178,7 @@ struct rds_info_message {
__be16 lport;
__be16 fport;
__u8 flags;
__u8 tos;
} __attribute__((packed));

struct rds6_info_message {
Expand Down Expand Up @@ -214,6 +222,7 @@ struct rds_info_tcp_socket {
__u32 last_sent_nxt;
__u32 last_expected_una;
__u32 last_seen_una;
__u8 tos;
} __attribute__((packed));

struct rds6_info_tcp_socket {
Expand All @@ -240,6 +249,7 @@ struct rds_info_rdma_connection {
__u32 max_send_sge;
__u32 rdma_mr_max;
__u32 rdma_mr_size;
__u8 tos;
};

struct rds6_info_rdma_connection {
Expand All @@ -253,6 +263,7 @@ struct rds6_info_rdma_connection {
__u32 max_send_sge;
__u32 rdma_mr_max;
__u32 rdma_mr_size;
__u8 tos;
};

/* RDS message Receive Path Latency points */
Expand Down
37 changes: 36 additions & 1 deletion net/rds/af_rds.c
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,40 @@ static __poll_t rds_poll(struct file *file, struct socket *sock,

/*
 * rds_ioctl - handle the SOL_RDS private ioctls on an RDS socket.
 * @sock: socket the ioctl was issued on
 * @cmd:  ioctl command, SIOCRDSSETTOS or SIOCRDSGETTOS
 * @arg:  user-space pointer to an rds_tos_t
 *
 * SIOCRDSSETTOS copies a tos value from user space, translates it with the
 * socket transport's get_tos_map() hook and stores the result in rs->rs_tos.
 * The tos can only be set while rs->rs_tos is still zero and the socket has
 * no rs_conn attached.
 *
 * SIOCRDSGETTOS copies the current rs->rs_tos back to user space.
 *
 * Returns 0 on success, -EFAULT if the user pointer cannot be accessed,
 * -EINVAL if the tos may no longer be changed, and -ENOIOCTLCMD for an
 * unknown command or when the socket has no transport that provides
 * get_tos_map().
 */
static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
	rds_tos_t utos, tos = 0;

	switch (cmd) {
	case SIOCRDSSETTOS:
		if (get_user(utos, (rds_tos_t __user *)arg))
			return -EFAULT;

		/* Without a transport-provided map the request is unsupported. */
		if (rs->rs_transport &&
		    rs->rs_transport->get_tos_map)
			tos = rs->rs_transport->get_tos_map(utos);
		else
			return -ENOIOCTLCMD;

		spin_lock_bh(&rds_sock_lock);
		/* Refuse to change tos once one is set or a conn is attached. */
		if (rs->rs_tos || rs->rs_conn) {
			spin_unlock_bh(&rds_sock_lock);
			return -EINVAL;
		}
		rs->rs_tos = tos;
		spin_unlock_bh(&rds_sock_lock);
		break;
	case SIOCRDSGETTOS:
		/* Snapshot under the lock, copy to user space outside it. */
		spin_lock_bh(&rds_sock_lock);
		tos = rs->rs_tos;
		spin_unlock_bh(&rds_sock_lock);
		if (put_user(tos, (rds_tos_t __user *)arg))
			return -EFAULT;
		break;
	default:
		return -ENOIOCTLCMD;
	}

	return 0;
}

static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
Expand Down Expand Up @@ -650,6 +683,8 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
spin_lock_init(&rs->rs_rdma_lock);
rs->rs_rdma_keys = RB_ROOT;
rs->rs_rx_traces = 0;
rs->rs_tos = 0;
rs->rs_conn = NULL;

spin_lock_bh(&rds_sock_lock);
list_add_tail(&rs->rs_item, &rds_sock_list);
Expand Down
21 changes: 12 additions & 9 deletions net/rds/connection.c
Original file line number Diff line number Diff line change
Expand Up @@ -84,14 +84,15 @@ static struct rds_connection *rds_conn_lookup(struct net *net,
const struct in6_addr *laddr,
const struct in6_addr *faddr,
struct rds_transport *trans,
int dev_if)
u8 tos, int dev_if)
{
struct rds_connection *conn, *ret = NULL;

hlist_for_each_entry_rcu(conn, head, c_hash_node) {
if (ipv6_addr_equal(&conn->c_faddr, faddr) &&
ipv6_addr_equal(&conn->c_laddr, laddr) &&
conn->c_trans == trans &&
conn->c_tos == tos &&
net == rds_conn_net(conn) &&
conn->c_dev_if == dev_if) {
ret = conn;
Expand Down Expand Up @@ -139,6 +140,7 @@ static void __rds_conn_path_init(struct rds_connection *conn,
atomic_set(&cp->cp_state, RDS_CONN_DOWN);
cp->cp_send_gen = 0;
cp->cp_reconnect_jiffies = 0;
cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION;
INIT_DELAYED_WORK(&cp->cp_send_w, rds_send_worker);
INIT_DELAYED_WORK(&cp->cp_recv_w, rds_recv_worker);
INIT_DELAYED_WORK(&cp->cp_conn_w, rds_connect_worker);
Expand All @@ -159,7 +161,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
const struct in6_addr *laddr,
const struct in6_addr *faddr,
struct rds_transport *trans,
gfp_t gfp,
gfp_t gfp, u8 tos,
int is_outgoing,
int dev_if)
{
Expand All @@ -171,7 +173,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1);

rcu_read_lock();
conn = rds_conn_lookup(net, head, laddr, faddr, trans, dev_if);
conn = rds_conn_lookup(net, head, laddr, faddr, trans, tos, dev_if);
if (conn &&
conn->c_loopback &&
conn->c_trans != &rds_loop_transport &&
Expand Down Expand Up @@ -205,6 +207,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
conn->c_isv6 = !ipv6_addr_v4mapped(laddr);
conn->c_faddr = *faddr;
conn->c_dev_if = dev_if;
conn->c_tos = tos;

#if IS_ENABLED(CONFIG_IPV6)
/* If the local address is link local, set c_bound_if to be the
Expand Down Expand Up @@ -297,7 +300,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
struct rds_connection *found;

found = rds_conn_lookup(net, head, laddr, faddr, trans,
dev_if);
tos, dev_if);
if (found) {
struct rds_conn_path *cp;
int i;
Expand Down Expand Up @@ -332,20 +335,20 @@ static struct rds_connection *__rds_conn_create(struct net *net,
struct rds_connection *rds_conn_create(struct net *net,
const struct in6_addr *laddr,
const struct in6_addr *faddr,
struct rds_transport *trans, gfp_t gfp,
int dev_if)
struct rds_transport *trans, u8 tos,
gfp_t gfp, int dev_if)
{
return __rds_conn_create(net, laddr, faddr, trans, gfp, 0, dev_if);
return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 0, dev_if);
}
EXPORT_SYMBOL_GPL(rds_conn_create);

struct rds_connection *rds_conn_create_outgoing(struct net *net,
const struct in6_addr *laddr,
const struct in6_addr *faddr,
struct rds_transport *trans,
gfp_t gfp, int dev_if)
u8 tos, gfp_t gfp, int dev_if)
{
return __rds_conn_create(net, laddr, faddr, trans, gfp, 1, dev_if);
return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 1, dev_if);
}
EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);

Expand Down
11 changes: 11 additions & 0 deletions net/rds/ib.c
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,

iinfo->src_addr = conn->c_laddr.s6_addr32[3];
iinfo->dst_addr = conn->c_faddr.s6_addr32[3];
iinfo->tos = conn->c_tos;

memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
Expand Down Expand Up @@ -514,6 +515,15 @@ void rds_ib_exit(void)
rds_ib_mr_exit();
}

/*
 * Translate a user-supplied tos value into the tos used by the RDMA
 * transport.
 *
 * The translation is the identity (1:1 user to transport map). If a
 * custom map is wanted in the future, this hook can export a
 * user-configurable one.
 */
static u8 rds_ib_get_tos_map(u8 tos)
{
	u8 transport_tos = tos;

	return transport_tos;
}

struct rds_transport rds_ib_transport = {
.laddr_check = rds_ib_laddr_check,
.xmit_path_complete = rds_ib_xmit_path_complete,
Expand All @@ -536,6 +546,7 @@ struct rds_transport rds_ib_transport = {
.sync_mr = rds_ib_sync_mr,
.free_mr = rds_ib_free_mr,
.flush_mrs = rds_ib_flush_mrs,
.get_tos_map = rds_ib_get_tos_map,
.t_owner = THIS_MODULE,
.t_name = "infiniband",
.t_unloading = rds_ib_is_unloading,
Expand Down
4 changes: 3 additions & 1 deletion net/rds/ib.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,9 @@ struct rds_ib_conn_priv_cmn {
u8 ricpc_protocol_major;
u8 ricpc_protocol_minor;
__be16 ricpc_protocol_minor_mask; /* bitmask */
__be32 ricpc_reserved1;
u8 ricpc_dp_toss;
u8 ripc_reserved1;
__be16 ripc_reserved2;
__be64 ricpc_ack_seq;
__be32 ricpc_credit; /* non-zero enables flow ctl */
};
Expand Down
72 changes: 41 additions & 31 deletions net/rds/ib_cm.c
Original file line number Diff line number Diff line change
Expand Up @@ -133,23 +133,24 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
rds_ib_set_flow_control(conn, be32_to_cpu(credit));
}

if (conn->c_version < RDS_PROTOCOL(3, 1)) {
pr_notice("RDS/IB: Connection <%pI6c,%pI6c> version %u.%u no longer supported\n",
&conn->c_laddr, &conn->c_faddr,
RDS_PROTOCOL_MAJOR(conn->c_version),
RDS_PROTOCOL_MINOR(conn->c_version));
set_bit(RDS_DESTROY_PENDING, &conn->c_path[0].cp_flags);
rds_conn_destroy(conn);
return;
} else {
pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c> version %u.%u%s\n",
ic->i_active_side ? "Active" : "Passive",
&conn->c_laddr, &conn->c_faddr,
RDS_PROTOCOL_MAJOR(conn->c_version),
RDS_PROTOCOL_MINOR(conn->c_version),
ic->i_flowctl ? ", flow control" : "");
if (conn->c_version < RDS_PROTOCOL_VERSION) {
if (conn->c_version != RDS_PROTOCOL_COMPAT_VERSION) {
pr_notice("RDS/IB: Connection <%pI6c,%pI6c> version %u.%u no longer supported\n",
&conn->c_laddr, &conn->c_faddr,
RDS_PROTOCOL_MAJOR(conn->c_version),
RDS_PROTOCOL_MINOR(conn->c_version));
rds_conn_destroy(conn);
return;
}
}

pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c,%d> version %u.%u%s\n",
ic->i_active_side ? "Active" : "Passive",
&conn->c_laddr, &conn->c_faddr, conn->c_tos,
RDS_PROTOCOL_MAJOR(conn->c_version),
RDS_PROTOCOL_MINOR(conn->c_version),
ic->i_flowctl ? ", flow control" : "");

atomic_set(&ic->i_cq_quiesce, 0);

/* Init rings and fill recv. this needs to wait until protocol
Expand Down Expand Up @@ -184,6 +185,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
NULL);
}

conn->c_proposed_version = conn->c_version;
rds_connect_complete(conn);
}

Expand Down Expand Up @@ -220,6 +222,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
dp->ricp_v6.dp_ack_seq =
cpu_to_be64(rds_ib_piggyb_ack(ic));
dp->ricp_v6.dp_cmn.ricpc_dp_toss = conn->c_tos;

conn_param->private_data = &dp->ricp_v6;
conn_param->private_data_len = sizeof(dp->ricp_v6);
Expand All @@ -234,6 +237,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
dp->ricp_v4.dp_ack_seq =
cpu_to_be64(rds_ib_piggyb_ack(ic));
dp->ricp_v4.dp_cmn.ricpc_dp_toss = conn->c_tos;

conn_param->private_data = &dp->ricp_v4;
conn_param->private_data_len = sizeof(dp->ricp_v4);
Expand Down Expand Up @@ -389,10 +393,9 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
break;
default:
rdsdebug("Fatal QP Event %u (%s) "
"- connection %pI6c->%pI6c, reconnecting\n",
event->event, ib_event_msg(event->event),
&conn->c_laddr, &conn->c_faddr);
rdsdebug("Fatal QP Event %u (%s) - connection %pI6c->%pI6c, reconnecting\n",
event->event, ib_event_msg(event->event),
&conn->c_laddr, &conn->c_faddr);
rds_conn_drop(conn);
break;
}
Expand Down Expand Up @@ -660,13 +663,16 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6)

/* Even if len is crap *now* I still want to check it. -ASG */
if (event->param.conn.private_data_len < data_len || major == 0)
return RDS_PROTOCOL_3_0;
return RDS_PROTOCOL_4_0;

common = be16_to_cpu(mask) & RDS_IB_SUPPORTED_PROTOCOLS;
if (major == 3 && common) {
version = RDS_PROTOCOL_3_0;
if (major == 4 && common) {
version = RDS_PROTOCOL_4_0;
while ((common >>= 1) != 0)
version++;
} else if (RDS_PROTOCOL_COMPAT_VERSION ==
RDS_PROTOCOL(major, minor)) {
version = RDS_PROTOCOL_COMPAT_VERSION;
} else {
if (isv6)
printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI6c using incompatible protocol version %u.%u\n",
Expand Down Expand Up @@ -729,8 +735,10 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,

/* Check whether the remote protocol version matches ours. */
version = rds_ib_protocol_compatible(event, isv6);
if (!version)
if (!version) {
err = RDS_RDMA_REJ_INCOMPAT;
goto out;
}

dp = event->param.conn.private_data;
if (isv6) {
Expand Down Expand Up @@ -771,15 +779,16 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
daddr6 = &d_mapped_addr;
}

rdsdebug("saddr %pI6c daddr %pI6c RDSv%u.%u lguid 0x%llx fguid "
"0x%llx\n", saddr6, daddr6,
RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
rdsdebug("saddr %pI6c daddr %pI6c RDSv%u.%u lguid 0x%llx fguid 0x%llx, tos:%d\n",
saddr6, daddr6, RDS_PROTOCOL_MAJOR(version),
RDS_PROTOCOL_MINOR(version),
(unsigned long long)be64_to_cpu(lguid),
(unsigned long long)be64_to_cpu(fguid));
(unsigned long long)be64_to_cpu(fguid), dp_cmn->ricpc_dp_toss);

/* RDS/IB is not currently netns aware, thus init_net */
conn = rds_conn_create(&init_net, daddr6, saddr6,
&rds_ib_transport, GFP_KERNEL, ifindex);
&rds_ib_transport, dp_cmn->ricpc_dp_toss,
GFP_KERNEL, ifindex);
if (IS_ERR(conn)) {
rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
conn = NULL;
Expand Down Expand Up @@ -846,7 +855,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
if (conn)
mutex_unlock(&conn->c_cm_lock);
if (err)
rdma_reject(cm_id, NULL, 0);
rdma_reject(cm_id, &err, sizeof(int));
return destroy;
}

Expand All @@ -861,7 +870,7 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)

/* If the peer doesn't do protocol negotiation, we must
* default to RDSv4.1 */
rds_ib_set_protocol(conn, RDS_PROTOCOL_3_0);
rds_ib_set_protocol(conn, RDS_PROTOCOL_4_1);
ic->i_flowctl = rds_ib_sysctl_flow_control; /* advertise flow control */

ret = rds_ib_setup_qp(conn);
Expand All @@ -870,7 +879,8 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
goto out;
}

rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION,
rds_ib_cm_fill_conn_param(conn, &conn_param, &dp,
conn->c_proposed_version,
UINT_MAX, UINT_MAX, isv6);
ret = rdma_connect(cm_id, &conn_param);
if (ret)
Expand Down
Loading

0 comments on commit 042a419

Please sign in to comment.