Skip to content

Commit

Permalink
Merge branch 'rds-ipv6'
Browse files Browse the repository at this point in the history
Ka-Cheong Poon says:

====================
rds: IPv6 support

This patch set adds IPv6 support to the kernel RDS and related
modules.  Existing RDS apps using IPv4 address continue to run without
any problem.  New RDS apps which want to use IPv6 address can do so by
passing the address in struct sockaddr_in6 to bind(), connect() or
sendmsg().  And those apps also need to use the new IPv6 equivalents
of some of the existing socket options as the existing options use a
32 bit integer to store IP address.

All RDS code now uses struct in6_addr to store IP addresses.  An IPv4
address is stored as an IPv4 mapped address.

Header file changes

There are many data structures (RDS socket options) used by RDS apps
which use a 32 bit integer to store IP address. To support IPv6,
struct in6_addr needs to be used. To ensure backward compatibility, a
new data structure is introduced for each of those data structures
which use a 32 bit integer to represent an IP address. And new socket
options are introduced to use those new structures. This means that
existing apps should work without a problem with the new RDS module.
For apps which want to use IPv6, those new data structures and socket
options can be used. IPv4 mapped address is used to represent IPv4
address in the new data structures.

Internally, all RDS data structures which contain an IP address are
changed to use struct in6_addr to store the address. IPv4 address is
stored as an IPv4 mapped address. All the functions which take an IP
address as argument are also changed to use struct in6_addr.

RDS/RDMA/IB uses a private data (struct rds_ib_connect_private)
exchange between endpoints at RDS connection establishment time to
support RDMA. This private data exchange uses a 32 bit integer to
represent an IP address. This needs to be changed in order to support
IPv6. A new private data struct rds6_ib_connect_private is introduced
to handle this. To ensure backward compatibility, an IPv6 capable RDS
stack uses another RDMA listener port (RDS_CM_PORT) to accept IPv6
connection. And it continues to use the original RDS_PORT for IPv4 RDS
connections. When it needs to communicate with an IPv6 peer, it uses
the RDS_CM_PORT to send the connection set up request.

RDS/TCP changes

TCP related code is changed to support IPv6.  Note that only an IPv6
TCP listener on port RDS_TCP_PORT is created as it can accept both
IPv4 and IPv6 connection requests.

IB/RDMA changes

The initial private data exchange between IB endpoints using RDMA is
changed to support IPv6 address instead, if the peer address is IPv6.
To ensure backward compatibility, another RDMA listener port
(RDS_CM_PORT) is used to accept IPv6 connection. An IPv6 capable RDS
module continues to use the original RDS_PORT for IPv4 RDS
connections. When it needs to communicate with an IPv6 peer, it uses
the RDS_CM_PORT to send the connection set up request.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Jul 24, 2018
2 parents a6c90dd + b7ff8b1 commit 176bd86
Show file tree
Hide file tree
Showing 27 changed files with 1,543 additions and 422 deletions.
69 changes: 67 additions & 2 deletions include/uapi/linux/rds.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */
/*
* Copyright (c) 2008 Oracle. All rights reserved.
* Copyright (c) 2008, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
Expand Down Expand Up @@ -118,7 +118,17 @@
#define RDS_INFO_IB_CONNECTIONS 10008
#define RDS_INFO_CONNECTION_STATS 10009
#define RDS_INFO_IWARP_CONNECTIONS 10010
#define RDS_INFO_LAST 10010

/* PF_RDS6 options */
#define RDS6_INFO_CONNECTIONS 10011
#define RDS6_INFO_SEND_MESSAGES 10012
#define RDS6_INFO_RETRANS_MESSAGES 10013
#define RDS6_INFO_RECV_MESSAGES 10014
#define RDS6_INFO_SOCKETS 10015
#define RDS6_INFO_TCP_SOCKETS 10016
#define RDS6_INFO_IB_CONNECTIONS 10017

#define RDS_INFO_LAST 10017

struct rds_info_counter {
__u8 name[32];
Expand All @@ -140,6 +150,15 @@ struct rds_info_connection {
__u8 flags;
} __attribute__((packed));

/*
 * IPv6-capable counterpart of struct rds_info_connection: the two address
 * fields are struct in6_addr instead of 32 bit integers.  Per the series
 * description, an IPv4 address is carried here as an IPv4 mapped address.
 * The struct is packed, so no padding is inserted between fields.
 */
struct rds6_info_connection {
__u64 next_tx_seq;
__u64 next_rx_seq;
/* presumably the local and foreign (peer) addresses -- TODO confirm */
struct in6_addr laddr;
struct in6_addr faddr;
__u8 transport[TRANSNAMSIZ]; /* null term ascii */
__u8 flags;
} __attribute__((packed));

#define RDS_INFO_MESSAGE_FLAG_ACK 0x01
#define RDS_INFO_MESSAGE_FLAG_FAST_ACK 0x02

Expand All @@ -153,6 +172,17 @@ struct rds_info_message {
__u8 flags;
} __attribute__((packed));

/*
 * IPv6-capable counterpart of struct rds_info_message: addresses are
 * struct in6_addr (an IPv4 address is carried as an IPv4 mapped address,
 * per the series description).  Packed, so the layout has no padding.
 */
struct rds6_info_message {
__u64 seq;
__u32 len;
struct in6_addr laddr;
struct in6_addr faddr;
/* ports are declared __be16, i.e. big-endian 16 bit values */
__be16 lport;
__be16 fport;
/* presumably a mask of the RDS_INFO_MESSAGE_FLAG_* bits -- TODO confirm */
__u8 flags;
__u8 tos;
} __attribute__((packed));

struct rds_info_socket {
__u32 sndbuf;
__be32 bound_addr;
Expand All @@ -163,6 +193,16 @@ struct rds_info_socket {
__u64 inum;
} __attribute__((packed));

/*
 * IPv6-capable counterpart of struct rds_info_socket: bound_addr and
 * connected_addr are struct in6_addr rather than __be32 (an IPv4 address is
 * carried as an IPv4 mapped address, per the series description).  Packed,
 * so the layout has no padding.
 */
struct rds6_info_socket {
__u32 sndbuf;
struct in6_addr bound_addr;
struct in6_addr connected_addr;
/* ports are declared __be16, i.e. big-endian 16 bit values */
__be16 bound_port;
__be16 connected_port;
__u32 rcvbuf;
__u64 inum;
} __attribute__((packed));

struct rds_info_tcp_socket {
__be32 local_addr;
__be16 local_port;
Expand All @@ -175,6 +215,18 @@ struct rds_info_tcp_socket {
__u32 last_seen_una;
} __attribute__((packed));

/*
 * IPv6-capable counterpart of struct rds_info_tcp_socket: local_addr and
 * peer_addr are struct in6_addr rather than __be32 (an IPv4 address is
 * carried as an IPv4 mapped address, per the series description).  Packed,
 * so the layout has no padding; note that each address is immediately
 * followed by its port.
 */
struct rds6_info_tcp_socket {
struct in6_addr local_addr;
__be16 local_port;
struct in6_addr peer_addr;
__be16 peer_port;
__u64 hdr_rem;
__u64 data_rem;
__u32 last_sent_nxt;
__u32 last_expected_una;
__u32 last_seen_una;
} __attribute__((packed));

#define RDS_IB_GID_LEN 16
struct rds_info_rdma_connection {
__be32 src_addr;
Expand All @@ -189,6 +241,19 @@ struct rds_info_rdma_connection {
__u32 rdma_mr_size;
};

/*
 * IPv6-capable counterpart of struct rds_info_rdma_connection: src_addr and
 * dst_addr are struct in6_addr rather than __be32 (an IPv4 address is
 * carried as an IPv4 mapped address, per the series description).
 * Unlike the other rds6_info_* structs in this header, this one is not
 * declared packed.
 */
struct rds6_info_rdma_connection {
struct in6_addr src_addr;
struct in6_addr dst_addr;
/* each GID buffer is RDS_IB_GID_LEN bytes long */
__u8 src_gid[RDS_IB_GID_LEN];
__u8 dst_gid[RDS_IB_GID_LEN];

__u32 max_send_wr;
__u32 max_recv_wr;
__u32 max_send_sge;
__u32 rdma_mr_max;
__u32 rdma_mr_size;
};

/* RDS message Receive Path Latency points */
enum rds_message_rxpath_latency {
RDS_MSG_RX_HDR_TO_DGRAM_START = 0,
Expand Down
201 changes: 162 additions & 39 deletions net/rds/af_rds.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
* Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
Expand Down Expand Up @@ -35,6 +35,7 @@
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/in.h>
#include <linux/ipv6.h>
#include <linux/poll.h>
#include <net/sock.h>

Expand Down Expand Up @@ -113,26 +114,80 @@ void rds_wake_sk_sleep(struct rds_sock *rs)
/*
 * Fill uaddr with either the connected peer address (peer != 0) or the
 * bound local address (peer == 0) of the RDS socket, and return the number
 * of bytes written.
 *
 * An IPv4 mapped address is reported as a struct sockaddr_in with family
 * AF_INET; any other address is reported as a struct sockaddr_in6 with
 * family AF_INET6.  Asking for the peer when rs_conn_addr is unset returns
 * -ENOTCONN.
 *
 * NOTE(review): this listing is a rendered diff without +/- markers, so
 * lines of the pre-IPv6 version (for example the first declaration of sin,
 * the direct s_addr assignments and the final "return sizeof(*sin);")
 * appear next to the lines that replace them.
 */
static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
int peer)
{
struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
struct rds_sock *rs = rds_sk_to_rs(sock->sk);

memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
struct sockaddr_in6 *sin6;
struct sockaddr_in *sin;
int uaddr_len;

/* racey, don't care */
if (peer) {
if (!rs->rs_conn_addr)
if (ipv6_addr_any(&rs->rs_conn_addr))
return -ENOTCONN;

sin->sin_port = rs->rs_conn_port;
sin->sin_addr.s_addr = rs->rs_conn_addr;
/* IPv4 mapped peer: answer with a plain IPv4 sockaddr. */
if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) {
sin = (struct sockaddr_in *)uaddr;
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
sin->sin_family = AF_INET;
sin->sin_port = rs->rs_conn_port;
sin->sin_addr.s_addr = rs->rs_conn_addr_v4;
uaddr_len = sizeof(*sin);
} else {
sin6 = (struct sockaddr_in6 *)uaddr;
sin6->sin6_family = AF_INET6;
sin6->sin6_port = rs->rs_conn_port;
sin6->sin6_addr = rs->rs_conn_addr;
sin6->sin6_flowinfo = 0;
/* scope_id is the same as in the bound address. */
sin6->sin6_scope_id = rs->rs_bound_scope_id;
uaddr_len = sizeof(*sin6);
}
} else {
sin->sin_port = rs->rs_bound_port;
sin->sin_addr.s_addr = rs->rs_bound_addr;
/* If socket is not yet bound and the socket is connected,
* set the return address family to be the same as the
* connected address, but with 0 address value. If it is not
* connected, set the family to be AF_UNSPEC (value 0) and
* the address size to be that of an IPv4 address.
*/
if (ipv6_addr_any(&rs->rs_bound_addr)) {
if (ipv6_addr_any(&rs->rs_conn_addr)) {
sin = (struct sockaddr_in *)uaddr;
memset(sin, 0, sizeof(*sin));
sin->sin_family = AF_UNSPEC;
return sizeof(*sin);
}

if (ipv6_addr_type(&rs->rs_conn_addr) &
IPV6_ADDR_MAPPED) {
sin = (struct sockaddr_in *)uaddr;
memset(sin, 0, sizeof(*sin));
sin->sin_family = AF_INET;
return sizeof(*sin);
}

sin6 = (struct sockaddr_in6 *)uaddr;
memset(sin6, 0, sizeof(*sin6));
sin6->sin6_family = AF_INET6;
return sizeof(*sin6);
}
/* Bound socket: report the bound address in its native family. */
if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
sin = (struct sockaddr_in *)uaddr;
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
sin->sin_family = AF_INET;
sin->sin_port = rs->rs_bound_port;
sin->sin_addr.s_addr = rs->rs_bound_addr_v4;
uaddr_len = sizeof(*sin);
} else {
sin6 = (struct sockaddr_in6 *)uaddr;
sin6->sin6_family = AF_INET6;
sin6->sin6_port = rs->rs_bound_port;
sin6->sin6_addr = rs->rs_bound_addr;
sin6->sin6_flowinfo = 0;
sin6->sin6_scope_id = rs->rs_bound_scope_id;
uaddr_len = sizeof(*sin6);
}
}

sin->sin_family = AF_INET;

return sizeof(*sin);
return uaddr_len;
}

/*
Expand Down Expand Up @@ -203,26 +258,36 @@ static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
/*
 * Read a destination address from the user buffer optval (len bytes long)
 * and hand it to rds_send_drop_to().
 *
 * A buffer shorter than struct sockaddr_in is rejected with -EINVAL.  A
 * buffer shorter than struct sockaddr_in6 is assumed to hold a struct
 * sockaddr_in, whose address is converted to an IPv4 mapped address.
 * Anything else is copied as a struct sockaddr_in6.  Returns -ENOTCONN when
 * the socket has no bound address, -EFAULT when the copy from user space
 * fails, and 0 otherwise.
 *
 * NOTE(review): this listing is a rendered diff without +/- markers, so
 * lines of the pre-IPv6 version (the "rs_bound_addr == 0" test, the
 * unconditional copy into sin and the call passing &sin) appear next to
 * the lines that replace them.
 */
static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
int len)
{
struct sockaddr_in6 sin6;
struct sockaddr_in sin;
int ret = 0;

/* racing with another thread binding seems ok here */
if (rs->rs_bound_addr == 0) {
if (ipv6_addr_any(&rs->rs_bound_addr)) {
ret = -ENOTCONN; /* XXX not a great errno */
goto out;
}

/* NOTE(review): len is a signed int compared against sizeof (an unsigned
 * size_t), so a negative len is converted to a large unsigned value and
 * passes both length checks -- confirm the caller rejects negative lengths.
 */
if (len < sizeof(struct sockaddr_in)) {
ret = -EINVAL;
goto out;
} else if (len < sizeof(struct sockaddr_in6)) {
/* Assume IPv4 */
if (copy_from_user(&sin, optval, sizeof(struct sockaddr_in))) {
ret = -EFAULT;
goto out;
}
/* NOTE(review): only sin6_addr and sin6_port are filled in on this
 * path; the remaining sin6 fields stay uninitialized -- confirm that
 * rds_send_drop_to() reads nothing else.
 */
ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr);
sin6.sin6_port = sin.sin_port;
} else {
if (copy_from_user(&sin6, optval,
sizeof(struct sockaddr_in6))) {
ret = -EFAULT;
goto out;
}
}

if (copy_from_user(&sin, optval, sizeof(sin))) {
ret = -EFAULT;
goto out;
}

rds_send_drop_to(rs, &sin);
rds_send_drop_to(rs, &sin6);
out:
return ret;
}
Expand Down Expand Up @@ -435,31 +500,87 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
struct sock *sk = sock->sk;
struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
struct sockaddr_in *sin;
struct sockaddr_in6 *sin6;
struct rds_sock *rs = rds_sk_to_rs(sk);
int addr_type;
int ret = 0;

lock_sock(sk);

if (addr_len != sizeof(struct sockaddr_in)) {
ret = -EINVAL;
goto out;
}
switch (uaddr->sa_family) {
case AF_INET:
sin = (struct sockaddr_in *)uaddr;
if (addr_len < sizeof(struct sockaddr_in)) {
ret = -EINVAL;
break;
}
if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
ret = -EDESTADDRREQ;
break;
}
if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) ||
sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) {
ret = -EINVAL;
break;
}
ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr);
rs->rs_conn_port = sin->sin_port;
break;

if (sin->sin_family != AF_INET) {
ret = -EAFNOSUPPORT;
goto out;
}
case AF_INET6:
sin6 = (struct sockaddr_in6 *)uaddr;
if (addr_len < sizeof(struct sockaddr_in6)) {
ret = -EINVAL;
break;
}
addr_type = ipv6_addr_type(&sin6->sin6_addr);
if (!(addr_type & IPV6_ADDR_UNICAST)) {
__be32 addr4;

if (!(addr_type & IPV6_ADDR_MAPPED)) {
ret = -EPROTOTYPE;
break;
}

/* It is a mapped address. Need to do some sanity
* checks.
*/
addr4 = sin6->sin6_addr.s6_addr32[3];
if (addr4 == htonl(INADDR_ANY) ||
addr4 == htonl(INADDR_BROADCAST) ||
IN_MULTICAST(ntohl(addr4))) {
ret = -EPROTOTYPE;
break;
}
}

if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
ret = -EDESTADDRREQ;
goto out;
}
if (addr_type & IPV6_ADDR_LINKLOCAL) {
/* If socket is already bound to a link local address,
* the peer address must be on the same link.
*/
if (sin6->sin6_scope_id == 0 ||
(!ipv6_addr_any(&rs->rs_bound_addr) &&
rs->rs_bound_scope_id &&
sin6->sin6_scope_id != rs->rs_bound_scope_id)) {
ret = -EINVAL;
break;
}
/* Remember the connected address scope ID. It will
* be checked against the binding local address when
* the socket is bound.
*/
rs->rs_bound_scope_id = sin6->sin6_scope_id;
}
rs->rs_conn_addr = sin6->sin6_addr;
rs->rs_conn_port = sin6->sin6_port;
break;

rs->rs_conn_addr = sin->sin_addr.s_addr;
rs->rs_conn_port = sin->sin_port;
default:
ret = -EAFNOSUPPORT;
break;
}

out:
release_sock(sk);
return ret;
}
Expand Down Expand Up @@ -578,8 +699,10 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len,
list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
total++;
if (total <= len)
rds_inc_info_copy(inc, iter, inc->i_saddr,
rs->rs_bound_addr, 1);
rds_inc_info_copy(inc, iter,
inc->i_saddr.s6_addr32[3],
rs->rs_bound_addr_v4,
1);
}

read_unlock(&rs->rs_recv_lock);
Expand Down Expand Up @@ -608,8 +731,8 @@ static void rds_sock_info(struct socket *sock, unsigned int len,
list_for_each_entry(rs, &rds_sock_list, rs_item) {
sinfo.sndbuf = rds_sk_sndbuf(rs);
sinfo.rcvbuf = rds_sk_rcvbuf(rs);
sinfo.bound_addr = rs->rs_bound_addr;
sinfo.connected_addr = rs->rs_conn_addr;
sinfo.bound_addr = rs->rs_bound_addr_v4;
sinfo.connected_addr = rs->rs_conn_addr_v4;
sinfo.bound_port = rs->rs_bound_port;
sinfo.connected_port = rs->rs_conn_port;
sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));
Expand Down
Loading

0 comments on commit 176bd86

Please sign in to comment.