From 6f89dbce8e1134458de8a8e376acaaca4eee602e Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 15 Feb 2018 10:49:32 -0800 Subject: [PATCH 1/7] skbuff: export mm_[un]account_pinned_pages for other modules RDS would like to use the helper functions for managing pinned pages added by Commit a91dbff551a6 ("sock: ulimit on MSG_ZEROCOPY pages") Signed-off-by: Sowmini Varadhan Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 +++ net/core/skbuff.c | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5ebc0f869720a..b1cc38af53e16 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -466,6 +466,9 @@ struct ubuf_info { #define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg)) +int mm_account_pinned_pages(struct mmpin *mmp, size_t size); +void mm_unaccount_pinned_pages(struct mmpin *mmp); + struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size); struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, struct ubuf_info *uarg); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 09bd89c90a71c..1a7485a2cdfa8 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -890,7 +890,7 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) } EXPORT_SYMBOL_GPL(skb_morph); -static int mm_account_pinned_pages(struct mmpin *mmp, size_t size) +int mm_account_pinned_pages(struct mmpin *mmp, size_t size) { unsigned long max_pg, num_pg, new_pg, old_pg; struct user_struct *user; @@ -919,14 +919,16 @@ static int mm_account_pinned_pages(struct mmpin *mmp, size_t size) return 0; } +EXPORT_SYMBOL_GPL(mm_account_pinned_pages); -static void mm_unaccount_pinned_pages(struct mmpin *mmp) +void mm_unaccount_pinned_pages(struct mmpin *mmp) { if (mmp->user) { atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm); free_uid(mmp->user); } } +EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) { From ea8994cb0118993da179af836cc72a583d75dc4b Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 15 Feb 2018 10:49:33 -0800 Subject: [PATCH 2/7] rds: hold a sock ref from rds_message to the rds_sock The existing model holds a reference from the rds_sock to the rds_message, but the rds_message does not itself hold a sock_put() on the rds_sock. Instead the m_rs field in the rds_message is assigned when the message is queued on the sock, and nulled when the message is dequeued from the sock. We want to be able to notify userspace when the rds_message is actually freed (from rds_message_purge(), after the refcounts to the rds_message go to 0). At the time that rds_message_purge() is called, the message is no longer on the rds_sock retransmit queue. Thus the explicit reference for the m_rs is needed to send a notification that will signal to userspace that it is now safe to free/reuse any pages that may have been pinned down for zerocopy. This patch manages the m_rs assignment in the rds_message with the necessary refcount book-keeping. Signed-off-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/message.c | 8 +++++++- net/rds/send.c | 7 +------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/net/rds/message.c b/net/rds/message.c index 4318cc9b78f7b..ef3daafa3d79e 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -58,7 +58,7 @@ EXPORT_SYMBOL_GPL(rds_message_addref); */ static void rds_message_purge(struct rds_message *rm) { - unsigned long i; + unsigned long i, flags; if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) return; @@ -69,6 +69,12 @@ static void rds_message_purge(struct rds_message *rm) __free_page(sg_page(&rm->data.op_sg[i])); } rm->data.op_nents = 0; + spin_lock_irqsave(&rm->m_rs_lock, flags); + if (rm->m_rs) { + sock_put(rds_rs_to_sk(rm->m_rs)); + rm->m_rs = NULL; + } + spin_unlock_irqrestore(&rm->m_rs_lock, flags); if (rm->rdma.op_active) rds_rdma_free_op(&rm->rdma); diff --git a/net/rds/send.c b/net/rds/send.c index b1b0022b8370b..e8f3ff471b15b 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -649,7 +649,6 @@ static void rds_send_remove_from_sock(struct list_head *messages, int status) rm->rdma.op_notifier = NULL; } was_on_sock = 1; - rm->m_rs = NULL; } spin_unlock(&rs->rs_lock); @@ -756,9 +755,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) */ if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { spin_unlock_irqrestore(&cp->cp_lock, flags); - spin_lock_irqsave(&rm->m_rs_lock, flags); - rm->m_rs = NULL; - spin_unlock_irqrestore(&rm->m_rs_lock, flags); continue; } list_del_init(&rm->m_conn_item); @@ -774,7 +770,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); spin_unlock(&rs->rs_lock); - rm->m_rs = NULL; spin_unlock_irqrestore(&rm->m_rs_lock, flags); rds_message_put(rm); @@ -798,7 +793,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); spin_unlock(&rs->rs_lock); - rm->m_rs = NULL; spin_unlock_irqrestore(&rm->m_rs_lock, flags); rds_message_put(rm); @@ -849,6 +843,7 @@ static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn, list_add_tail(&rm->m_sock_item, &rs->rs_send_queue); set_bit(RDS_MSG_ON_SOCK, &rm->m_flags); rds_message_addref(rm); + sock_hold(rds_rs_to_sk(rs)); rm->m_rs = rs; /* The code ordering is a little weird, but we're From 28190752c709272de3c2b6b092029da3f1614c5a Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 15 Feb 2018 10:49:34 -0800 Subject: [PATCH 3/7] sock: permit SO_ZEROCOPY on PF_RDS socket allow the application to set SO_ZEROCOPY on the underlying sk of a PF_RDS socket Signed-off-by: Sowmini Varadhan Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/sock.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index e90d461748f0f..a1fa4a548f1be 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1049,18 +1049,21 @@ int sock_setsockopt(struct socket *sock, int level, int optname, break; case SO_ZEROCOPY: - if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6) + if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { + if (sk->sk_protocol != IPPROTO_TCP) + ret = -ENOTSUPP; + else if (sk->sk_state != TCP_CLOSE) + ret = -EBUSY; + } else if (sk->sk_family != PF_RDS) { ret = -ENOTSUPP; - else if (sk->sk_protocol != IPPROTO_TCP) - ret = -ENOTSUPP; - else if (sk->sk_state != TCP_CLOSE) - ret = -EBUSY; - else if (val < 0 || val > 1) - ret = -EINVAL; - else - sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); - break; - + } + if (!ret) { + if (val < 0 || val > 1) + ret = -EINVAL; + else + sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); + break; + } default: ret = -ENOPROTOOPT; break; From 01883eda72bd3f0a6c81447e4f223de14033fd9d Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 15 Feb 2018 10:49:35 -0800 Subject: [PATCH 4/7] rds: support for zcopy completion notification RDS removes a datagram (rds_message) from the retransmit queue when an ACK is received. The ACK indicates that the receiver has queued the RDS datagram, so that the sender can safely forget the datagram. When all references to the rds_message are quiesced, rds_message_purge is called to release resources used by the rds_message If the datagram to be removed had pinned pages set up, add an entry to the rs->rs_znotify_queue so that the notifcation will be sent up via rds_rm_zerocopy_callback() when the rds_message is eventually freed by rds_message_purge. rds_rm_zerocopy_callback() attempts to batch the number of cookies sent with each notification to a max of SO_EE_ORIGIN_MAX_ZCOOKIES. This is achieved by checking the tail skb in the sk_error_queue: if this has room for one more cookie, the cookie from the current notification is added; else a new skb is added to the sk_error_queue. Every invocation of rds_rm_zerocopy_callback() will trigger a ->sk_error_report to notify the application. Signed-off-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/uapi/linux/errqueue.h | 2 + net/rds/af_rds.c | 2 + net/rds/message.c | 83 ++++++++++++++++++++++++++++++++--- net/rds/rds.h | 14 ++++++ net/rds/recv.c | 2 + 5 files changed, 96 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index dc64cfaf13da0..28812eda42090 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -20,11 +20,13 @@ struct sock_extended_err { #define SO_EE_ORIGIN_ICMP6 3 #define SO_EE_ORIGIN_TXSTATUS 4 #define SO_EE_ORIGIN_ZEROCOPY 5 +#define SO_EE_ORIGIN_ZCOOKIE 6 #define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS #define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1)) #define SO_EE_CODE_ZEROCOPY_COPIED 1 +#define SO_EE_ORIGIN_MAX_ZCOOKIES 8 /** * struct scm_timestamping - timestamps exposed through cmsg diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 0a8eefd256b3e..a937f18896aee 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -182,6 +182,8 @@ static __poll_t rds_poll(struct file *file, struct socket *sock, mask |= (EPOLLIN | EPOLLRDNORM); if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) mask |= (EPOLLOUT | EPOLLWRNORM); + if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + mask |= POLLERR; read_unlock_irqrestore(&rs->rs_recv_lock, flags); /* clear state any time we wake a seen-congested socket */ diff --git a/net/rds/message.c b/net/rds/message.c index ef3daafa3d79e..bf1a656b198a0 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -33,6 +33,9 @@ #include #include #include +#include +#include +#include #include "rds.h" @@ -53,29 +56,95 @@ void rds_message_addref(struct rds_message *rm) } EXPORT_SYMBOL_GPL(rds_message_addref); +static inline bool skb_zcookie_add(struct sk_buff *skb, u32 cookie) +{ + struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + int ncookies; + u32 *ptr; + + if (serr->ee.ee_origin != SO_EE_ORIGIN_ZCOOKIE) + return false; + ncookies = serr->ee.ee_data; + if (ncookies == SO_EE_ORIGIN_MAX_ZCOOKIES) + return false; + ptr = skb_put(skb, sizeof(u32)); + *ptr = cookie; + serr->ee.ee_data = ++ncookies; + return true; +} + +static void rds_rm_zerocopy_callback(struct rds_sock *rs, + struct rds_znotifier *znotif) +{ + struct sock *sk = rds_rs_to_sk(rs); + struct sk_buff *skb, *tail; + struct sock_exterr_skb *serr; + unsigned long flags; + struct sk_buff_head *q; + u32 cookie = znotif->z_cookie; + + q = &sk->sk_error_queue; + spin_lock_irqsave(&q->lock, flags); + tail = skb_peek_tail(q); + + if (tail && skb_zcookie_add(tail, cookie)) { + spin_unlock_irqrestore(&q->lock, flags); + mm_unaccount_pinned_pages(&znotif->z_mmp); + consume_skb(rds_skb_from_znotifier(znotif)); + sk->sk_error_report(sk); + return; + } + + skb = rds_skb_from_znotifier(znotif); + serr = SKB_EXT_ERR(skb); + memset(&serr->ee, 0, sizeof(serr->ee)); + serr->ee.ee_errno = 0; + serr->ee.ee_origin = SO_EE_ORIGIN_ZCOOKIE; + serr->ee.ee_info = 0; + WARN_ON(!skb_zcookie_add(skb, cookie)); + + __skb_queue_tail(q, skb); + + spin_unlock_irqrestore(&q->lock, flags); + sk->sk_error_report(sk); + + mm_unaccount_pinned_pages(&znotif->z_mmp); +} + /* * This relies on dma_map_sg() not touching sg[].page during merging. */ static void rds_message_purge(struct rds_message *rm) { unsigned long i, flags; + bool zcopy = false; if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) return; - for (i = 0; i < rm->data.op_nents; i++) { - rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i])); - /* XXX will have to put_page for page refs */ - __free_page(sg_page(&rm->data.op_sg[i])); - } - rm->data.op_nents = 0; spin_lock_irqsave(&rm->m_rs_lock, flags); if (rm->m_rs) { - sock_put(rds_rs_to_sk(rm->m_rs)); + struct rds_sock *rs = rm->m_rs; + + if (rm->data.op_mmp_znotifier) { + zcopy = true; + rds_rm_zerocopy_callback(rs, rm->data.op_mmp_znotifier); + rm->data.op_mmp_znotifier = NULL; + } + sock_put(rds_rs_to_sk(rs)); rm->m_rs = NULL; } spin_unlock_irqrestore(&rm->m_rs_lock, flags); + for (i = 0; i < rm->data.op_nents; i++) { + /* XXX will have to put_page for page refs */ + if (!zcopy) + __free_page(sg_page(&rm->data.op_sg[i])); + else + put_page(sg_page(&rm->data.op_sg[i])); + } + rm->data.op_nents = 0; + if (rm->rdma.op_active) rds_rdma_free_op(&rm->rdma); if (rm->rdma.op_rdma_mr) diff --git a/net/rds/rds.h b/net/rds/rds.h index 7301b9b01890e..24576bc4a5e93 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -356,6 +356,19 @@ static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) #define RDS_MSG_PAGEVEC 7 #define RDS_MSG_FLUSH 8 +struct rds_znotifier { + struct list_head z_list; + struct mmpin z_mmp; + u32 z_cookie; +}; + +#define RDS_ZCOPY_SKB(__skb) ((struct rds_znotifier *)&((__skb)->cb[0])) + +static inline struct sk_buff *rds_skb_from_znotifier(struct rds_znotifier *z) +{ + return container_of((void *)z, struct sk_buff, cb); +} + struct rds_message { refcount_t m_refcount; struct list_head m_sock_item; @@ -436,6 +449,7 @@ struct rds_message { unsigned int op_count; unsigned int op_dmasg; unsigned int op_dmaoff; + struct rds_znotifier *op_mmp_znotifier; struct scatterlist *op_sg; } data; }; diff --git a/net/rds/recv.c b/net/rds/recv.c index b25bcfe411ca6..b080961464df7 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -594,6 +594,8 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, if (msg_flags & MSG_OOB) goto out; + if (msg_flags & MSG_ERRQUEUE) + return sock_recv_errqueue(sk, msg, size, SOL_IP, IP_RECVERR); while (1) { /* If there are pending notifications, do those - and nothing else */ From 0cebaccef3acbdfbc2d85880a2efb765d2f4e2e3 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 15 Feb 2018 10:49:36 -0800 Subject: [PATCH 5/7] rds: zerocopy Tx support. If the MSG_ZEROCOPY flag is specified with rds_sendmsg(), and, if the SO_ZEROCOPY socket option has been set on the PF_RDS socket, application pages sent down with rds_sendmsg() are pinned. The pinning uses the accounting infrastructure added by Commit a91dbff551a6 ("sock: ulimit on MSG_ZEROCOPY pages") The payload bytes in the message may not be modified for the duration that the message has been pinned. A multi-threaded application using this infrastructure may thus need to be notified about send-completion so that it can free/reuse the buffers passed to rds_sendmsg(). Notification of send-completion will identify each message-buffer by a cookie that the application must specify as ancillary data to rds_sendmsg(). The ancillary data in this case has cmsg_level == SOL_RDS and cmsg_type == RDS_CMSG_ZCOPY_COOKIE. Signed-off-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/uapi/linux/rds.h | 1 + net/rds/message.c | 51 +++++++++++++++++++++++++++++++++++++++- net/rds/rds.h | 3 ++- net/rds/send.c | 44 +++++++++++++++++++++++++++++----- 4 files changed, 91 insertions(+), 8 deletions(-) diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h index e71d4491f225f..12e3bca32cadb 100644 --- a/include/uapi/linux/rds.h +++ b/include/uapi/linux/rds.h @@ -103,6 +103,7 @@ #define RDS_CMSG_MASKED_ATOMIC_FADD 8 #define RDS_CMSG_MASKED_ATOMIC_CSWP 9 #define RDS_CMSG_RXPATH_LATENCY 11 +#define RDS_CMSG_ZCOPY_COOKIE 12 #define RDS_INFO_FIRST 10000 #define RDS_INFO_COUNTERS 10000 diff --git a/net/rds/message.c b/net/rds/message.c index bf1a656b198a0..651834513481a 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -341,12 +341,14 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in return rm; } -int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from) +int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from, + bool zcopy) { unsigned long to_copy, nbytes; unsigned long sg_off; struct scatterlist *sg; int ret = 0; + int length = iov_iter_count(from); rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from)); @@ -356,6 +358,53 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from) sg = rm->data.op_sg; sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ + if (zcopy) { + int total_copied = 0; + struct sk_buff *skb; + + skb = alloc_skb(SO_EE_ORIGIN_MAX_ZCOOKIES * sizeof(u32), + GFP_KERNEL); + if (!skb) + return -ENOMEM; + rm->data.op_mmp_znotifier = RDS_ZCOPY_SKB(skb); + if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp, + length)) { + ret = -ENOMEM; + goto err; + } + while (iov_iter_count(from)) { + struct page *pages; + size_t start; + ssize_t copied; + + copied = iov_iter_get_pages(from, &pages, PAGE_SIZE, + 1, &start); + if (copied < 0) { + struct mmpin *mmp; + int i; + + for (i = 0; i < rm->data.op_nents; i++) + put_page(sg_page(&rm->data.op_sg[i])); + mmp = &rm->data.op_mmp_znotifier->z_mmp; + mm_unaccount_pinned_pages(mmp); + ret = -EFAULT; + goto err; + } + total_copied += copied; + iov_iter_advance(from, copied); + length -= copied; + sg_set_page(sg, pages, copied, start); + rm->data.op_nents++; + sg++; + } + WARN_ON_ONCE(length != 0); + return ret; +err: + consume_skb(skb); + rm->data.op_mmp_znotifier = NULL; + return ret; + } /* zcopy */ + while (iov_iter_count(from)) { if (!sg_page(sg)) { ret = rds_page_remainder_alloc(sg, iov_iter_count(from), diff --git a/net/rds/rds.h b/net/rds/rds.h index 24576bc4a5e93..31cd388520502 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -785,7 +785,8 @@ rds_conn_connecting(struct rds_connection *conn) /* message.c */ struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents); -int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from); +int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from, + bool zcopy); struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); void rds_message_populate_header(struct rds_header *hdr, __be16 sport, __be16 dport, u64 seq); diff --git a/net/rds/send.c b/net/rds/send.c index e8f3ff471b15b..028ab598ac1b7 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -875,12 +875,13 @@ static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn, * rds_message is getting to be quite complicated, and we'd like to allocate * it all in one go. This figures out how big it needs to be up front. */ -static int rds_rm_size(struct msghdr *msg, int data_len) +static int rds_rm_size(struct msghdr *msg, int num_sgs) { struct cmsghdr *cmsg; int size = 0; int cmsg_groups = 0; int retval; + bool zcopy_cookie = false; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) @@ -899,6 +900,8 @@ static int rds_rm_size(struct msghdr *msg, int data_len) break; + case RDS_CMSG_ZCOPY_COOKIE: + zcopy_cookie = true; case RDS_CMSG_RDMA_DEST: case RDS_CMSG_RDMA_MAP: cmsg_groups |= 2; @@ -919,7 +922,10 @@ static int rds_rm_size(struct msghdr *msg, int data_len) } - size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist); + if ((msg->msg_flags & MSG_ZEROCOPY) && !zcopy_cookie) + return -EINVAL; + + size += num_sgs * sizeof(struct scatterlist); /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */ if (cmsg_groups == 3) @@ -928,6 +934,18 @@ static int rds_rm_size(struct msghdr *msg, int data_len) return size; } +static int rds_cmsg_zcopy(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + u32 *cookie; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(*cookie))) + return -EINVAL; + cookie = CMSG_DATA(cmsg); + rm->data.op_mmp_znotifier->z_cookie = *cookie; + return 0; +} + static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, struct msghdr *msg, int *allocated_mr) { @@ -970,6 +988,10 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, ret = rds_cmsg_atomic(rs, rm, cmsg); break; + case RDS_CMSG_ZCOPY_COOKIE: + ret = rds_cmsg_zcopy(rs, rm, cmsg); + break; + default: return -EINVAL; } @@ -1040,10 +1062,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) long timeo = sock_sndtimeo(sk, nonblock); struct rds_conn_path *cpath; size_t total_payload_len = payload_len, rdma_payload_len = 0; + bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) && + sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY)); + int num_sgs = ceil(payload_len, PAGE_SIZE); /* Mirror Linux UDP mirror of BSD error message compatibility */ /* XXX: Perhaps MSG_MORE someday */ - if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) { + if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT | MSG_ZEROCOPY)) { ret = -EOPNOTSUPP; goto out; } @@ -1087,8 +1112,15 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) goto out; } + if (zcopy) { + if (rs->rs_transport->t_type != RDS_TRANS_TCP) { + ret = -EOPNOTSUPP; + goto out; + } + num_sgs = iov_iter_npages(&msg->msg_iter, INT_MAX); + } /* size of rm including all sgs */ - ret = rds_rm_size(msg, payload_len); + ret = rds_rm_size(msg, num_sgs); if (ret < 0) goto out; @@ -1100,12 +1132,12 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) /* Attach data to the rm */ if (payload_len) { - rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE)); + rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); if (!rm->data.op_sg) { ret = -ENOMEM; goto out; } - ret = rds_message_copy_from_user(rm, &msg->msg_iter); + ret = rds_message_copy_from_user(rm, &msg->msg_iter, zcopy); if (ret) goto out; } From b16ac920403e5623150e428e7f5429e8eb607308 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 15 Feb 2018 10:49:37 -0800 Subject: [PATCH 6/7] selftests/net: add support for PF_RDS sockets Add support for basic PF_RDS client-server testing in msg_zerocopy Signed-off-by: Sowmini Varadhan Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- tools/testing/selftests/net/msg_zerocopy.c | 65 +++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/msg_zerocopy.c b/tools/testing/selftests/net/msg_zerocopy.c index e11fe84de0fd9..7a5b35362f848 100644 --- a/tools/testing/selftests/net/msg_zerocopy.c +++ b/tools/testing/selftests/net/msg_zerocopy.c @@ -14,6 +14,9 @@ * - SOCK_DGRAM * - SOCK_RAW * + * PF_RDS + * - SOCK_SEQPACKET + * * Start this program on two connected hosts, one in send mode and * the other with option '-r' to put it in receiver mode. * @@ -53,6 +56,7 @@ #include #include #include +#include #ifndef SO_EE_ORIGIN_ZEROCOPY #define SO_EE_ORIGIN_ZEROCOPY 5 @@ -300,10 +304,15 @@ static int do_setup_tx(int domain, int type, int protocol) if (cfg_zerocopy) do_setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, 1); - if (domain != PF_PACKET) + if (domain != PF_PACKET && domain != PF_RDS) if (connect(fd, (void *) &cfg_dst_addr, cfg_alen)) error(1, errno, "connect"); + if (domain == PF_RDS) { + if (bind(fd, (void *) &cfg_src_addr, cfg_alen)) + error(1, errno, "bind"); + } + return fd; } @@ -444,6 +453,13 @@ static void do_tx(int domain, int type, int protocol) msg.msg_iovlen++; } + if (domain == PF_RDS) { + msg.msg_name = &cfg_dst_addr; + msg.msg_namelen = (cfg_dst_addr.ss_family == AF_INET ? + sizeof(struct sockaddr_in) : + sizeof(struct sockaddr_in6)); + } + iov[2].iov_base = payload; iov[2].iov_len = cfg_payload_len; msg.msg_iovlen++; @@ -555,6 +571,40 @@ static void do_flush_datagram(int fd, int type) bytes += cfg_payload_len; } + +static void do_recvmsg(int fd) +{ + int ret, off = 0; + char *buf; + struct iovec iov; + struct msghdr msg; + struct sockaddr_storage din; + + buf = calloc(cfg_payload_len, sizeof(char)); + iov.iov_base = buf; + iov.iov_len = cfg_payload_len; + + memset(&msg, 0, sizeof(msg)); + msg.msg_name = &din; + msg.msg_namelen = sizeof(din); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + ret = recvmsg(fd, &msg, MSG_TRUNC); + + if (ret == -1) + error(1, errno, "recv"); + if (ret != cfg_payload_len) + error(1, 0, "recv: ret=%u != %u", ret, cfg_payload_len); + + if (memcmp(buf + off, payload, ret)) + error(1, 0, "recv: data mismatch"); + + free(buf); + packets++; + bytes += cfg_payload_len; +} + static void do_rx(int domain, int type, int protocol) { uint64_t tstop; @@ -566,6 +616,8 @@ static void do_rx(int domain, int type, int protocol) do { if (type == SOCK_STREAM) do_flush_tcp(fd); + else if (domain == PF_RDS) + do_recvmsg(fd); else do_flush_datagram(fd, type); @@ -610,6 +662,7 @@ static void parse_opts(int argc, char **argv) 40 /* max tcp options */; int c; char *daddr = NULL, *saddr = NULL; + char *cfg_test; cfg_payload_len = max_payload_len; @@ -667,6 +720,14 @@ static void parse_opts(int argc, char **argv) break; } } + + cfg_test = argv[argc - 1]; + if (strcmp(cfg_test, "rds") == 0) { + if (!daddr) + error(1, 0, "-D required for PF_RDS\n"); + if (!cfg_rx && !saddr) + error(1, 0, "-S required for PF_RDS\n"); + } setup_sockaddr(cfg_family, daddr, &cfg_dst_addr); setup_sockaddr(cfg_family, saddr, &cfg_src_addr); @@ -699,6 +760,8 @@ int main(int argc, char **argv) do_test(cfg_family, SOCK_STREAM, 0); else if (!strcmp(cfg_test, "udp")) do_test(cfg_family, SOCK_DGRAM, 0); + else if (!strcmp(cfg_test, "rds")) + do_test(PF_RDS, SOCK_SEQPACKET, 0); else error(1, 0, "unknown cfg_test %s", cfg_test); From dfb8434b0a94651dac7adf373900561d8f6499e4 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 15 Feb 2018 10:49:38 -0800 Subject: [PATCH 7/7] selftests/net: add zerocopy support for PF_RDS test case Send a cookie with sendmsg() on PF_RDS sockets, and process the returned batched cookies in do_recv_completion() Signed-off-by: Sowmini Varadhan Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- tools/testing/selftests/net/msg_zerocopy.c | 68 ++++++++++++++++++++-- 1 file changed, 64 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/msg_zerocopy.c b/tools/testing/selftests/net/msg_zerocopy.c index 7a5b35362f848..5cc2a53bb71cd 100644 --- a/tools/testing/selftests/net/msg_zerocopy.c +++ b/tools/testing/selftests/net/msg_zerocopy.c @@ -168,17 +168,39 @@ static int do_accept(int fd) return fd; } -static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy) +static void add_zcopy_cookie(struct msghdr *msg, uint32_t cookie) +{ + struct cmsghdr *cm; + + if (!msg->msg_control) + error(1, errno, "NULL cookie"); + cm = (void *)msg->msg_control; + cm->cmsg_len = CMSG_LEN(sizeof(cookie)); + cm->cmsg_level = SOL_RDS; + cm->cmsg_type = RDS_CMSG_ZCOPY_COOKIE; + memcpy(CMSG_DATA(cm), &cookie, sizeof(cookie)); +} + +static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy, int domain) { int ret, len, i, flags; + static uint32_t cookie; + char ckbuf[CMSG_SPACE(sizeof(cookie))]; len = 0; for (i = 0; i < msg->msg_iovlen; i++) len += msg->msg_iov[i].iov_len; flags = MSG_DONTWAIT; - if (do_zerocopy) + if (do_zerocopy) { flags |= MSG_ZEROCOPY; + if (domain == PF_RDS) { + memset(&msg->msg_control, 0, sizeof(msg->msg_control)); + msg->msg_controllen = CMSG_SPACE(sizeof(cookie)); + msg->msg_control = (struct cmsghdr *)ckbuf; + add_zcopy_cookie(msg, ++cookie); + } + } ret = sendmsg(fd, msg, flags); if (ret == -1 && errno == EAGAIN) @@ -194,6 +216,10 @@ static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy) if (do_zerocopy && ret) expected_completions++; } + if (do_zerocopy && domain == PF_RDS) { + msg->msg_control = NULL; + msg->msg_controllen = 0; + } return true; } @@ -220,7 +246,9 @@ static void do_sendmsg_corked(int fd, struct msghdr *msg) msg->msg_iov[0].iov_len = payload_len + extra_len; extra_len = 0; - do_sendmsg(fd, msg, do_zerocopy); + do_sendmsg(fd, msg, do_zerocopy, + (cfg_dst_addr.ss_family == AF_INET ? + PF_INET : PF_INET6)); } do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0); @@ -316,6 +344,26 @@ static int do_setup_tx(int domain, int type, int protocol) return fd; } +static int do_process_zerocopy_cookies(struct sock_extended_err *serr, + uint32_t *ckbuf, size_t nbytes) +{ + int ncookies, i; + + if (serr->ee_errno != 0) + error(1, 0, "serr: wrong error code: %u", serr->ee_errno); + ncookies = serr->ee_data; + if (ncookies > SO_EE_ORIGIN_MAX_ZCOOKIES) + error(1, 0, "Returned %d cookies, max expected %d\n", + ncookies, SO_EE_ORIGIN_MAX_ZCOOKIES); + if (nbytes != ncookies * sizeof(uint32_t)) + error(1, 0, "Expected %d cookies, got %ld\n", + ncookies, nbytes/sizeof(uint32_t)); + for (i = 0; i < ncookies; i++) + if (cfg_verbose >= 2) + fprintf(stderr, "%d\n", ckbuf[i]); + return ncookies; +} + static bool do_recv_completion(int fd) { struct sock_extended_err *serr; @@ -324,10 +372,17 @@ static bool do_recv_completion(int fd) uint32_t hi, lo, range; int ret, zerocopy; char control[100]; + uint32_t ckbuf[SO_EE_ORIGIN_MAX_ZCOOKIES]; + struct iovec iov; msg.msg_control = control; msg.msg_controllen = sizeof(control); + iov.iov_base = ckbuf; + iov.iov_len = (SO_EE_ORIGIN_MAX_ZCOOKIES * sizeof(ckbuf[0])); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + ret = recvmsg(fd, &msg, MSG_ERRQUEUE); if (ret == -1 && errno == EAGAIN) return false; @@ -346,6 +401,11 @@ static bool do_recv_completion(int fd) cm->cmsg_level, cm->cmsg_type); serr = (void *) CMSG_DATA(cm); + + if (serr->ee_origin == SO_EE_ORIGIN_ZCOOKIE) { + completions += do_process_zerocopy_cookies(serr, ckbuf, ret); + return true; + } if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) error(1, 0, "serr: wrong origin: %u", serr->ee_origin); if (serr->ee_errno != 0) @@ -470,7 +530,7 @@ static void do_tx(int domain, int type, int protocol) if (cfg_cork) do_sendmsg_corked(fd, &msg); else - do_sendmsg(fd, &msg, cfg_zerocopy); + do_sendmsg(fd, &msg, cfg_zerocopy, domain); while (!do_poll(fd, POLLOUT)) { if (cfg_zerocopy)