Merge branch 'splice-net-replace-sendpage-with-sendmsg-msg_splice_pages-part-1'

David Howells says:

====================
splice, net: Replace sendpage with sendmsg(MSG_SPLICE_PAGES), part 1

Here's the first tranche of patches towards providing a MSG_SPLICE_PAGES
internal sendmsg flag that is intended to replace the ->sendpage() op with
calls to sendmsg().  MSG_SPLICE_PAGES is a hint that tells the protocol
that it should splice the pages supplied if it can and copy them if not.

This will allow splice to pass multiple pages in a single call and allow
certain parts of higher protocols (e.g. sunrpc, iwarp) to pass an entire
message in one go rather than having to send it piecemeal.  This should
also make it easier to handle the splicing of multipage folios.

A helper, skb_splice_from_iter(), is provided to do the work of splicing or
copying data from an iterator.  If a page is determined to be unspliceable
(for example, a slab page), the helper returns an error.
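
As a rough sketch (not taken from the patches themselves; the page, len,
offset and sock variables are assumed to exist in the caller), splicing a
page from kernel code then looks like this:

	struct bio_vec bvec;
	struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES };
	int ret;

	/* Point the message's iterator at the page fragment to send. */
	bvec_set_page(&bvec, page, len, offset);
	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);

	/* The protocol splices the page into the skbuff if it can and
	 * copies it if not.
	 */
	ret = sock_sendmsg(sock, &msg);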

Note that this facility is not made available to userspace and does not
provide any sort of callback.

This set consists of the following parts:

 (1) Define the MSG_SPLICE_PAGES flag and prevent sys_sendmsg() from being
     able to set it.

 (2) Add an extra argument to skb_append_pagefrags() so that something
     other than MAX_SKB_FRAGS can be used (sysctl_max_skb_frags for
     example).

 (3) Add the skb_splice_from_iter() helper to handle splicing pages into
     skbuffs for MSG_SPLICE_PAGES that can be shared by TCP, IP/UDP and
     AF_UNIX.

 (4) Implement MSG_SPLICE_PAGES support in TCP.

 (5) Make do_tcp_sendpages() just wrap sendmsg() and then fold it into its
     various callers.

 (6) Implement MSG_SPLICE_PAGES support in IP and make udp_sendpage() just
     a wrapper around sendmsg().

 (7) Implement MSG_SPLICE_PAGES support in IP6/UDP6.

 (8) Implement MSG_SPLICE_PAGES support in AF_UNIX.

 (9) Make AF_UNIX copy unspliceable pages.

Link: https://lore.kernel.org/r/20230316152618.711970-1-dhowells@redhat.com/ # v1
Link: https://lore.kernel.org/r/20230329141354.516864-1-dhowells@redhat.com/ # v2
Link: https://lore.kernel.org/r/20230331160914.1608208-1-dhowells@redhat.com/ # v3
Link: https://lore.kernel.org/r/20230405165339.3468808-1-dhowells@redhat.com/ # v4
Link: https://lore.kernel.org/r/20230406094245.3633290-1-dhowells@redhat.com/ # v5
Link: https://lore.kernel.org/r/20230411160902.4134381-1-dhowells@redhat.com/ # v6
Link: https://lore.kernel.org/r/20230515093345.396978-1-dhowells@redhat.com/ # v7
Link: https://lore.kernel.org/r/20230518113453.1350757-1-dhowells@redhat.com/ # v8
====================

Link: https://lore.kernel.org/r/20230522121125.2595254-1-dhowells@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Committed by Jakub Kicinski on May 24, 2023
commit 51c78a4 (2 parents: 57910a4 + 57d44a3)
Showing 17 changed files with 278 additions and 532 deletions.
17 changes: 12 additions & 5 deletions drivers/infiniband/sw/siw/siw_qp_tx.c
@@ -312,7 +312,7 @@ static int siw_tx_ctrl(struct siw_iwarp_tx *c_tx, struct socket *s,
 }
 
 /*
- * 0copy TCP transmit interface: Use do_tcp_sendpages.
+ * 0copy TCP transmit interface: Use MSG_SPLICE_PAGES.
  *
  * Using sendpage to push page by page appears to be less efficient
  * than using sendmsg, even if data are copied.
@@ -323,20 +323,27 @@ static int siw_tx_ctrl(struct siw_iwarp_tx *c_tx, struct socket *s,
 static int siw_tcp_sendpages(struct socket *s, struct page **page, int offset,
 			     size_t size)
 {
+	struct bio_vec bvec;
+	struct msghdr msg = {
+		.msg_flags = (MSG_MORE | MSG_DONTWAIT | MSG_SENDPAGE_NOTLAST |
+			      MSG_SPLICE_PAGES),
+	};
 	struct sock *sk = s->sk;
-	int i = 0, rv = 0, sent = 0,
-	    flags = MSG_MORE | MSG_DONTWAIT | MSG_SENDPAGE_NOTLAST;
+	int i = 0, rv = 0, sent = 0;
 
 	while (size) {
 		size_t bytes = min_t(size_t, PAGE_SIZE - offset, size);
 
 		if (size + offset <= PAGE_SIZE)
-			flags = MSG_MORE | MSG_DONTWAIT;
+			msg.msg_flags &= ~MSG_SENDPAGE_NOTLAST;
 
 		tcp_rate_check_app_limited(sk);
+		bvec_set_page(&bvec, page[i], bytes, offset);
+		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);
+
 try_page_again:
 		lock_sock(sk);
-		rv = do_tcp_sendpages(sk, page[i], offset, bytes, flags);
+		rv = tcp_sendmsg_locked(sk, &msg, size);
 		release_sock(sk);
 
 		if (rv > 0) {
5 changes: 4 additions & 1 deletion include/linux/skbuff.h
@@ -1383,7 +1383,7 @@ static inline int skb_pad(struct sk_buff *skb, int pad)
 #define dev_kfree_skb(a)	consume_skb(a)
 
 int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
-			 int offset, size_t size);
+			 int offset, size_t size, size_t max_frags);
 
 struct skb_seq_state {
 	__u32		lower_offset;
@@ -5097,5 +5097,8 @@ static inline void skb_mark_for_recycle(struct sk_buff *skb)
 #endif
 }
 
+ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
+			     ssize_t maxsize, gfp_t gfp);
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SKBUFF_H */
3 changes: 3 additions & 0 deletions include/linux/socket.h
@@ -327,6 +327,7 @@ struct ucred {
  */
 
 #define MSG_ZEROCOPY	0x4000000	/* Use user data in kernel path */
+#define MSG_SPLICE_PAGES 0x8000000	/* Splice the pages from the iterator in sendmsg() */
 #define MSG_FASTOPEN	0x20000000	/* Send data in TCP SYN */
 #define MSG_CMSG_CLOEXEC 0x40000000	/* Set close_on_exec for file
 					   descriptor received through
@@ -337,6 +338,8 @@ struct ucred {
 #define MSG_CMSG_COMPAT	0	/* We never have 32 bit fixups */
 #endif
 
+/* Flags to be cleared on entry by sendmsg and sendmmsg syscalls */
+#define MSG_INTERNAL_SENDMSG_FLAGS (MSG_SPLICE_PAGES)
 
 /* Setsockoptions(2) level. Thanks to BSD these must match IPPROTO_xxx */
 #define SOL_IP		0
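
The point of MSG_INTERNAL_SENDMSG_FLAGS is that every userspace entry point
clears kernel-internal flags before the protocol sees them, so userspace can
never set MSG_SPLICE_PAGES itself.  A minimal sketch of that masking
(illustrative only; the corresponding net/socket.c hunk is not among the
diffs shown on this page):

	/* In the sendmsg()/sendmmsg() syscall path: strip flags that
	 * only in-kernel callers may set before building the message.
	 */
	flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
	msg_sys->msg_flags = flags;

The io_uring hunks below apply the same masking on its send paths.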
2 changes: 0 additions & 2 deletions include/net/ip.h
@@ -220,8 +220,6 @@ int ip_append_data(struct sock *sk, struct flowi4 *fl4,
 		   unsigned int flags);
 int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd,
 		       struct sk_buff *skb);
-ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
-		       int offset, size_t size, int flags);
 struct sk_buff *__ip_make_skb(struct sock *sk, struct flowi4 *fl4,
 			      struct sk_buff_head *queue,
 			      struct inet_cork *cork);
2 changes: 0 additions & 2 deletions include/net/tcp.h
@@ -333,8 +333,6 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
 		 int flags);
 int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
 			size_t size, int flags);
-ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
-			 size_t size, int flags);
 int tcp_send_mss(struct sock *sk, int *size_goal, int flags);
 void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle,
 	      int size_goal);
2 changes: 1 addition & 1 deletion include/net/tls.h
@@ -258,7 +258,7 @@ struct tls_context {
 	struct scatterlist *partially_sent_record;
 	u16 partially_sent_offset;
 
-	bool in_tcp_sendpages;
+	bool splicing_pages;
 	bool pending_open_record_frags;
 
 	struct mutex tx_lock; /* protects partially_sent_* fields and
2 changes: 2 additions & 0 deletions io_uring/net.c
@@ -389,6 +389,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
 	if (flags & MSG_WAITALL)
 		min_ret = iov_iter_count(&msg.msg_iter);
 
+	flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
 	msg.msg_flags = flags;
 	ret = sock_sendmsg(sock, &msg);
 	if (ret < min_ret) {
@@ -1136,6 +1137,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
 		msg_flags |= MSG_DONTWAIT;
 	if (msg_flags & MSG_WAITALL)
 		min_ret = iov_iter_count(&msg.msg_iter);
+	msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
 
 	msg.msg_flags = msg_flags;
 	msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
92 changes: 90 additions & 2 deletions net/core/skbuff.c
@@ -4188,13 +4188,13 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
 EXPORT_SYMBOL(skb_find_text);
 
 int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
-			 int offset, size_t size)
+			 int offset, size_t size, size_t max_frags)
 {
 	int i = skb_shinfo(skb)->nr_frags;
 
 	if (skb_can_coalesce(skb, i, page, offset)) {
 		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
-	} else if (i < MAX_SKB_FRAGS) {
+	} else if (i < max_frags) {
 		skb_zcopy_downgrade_managed(skb);
 		get_page(page);
 		skb_fill_page_desc_noacc(skb, i, page, offset, size);
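
With the new max_frags argument, the caller chooses the fragment budget; a
hypothetical caller honouring the sysctl limit rather than the compile-time
MAX_SKB_FRAGS might write (illustrative, not part of this diff):

	int err;

	err = skb_append_pagefrags(skb, page, offset, size,
				   READ_ONCE(sysctl_max_skb_frags));
	if (err < 0)	/* -EMSGSIZE: no free slot and cannot coalesce */
		return err;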
@@ -6892,3 +6892,91 @@ nodefer:	__kfree_skb(skb);
 	if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
 		smp_call_function_single_async(cpu, &sd->defer_csd);
 }
+
+static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
+				 size_t offset, size_t len)
+{
+	const char *kaddr;
+	__wsum csum;
+
+	kaddr = kmap_local_page(page);
+	csum = csum_partial(kaddr + offset, len, 0);
+	kunmap_local(kaddr);
+	skb->csum = csum_block_add(skb->csum, csum, skb->len);
+}
+
+/**
+ * skb_splice_from_iter - Splice (or copy) pages to skbuff
+ * @skb: The buffer to add pages to
+ * @iter: Iterator representing the pages to be added
+ * @maxsize: Maximum amount of pages to be added
+ * @gfp: Allocation flags
+ *
+ * This is a common helper function for supporting MSG_SPLICE_PAGES.  It
+ * extracts pages from an iterator and adds them to the socket buffer if
+ * possible, copying them to fragments if not possible (such as if they're slab
+ * pages).
+ *
+ * Returns the amount of data spliced/copied or -EMSGSIZE if there's
+ * insufficient space in the buffer to transfer anything.
+ */
+ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
+			     ssize_t maxsize, gfp_t gfp)
+{
+	size_t frag_limit = READ_ONCE(sysctl_max_skb_frags);
+	struct page *pages[8], **ppages = pages;
+	ssize_t spliced = 0, ret = 0;
+	unsigned int i;
+
+	while (iter->count > 0) {
+		ssize_t space, nr;
+		size_t off, len;
+
+		ret = -EMSGSIZE;
+		space = frag_limit - skb_shinfo(skb)->nr_frags;
+		if (space < 0)
+			break;
+
+		/* We might be able to coalesce without increasing nr_frags */
+		nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages));
+
+		len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off);
+		if (len <= 0) {
+			ret = len ?: -EIO;
+			break;
+		}
+
+		i = 0;
+		do {
+			struct page *page = pages[i++];
+			size_t part = min_t(size_t, PAGE_SIZE - off, len);
+
+			ret = -EIO;
+			if (WARN_ON_ONCE(!sendpage_ok(page)))
+				goto out;
+
+			ret = skb_append_pagefrags(skb, page, off, part,
+						   frag_limit);
+			if (ret < 0) {
+				iov_iter_revert(iter, len);
+				goto out;
+			}
+
+			if (skb->ip_summed == CHECKSUM_NONE)
+				skb_splice_csum_page(skb, page, off, part);
+
+			off = 0;
+			spliced += part;
+			maxsize -= part;
+			len -= part;
+		} while (len > 0);
+
+		if (maxsize <= 0)
+			break;
+	}
+
+out:
+	skb_len_add(skb, spliced);
+	return spliced ?: ret;
+}
+EXPORT_SYMBOL(skb_splice_from_iter);
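
For a sense of how a protocol consumes this helper, here is a simplified
sketch of a sendmsg() MSG_SPLICE_PAGES branch, modelled loosely on the TCP
conversion later in this series (skb selection, locking and retry handling
are elided; copy, copied, err and sk are assumed from the surrounding
function):

	if (msg->msg_flags & MSG_SPLICE_PAGES) {
		/* Splice page refs from the caller's iterator into the
		 * tail skb's frags; at this point in the series an
		 * unspliceable page yields an error.
		 */
		err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
					   sk->sk_allocation);
		if (err < 0)
			goto do_error;	/* e.g. -EMSGSIZE: no frag space */
		copied += err;
	}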
(Diffs for the remaining 9 changed files are not shown here.)