Skip to content

Commit

Permalink
tcp: implement mmap() for zero copy receive
Browse files Browse the repository at this point in the history
Some networks can make sure TCP payload can exactly fit 4KB pages,
with well chosen MSS/MTU and architectures.

Implement mmap() system call so that applications can avoid
copying data without complex splice() games.

Note that a successful mmap() of X bytes on a TCP socket consumes those
bytes, as if recvmsg() had been called (tp->copied_seq += X).

Only PROT_READ mappings are accepted, as skb page frags
are fundamentally shared and read only.

If tcp_mmap() finds data that is not a full page, or a patch of
urgent data, -EINVAL is returned and no bytes are consumed.

The application must fall back to recvmsg() to read the problematic sequence.

mmap() won't block, regardless of the socket being in blocking or
non-blocking mode. If not enough bytes are in the receive queue,
mmap() returns -EAGAIN, or -EIO if the socket is in a state
where no more bytes can be added to the receive queue.

An application might use SO_RCVLOWAT, poll() and/or ioctl(FIONREAD)
to use mmap() efficiently.

On the sender side, MSG_EOR might help to clearly separate unaligned
headers and 4K-aligned chunks if necessary.

Tested:

mlx4 (cx-3) 40Gbit NIC, with tcp_mmap program provided in following patch.
MTU set to 4168  (4096 TCP payload, 40 bytes IPv6 header, 32 bytes TCP header)

Without mmap() (tcp_mmap -s)

received 32768 MB (0 % mmap'ed) in 8.13342 s, 33.7961 Gbit,
  cpu usage user:0.034 sys:3.778, 116.333 usec per MB, 63062 c-switches
received 32768 MB (0 % mmap'ed) in 8.14501 s, 33.748 Gbit,
  cpu usage user:0.029 sys:3.997, 122.864 usec per MB, 61903 c-switches
received 32768 MB (0 % mmap'ed) in 8.11723 s, 33.8635 Gbit,
  cpu usage user:0.048 sys:3.964, 122.437 usec per MB, 62983 c-switches
received 32768 MB (0 % mmap'ed) in 8.39189 s, 32.7552 Gbit,
  cpu usage user:0.038 sys:4.181, 128.754 usec per MB, 55834 c-switches

With mmap() on receiver (tcp_mmap -s -z)

received 32768 MB (100 % mmap'ed) in 8.03083 s, 34.2278 Gbit,
  cpu usage user:0.024 sys:1.466, 45.4712 usec per MB, 65479 c-switches
received 32768 MB (100 % mmap'ed) in 7.98805 s, 34.4111 Gbit,
  cpu usage user:0.026 sys:1.401, 43.5486 usec per MB, 65447 c-switches
received 32768 MB (100 % mmap'ed) in 7.98377 s, 34.4296 Gbit,
  cpu usage user:0.028 sys:1.452, 45.166 usec per MB, 65496 c-switches
received 32768 MB (99.9969 % mmap'ed) in 8.01838 s, 34.281 Gbit,
  cpu usage user:0.02 sys:1.446, 44.7388 usec per MB, 65505 c-switches

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
Eric Dumazet authored and David S. Miller committed Apr 16, 2018
1 parent 03f45c8 commit 93ab6cc
Show file tree
Hide file tree
Showing 4 changed files with 117 additions and 2 deletions.
2 changes: 2 additions & 0 deletions include/net/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
int flags, int *addr_len);
int tcp_set_rcvlowat(struct sock *sk, int val);
void tcp_data_ready(struct sock *sk);
/* Map queued TCP receive data into @vma; implemented in net/ipv4/tcp.c. */
int tcp_mmap(struct file *file, struct socket *sock,
struct vm_area_struct *vma);
void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
struct tcp_options_received *opt_rx,
int estab, struct tcp_fastopen_cookie *foc);
Expand Down
2 changes: 1 addition & 1 deletion net/ipv4/af_inet.c
Original file line number Diff line number Diff line change
Expand Up @@ -994,7 +994,7 @@ const struct proto_ops inet_stream_ops = {
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.mmap = tcp_mmap,
.sendpage = inet_sendpage,
.splice_read = tcp_splice_read,
.read_sock = tcp_read_sock,
Expand Down
113 changes: 113 additions & 0 deletions net/ipv4/tcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -1726,6 +1726,119 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
}
EXPORT_SYMBOL(tcp_set_rcvlowat);

/* tcp_mmap - map whole pages of received TCP payload into a user vma.
 *
 * When the user wants to mmap X pages, we first need to perform the mapping
 * before freeing any skbs in the receive queue, otherwise the user would be
 * unable to fall back to standard recvmsg(). This happens if some data in
 * the requested block does not fit exactly in a page.
 *
 * We only support order-0 pages for the moment.
 * mmap() on TCP is very strict, there is no point
 * trying to accommodate pathological layouts.
 *
 * @file: not referenced in the body.
 * @sock: socket whose sock->sk receive queue supplies the pages.
 * @vma:  target mapping; its length (vm_end - vm_start) is the number of
 *        bytes to consume.
 *
 * Returns 0 on success, after advancing tp->copied_seq past the mapped
 * bytes. Otherwise returns:
 *   -EINVAL   vm_pgoff is non-zero, the vma covers no full page, urgent data
 *             lies inside the requested range, or the queued data is not
 *             laid out as page-sized, page-aligned frags;
 *   -EPERM    the vma has VM_WRITE set;
 *   -ENOTCONN the socket is in TCP_LISTEN state;
 *   -EAGAIN   tcp_inq() reports fewer bytes than requested;
 *   -EIO      same, but SOCK_DONE is set on the socket;
 *   -ENOMEM   the temporary page array could not be allocated;
 *   or whatever non-zero value vm_insert_page() returned.
 * On every failure path tp->copied_seq is left untouched.
 */
int tcp_mmap(struct file *file, struct socket *sock,
struct vm_area_struct *vma)
{
unsigned long size = vma->vm_end - vma->vm_start;
unsigned int nr_pages = size >> PAGE_SHIFT;
struct page **pages_array = NULL;
u32 seq, len, offset, nr = 0;
struct sock *sk = sock->sk;
const skb_frag_t *frags;
struct tcp_sock *tp;
struct sk_buff *skb;
int ret;

/* These checks run before lock_sock(), so they return directly
 * instead of going through the 'out' label.
 */
if (vma->vm_pgoff || !nr_pages)
return -EINVAL;

if (vma->vm_flags & VM_WRITE)
return -EPERM;
/* TODO: Maybe the following is not needed if pages are COW */
vma->vm_flags &= ~VM_MAYWRITE;

lock_sock(sk);

ret = -ENOTCONN;
if (sk->sk_state == TCP_LISTEN)
goto out;

sock_rps_record_flow(sk);

/* Never block: if fewer than 'size' bytes are readable, fail now. */
if (tcp_inq(sk) < size) {
ret = sock_flag(sk, SOCK_DONE) ? -EIO : -EAGAIN;
goto out;
}
tp = tcp_sk(sk);
seq = tp->copied_seq;
/* Abort if urgent data is in the area */
if (unlikely(tp->urg_data)) {
u32 urg_offset = tp->urg_seq - seq;

ret = -EINVAL;
if (urg_offset < size)
goto out;
}
ret = -ENOMEM;
pages_array = kvmalloc_array(nr_pages, sizeof(struct page *),
GFP_KERNEL);
if (!pages_array)
goto out;
skb = tcp_recv_skb(sk, seq, &offset);
/* Every 'goto out' taken during the skb/frag walk below reports -EINVAL. */
ret = -EINVAL;
skb_start:
/* We do not support anything not in page frags */
/* 'offset' is relative to the start of the skb; make it relative to the
 * paged data. A negative result means 'seq' falls inside the linear area.
 */
offset -= skb_headlen(skb);
if ((int)offset < 0)
goto out;
if (skb_has_frag_list(skb))
goto out;
len = skb->data_len - offset;
frags = skb_shinfo(skb)->frags;
/* Step over frags that lie entirely before 'seq'. If 'seq' lands in the
 * middle of a frag, give up.
 */
while (offset) {
if (frags->size > offset)
goto out;
offset -= frags->size;
frags++;
}
/* Collect one page per frag until nr_pages pages have been gathered.
 * 'len' is the paged data still unvisited in the current skb.
 */
while (nr < nr_pages) {
if (len) {
/* Each remaining frag must be exactly one full page starting
 * at page offset 0.
 */
if (len < PAGE_SIZE)
goto out;
if (frags->size != PAGE_SIZE || frags->page_offset)
goto out;
pages_array[nr++] = skb_frag_page(frags);
frags++;
len -= PAGE_SIZE;
seq += PAGE_SIZE;
continue;
}
/* Current skb exhausted: continue with the next one and redo the
 * header/frag checks for it.
 * NOTE(review): skb->next is followed without a validity check;
 * presumably the tcp_inq() test above guarantees enough queued
 * data - confirm.
 */
skb = skb->next;
offset = seq - TCP_SKB_CB(skb)->seq;
goto skb_start;
}
/* OK, we have a full set of pages ready to be inserted into vma */
for (nr = 0; nr < nr_pages; nr++) {
ret = vm_insert_page(vma, vma->vm_start + (nr << PAGE_SHIFT),
pages_array[nr]);
if (ret)
goto out;
}
/* operation is complete, we can 'consume' all skbs */
tp->copied_seq = seq;
tcp_rcv_space_adjust(sk);

/* Clean up data we have read: This will do ACK frames. */
/* NOTE(review): the tcp_recv_skb() return value is ignored here; it is
 * presumably called only for its effect on the receive queue - confirm.
 */
tcp_recv_skb(sk, seq, &offset);
tcp_cleanup_rbuf(sk, size);

ret = 0;
out:
/* Common exit for every path taken after lock_sock(). pages_array is
 * still NULL when the exit happens before the allocation.
 */
release_sock(sk);
kvfree(pages_array);
return ret;
}
EXPORT_SYMBOL(tcp_mmap);

static void tcp_update_recv_tstamps(struct sk_buff *skb,
struct scm_timestamping *tss)
{
Expand Down
2 changes: 1 addition & 1 deletion net/ipv6/af_inet6.c
Original file line number Diff line number Diff line change
Expand Up @@ -579,7 +579,7 @@ const struct proto_ops inet6_stream_ops = {
.getsockopt = sock_common_getsockopt, /* ok */
.sendmsg = inet_sendmsg, /* ok */
.recvmsg = inet_recvmsg, /* ok */
.mmap = sock_no_mmap,
.mmap = tcp_mmap,
.sendpage = inet_sendpage,
.sendmsg_locked = tcp_sendmsg_locked,
.sendpage_locked = tcp_sendpage_locked,
Expand Down

0 comments on commit 93ab6cc

Please sign in to comment.