Skip to content

Commit

Permalink
Merge branch 'tcp-zero-copy-receive'
Browse files Browse the repository at this point in the history
Eric Dumazet says:

====================
tcp: add zero copy receive

This patch series adds mmap() support to TCP sockets for RX zero copy.

While tcp_mmap() patch itself is quite small (~100 LOC), optimal support
for asynchronous mmap() required better SO_RCVLOWAT behavior, and a
test program to demonstrate how mmap() on TCP sockets can be used.

Note that mmap() (and the associated munmap()) calls add more
pressure on the per-process VM semaphore, so they might not show a benefit
for processes with a high number of threads.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Apr 16, 2018
2 parents 10b19ae + 192dc40 commit 309c446
Show file tree
Hide file tree
Showing 9 changed files with 608 additions and 7 deletions.
1 change: 1 addition & 0 deletions include/linux/net.h
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ struct proto_ops {
int offset, size_t size, int flags);
int (*sendmsg_locked)(struct sock *sk, struct msghdr *msg,
size_t size);
int (*set_rcvlowat)(struct sock *sk, int val);
};

#define DECLARE_SOCKADDR(type, dst, src) \
Expand Down
4 changes: 4 additions & 0 deletions include/net/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,10 @@ void tcp_set_keepalive(struct sock *sk, int val);
void tcp_syn_ack_timeout(const struct request_sock *req);
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
int flags, int *addr_len);
int tcp_set_rcvlowat(struct sock *sk, int val);
void tcp_data_ready(struct sock *sk);
int tcp_mmap(struct file *file, struct socket *sock,
struct vm_area_struct *vma);
void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
struct tcp_options_received *opt_rx,
int estab, struct tcp_fastopen_cookie *foc);
Expand Down
5 changes: 4 additions & 1 deletion net/core/sock.c
Original file line number Diff line number Diff line change
Expand Up @@ -905,7 +905,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
case SO_RCVLOWAT:
if (val < 0)
val = INT_MAX;
sk->sk_rcvlowat = val ? : 1;
if (sock->ops->set_rcvlowat)
ret = sock->ops->set_rcvlowat(sk, val);
else
sk->sk_rcvlowat = val ? : 1;
break;

case SO_RCVTIMEO:
Expand Down
3 changes: 2 additions & 1 deletion net/ipv4/af_inet.c
Original file line number Diff line number Diff line change
Expand Up @@ -994,7 +994,7 @@ const struct proto_ops inet_stream_ops = {
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.mmap = tcp_mmap,
.sendpage = inet_sendpage,
.splice_read = tcp_splice_read,
.read_sock = tcp_read_sock,
Expand All @@ -1006,6 +1006,7 @@ const struct proto_ops inet_stream_ops = {
.compat_getsockopt = compat_sock_common_getsockopt,
.compat_ioctl = inet_compat_ioctl,
#endif
.set_rcvlowat = tcp_set_rcvlowat,
};
EXPORT_SYMBOL(inet_stream_ops);

Expand Down
138 changes: 138 additions & 0 deletions net/ipv4/tcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -1701,6 +1701,144 @@ int tcp_peek_len(struct socket *sock)
}
EXPORT_SYMBOL(tcp_peek_len);

/* SO_RCVLOWAT handler for TCP sockets.
 *
 * Records the new low-water mark, re-evaluates whether the socket is
 * already readable, and (unless the application pinned sk_rcvbuf with
 * SO_RCVBUF) grows the receive buffer so that @val bytes can actually
 * be queued, capped by the tcp_rmem[2] sysctl.
 *
 * Always returns 0.
 */
int tcp_set_rcvlowat(struct sock *sk, int val)
{
	int space;

	/* A zero request degenerates to the default of 1 byte */
	sk->sk_rcvlowat = val ? : 1;

	/* Check if we need to signal EPOLLIN right now */
	tcp_data_ready(sk);

	/* Application locked the buffer size via SO_RCVBUF: leave it alone */
	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
		return 0;

	/* val comes from user space and might be close to INT_MAX */
	space = val << 1;
	if (space < 0)
		space = INT_MAX;

	space = min(space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
	if (space <= sk->sk_rcvbuf)
		return 0;

	sk->sk_rcvbuf = space;
	tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, space);
	return 0;
}
EXPORT_SYMBOL(tcp_set_rcvlowat);

/* When user wants to mmap X pages, we first need to perform the mapping
 * before freeing any skbs in receive queue, otherwise user would be unable
 * to fallback to standard recvmsg(). This happens if some data in the
 * requested block is not exactly fitting in a page.
 *
 * We only support order-0 pages for the moment.
 * mmap() on TCP is very strict, there is no point
 * trying to accommodate with pathological layouts.
 */
int tcp_mmap(struct file *file, struct socket *sock,
	     struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;
	unsigned int nr_pages = size >> PAGE_SHIFT;
	struct page **pages_array = NULL;
	u32 seq, len, offset, nr = 0;
	struct sock *sk = sock->sk;
	const skb_frag_t *frags;
	struct tcp_sock *tp;
	struct sk_buff *skb;
	int ret;

	/* Mapping must start at file offset 0 and cover at least one page */
	if (vma->vm_pgoff || !nr_pages)
		return -EINVAL;

	/* Pages are mapped directly from skb frags: read-only by design */
	if (vma->vm_flags & VM_WRITE)
		return -EPERM;
	/* TODO: Maybe the following is not needed if pages are COW */
	vma->vm_flags &= ~VM_MAYWRITE;

	lock_sock(sk);

	ret = -ENOTCONN;
	if (sk->sk_state == TCP_LISTEN)
		goto out;

	sock_rps_record_flow(sk);

	/* Whole requested range must already be queued; otherwise report
	 * EOF (-EIO) or ask caller to retry later (-EAGAIN).
	 */
	if (tcp_inq(sk) < size) {
		ret = sock_flag(sk, SOCK_DONE) ? -EIO : -EAGAIN;
		goto out;
	}
	tp = tcp_sk(sk);
	seq = tp->copied_seq;
	/* Abort if urgent data is in the area */
	if (unlikely(tp->urg_data)) {
		u32 urg_offset = tp->urg_seq - seq;

		ret = -EINVAL;
		if (urg_offset < size)
			goto out;
	}
	ret = -ENOMEM;
	/* Collect all candidate pages first; vma is only touched once the
	 * full set is validated, so a failure leaves recvmsg() usable.
	 */
	pages_array = kvmalloc_array(nr_pages, sizeof(struct page *),
				     GFP_KERNEL);
	if (!pages_array)
		goto out;
	skb = tcp_recv_skb(sk, seq, &offset);
	ret = -EINVAL;
skb_start:
	/* We do not support anything not in page frags */
	offset -= skb_headlen(skb);
	if ((int)offset < 0)
		goto out;
	if (skb_has_frag_list(skb))
		goto out;
	len = skb->data_len - offset;
	frags = skb_shinfo(skb)->frags;
	/* Skip whole frags preceding the requested offset; a frag that
	 * straddles the offset means the data is not page-aligned: bail.
	 */
	while (offset) {
		if (frags->size > offset)
			goto out;
		offset -= frags->size;
		frags++;
	}
	/* Gather one full page per iteration; every frag must be exactly
	 * one page-aligned order-0 page, or the strict layout check fails.
	 */
	while (nr < nr_pages) {
		if (len) {
			if (len < PAGE_SIZE)
				goto out;
			if (frags->size != PAGE_SIZE || frags->page_offset)
				goto out;
			pages_array[nr++] = skb_frag_page(frags);
			frags++;
			len -= PAGE_SIZE;
			seq += PAGE_SIZE;
			continue;
		}
		/* Current skb exhausted: continue with the next one */
		skb = skb->next;
		offset = seq - TCP_SKB_CB(skb)->seq;
		goto skb_start;
	}
	/* OK, we have a full set of pages ready to be inserted into vma */
	for (nr = 0; nr < nr_pages; nr++) {
		ret = vm_insert_page(vma, vma->vm_start + (nr << PAGE_SHIFT),
				     pages_array[nr]);
		if (ret)
			goto out;
	}
	/* operation is complete, we can 'consume' all skbs */
	tp->copied_seq = seq;
	tcp_rcv_space_adjust(sk);

	/* Clean up data we have read: This will do ACK frames. */
	tcp_recv_skb(sk, seq, &offset);
	tcp_cleanup_rbuf(sk, size);

	ret = 0;
out:
	release_sock(sk);
	kvfree(pages_array);
	return ret;
}
EXPORT_SYMBOL(tcp_mmap);

static void tcp_update_recv_tstamps(struct sk_buff *skb,
struct scm_timestamping *tss)
{
Expand Down
22 changes: 18 additions & 4 deletions net/ipv4/tcp_input.c
Original file line number Diff line number Diff line change
Expand Up @@ -4576,6 +4576,17 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)

}

/* Wake up the reader via sk_data_ready(), but only once enough bytes
 * are queued to satisfy SO_RCVLOWAT, or the socket is marked SOCK_DONE.
 */
void tcp_data_ready(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	int queued = tp->rcv_nxt - tp->copied_seq;

	if (queued >= sk->sk_rcvlowat || sock_flag(sk, SOCK_DONE))
		sk->sk_data_ready(sk);
}

static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
Expand Down Expand Up @@ -4633,7 +4644,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (eaten > 0)
kfree_skb_partial(skb, fragstolen);
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_data_ready(sk);
tcp_data_ready(sk);
return;
}

Expand Down Expand Up @@ -5026,9 +5037,12 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
/* More than one full frame received... */
if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
/* ... and right edge of window advances far enough.
* (tcp_recvmsg() will send ACK otherwise). Or...
* (tcp_recvmsg() will send ACK otherwise).
* If application uses SO_RCVLOWAT, we want send ack now if
* we have not received enough bytes to satisfy the condition.
*/
__tcp_select_window(sk) >= tp->rcv_wnd) ||
(tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
__tcp_select_window(sk) >= tp->rcv_wnd)) ||
/* We ACK each frame or... */
tcp_in_quickack_mode(sk) ||
/* We have out of order data. */
Expand Down Expand Up @@ -5431,7 +5445,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
no_ack:
if (eaten)
kfree_skb_partial(skb, fragstolen);
sk->sk_data_ready(sk);
tcp_data_ready(sk);
return;
}
}
Expand Down
3 changes: 2 additions & 1 deletion net/ipv6/af_inet6.c
Original file line number Diff line number Diff line change
Expand Up @@ -579,7 +579,7 @@ const struct proto_ops inet6_stream_ops = {
.getsockopt = sock_common_getsockopt, /* ok */
.sendmsg = inet_sendmsg, /* ok */
.recvmsg = inet_recvmsg, /* ok */
.mmap = sock_no_mmap,
.mmap = tcp_mmap,
.sendpage = inet_sendpage,
.sendmsg_locked = tcp_sendmsg_locked,
.sendpage_locked = tcp_sendpage_locked,
Expand All @@ -590,6 +590,7 @@ const struct proto_ops inet6_stream_ops = {
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
#endif
.set_rcvlowat = tcp_set_rcvlowat,
};

const struct proto_ops inet6_dgram_ops = {
Expand Down
2 changes: 2 additions & 0 deletions tools/testing/selftests/net/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@ TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetl
TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh
TEST_GEN_FILES = socket
TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
TEST_GEN_FILES += tcp_mmap
TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict

include ../lib.mk

$(OUTPUT)/reuseport_bpf_numa: LDFLAGS += -lnuma
$(OUTPUT)/tcp_mmap: LDFLAGS += -lpthread
Loading

0 comments on commit 309c446

Please sign in to comment.