From 0d89419319ef68f8acc1b78377ad3e7523fead4a Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 20 Aug 2020 10:11:16 -0700
Subject: [PATCH 1/3] selftests: net: tcp_mmap: use madvise(MADV_DONTNEED)

When the TCP_ZEROCOPY_RECEIVE operation was added, I made the mistake
of automatically un-mapping prior content before mapping new pages.

This has the unfortunate effect of adding potentially long MMU
operations (like TLB flushes) while the socket lock is held.

Using madvise(MADV_DONTNEED) right after the pages have been used has
two benefits:

1) This releases pages sooner, allowing them to be recycled if they
were part of a page pool in a NIC driver.

2) No more long unmap operations performed while the socket lock is
held, which prevented immediate processing of incoming packets.

The cost of the added system call is small enough.

Arjun will submit a kernel patch allowing applications to opt out of
the unmap attempt in tcp_zerocopy_receive().

Signed-off-by: Eric Dumazet
Cc: Arjun Roy
Cc: Soheil Hassas Yeganeh
Signed-off-by: David S. Miller
---
 tools/testing/selftests/net/tcp_mmap.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/testing/selftests/net/tcp_mmap.c b/tools/testing/selftests/net/tcp_mmap.c
index a61b7b3da5496..59ec0b59f7b76 100644
--- a/tools/testing/selftests/net/tcp_mmap.c
+++ b/tools/testing/selftests/net/tcp_mmap.c
@@ -179,6 +179,10 @@ void *child_thread(void *arg)
 				total_mmap += zc.length;
 				if (xflg)
 					hash_zone(addr, zc.length);
+				/* It is more efficient to unmap the pages right now,
+				 * instead of doing this in next TCP_ZEROCOPY_RECEIVE.
+				 */
+				madvise(addr, zc.length, MADV_DONTNEED);
 				total += zc.length;
 			}
 			if (zc.recv_skip_hint) {

From 72653ae5303c626ca29fcbcbb8165a894a104adf Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 20 Aug 2020 10:11:17 -0700
Subject: [PATCH 2/3] selftests: net: tcp_mmap: Use huge pages in send path

There are significant gains from using huge pages when they are
available, as shown in [1].

This patch adds mmap_large_buffer() and uses it on the client side
(the tx path of this reference tool).

The following patch will use the feature on the server side.

[1] https://patchwork.ozlabs.org/project/netdev/patch/20200820154359.1806305-1-edumazet@google.com/

Signed-off-by: Eric Dumazet
Cc: Arjun Roy
Cc: Soheil Hassas Yeganeh
Signed-off-by: David S. Miller
---
 tools/testing/selftests/net/tcp_mmap.c | 29 +++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/net/tcp_mmap.c b/tools/testing/selftests/net/tcp_mmap.c
index 59ec0b59f7b76..ca2618f3e7a12 100644
--- a/tools/testing/selftests/net/tcp_mmap.c
+++ b/tools/testing/selftests/net/tcp_mmap.c
@@ -123,6 +123,28 @@ void hash_zone(void *zone, unsigned int length)
 #define ALIGN_UP(x, align_to)	(((x) + ((align_to)-1)) & ~((align_to)-1))
 #define ALIGN_PTR_UP(p, ptr_align_to)	((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to))
 
+
+static void *mmap_large_buffer(size_t need, size_t *allocated)
+{
+	void *buffer;
+	size_t sz;
+
+	/* Attempt to use huge pages if possible. */
+	sz = ALIGN_UP(need, map_align);
+	buffer = mmap(NULL, sz, PROT_READ | PROT_WRITE,
+		      MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
+
+	if (buffer == (void *)-1) {
+		sz = need;
+		buffer = mmap(NULL, sz, PROT_READ | PROT_WRITE,
+			      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+		if (buffer != (void *)-1)
+			fprintf(stderr, "MAP_HUGETLB attempt failed, look at /sys/kernel/mm/hugepages for optimal performance\n");
+	}
+	*allocated = sz;
+	return buffer;
+}
+
 void *child_thread(void *arg)
 {
 	unsigned long total_mmap = 0, total = 0;
@@ -351,6 +373,7 @@ int main(int argc, char *argv[])
 	uint64_t total = 0;
 	char *host = NULL;
 	int fd, c, on = 1;
+	size_t buffer_sz;
 	char *buffer;
 	int sflg = 0;
 	int mss = 0;
@@ -441,8 +464,8 @@ int main(int argc, char *argv[])
 		}
 		do_accept(fdlisten);
 	}
-	buffer = mmap(NULL, chunk_size, PROT_READ | PROT_WRITE,
-		      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+	buffer = mmap_large_buffer(chunk_size, &buffer_sz);
 	if (buffer == (char *)-1) {
 		perror("mmap");
 		exit(1);
@@ -488,6 +511,6 @@ int main(int argc, char *argv[])
 		total += wr;
 	}
 	close(fd);
-	munmap(buffer, chunk_size);
+	munmap(buffer, buffer_sz);
 	return 0;
 }

From 59c0d31988fb366189502a8ac66b7fe1486b7e40 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 20 Aug 2020 10:11:18 -0700
Subject: [PATCH 3/3] selftests: net: tcp_mmap: Use huge pages in receive path

One downside of TCP rx zerocopy is an extra TLB miss per page after
the mapping operation, whereas a non-zerocopy recvmsg() into a
huge-page buffer does not have to pay these TLB costs.

This patch allows the server side to use huge pages in the
non-zerocopy case, to allow fair comparisons when both solutions run
under optimal conditions.

Signed-off-by: Eric Dumazet
Cc: Arjun Roy
Cc: Soheil Hassas Yeganeh
Signed-off-by: David S. Miller
---
 tools/testing/selftests/net/tcp_mmap.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/net/tcp_mmap.c b/tools/testing/selftests/net/tcp_mmap.c
index ca2618f3e7a12..00f837c9bc6c4 100644
--- a/tools/testing/selftests/net/tcp_mmap.c
+++ b/tools/testing/selftests/net/tcp_mmap.c
@@ -157,6 +157,7 @@ void *child_thread(void *arg)
 	void *addr = NULL;
 	double throughput;
 	struct rusage ru;
+	size_t buffer_sz;
 	int lu, fd;
 
 	fd = (int)(unsigned long)arg;
@@ -164,9 +165,9 @@
 
 	gettimeofday(&t0, NULL);
 	fcntl(fd, F_SETFL, O_NDELAY);
-	buffer = malloc(chunk_size);
-	if (!buffer) {
-		perror("malloc");
+	buffer = mmap_large_buffer(chunk_size, &buffer_sz);
+	if (buffer == (void *)-1) {
+		perror("mmap");
 		goto error;
 	}
 	if (zflg) {
@@ -256,7 +257,7 @@
 			ru.ru_nvcsw);
 	}
 error:
-	free(buffer);
+	munmap(buffer, buffer_sz);
 	close(fd);
 	if (zflg)
 		munmap(raddr, chunk_size + map_align);
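
As a companion to patch 1, here is a minimal, self-contained sketch of
the receive pattern this series relies on: pull data in with
TCP_ZEROCOPY_RECEIVE, consume it, then release the pages right away
with madvise(MADV_DONTNEED). This is an illustration, not code from
the patches: zerocopy_read(), consume() and bytes_sum are hypothetical
names, and it assumes addr points into a page-aligned PROT_READ,
MAP_SHARED mapping obtained by calling mmap() on the socket fd, the
way tcp_mmap.c sets one up.

#include <linux/tcp.h>		/* TCP_ZEROCOPY_RECEIVE, struct tcp_zerocopy_receive */
#include <netinet/in.h>		/* IPPROTO_TCP */
#include <string.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/types.h>

static unsigned long bytes_sum;

/* Hypothetical stand-in for real application processing; touching
 * every byte is what makes the per-page TLB misses discussed in
 * patch 3 visible.
 */
static void consume(const void *data, size_t len)
{
	const unsigned char *p = data;
	size_t i;

	for (i = 0; i < len; i++)
		bytes_sum += p[i];
}

/* One zerocopy receive step. Returns the number of bytes mapped into
 * the window, or -1 on error (a real application would fall back to
 * recvmsg() for data the kernel could not map).
 */
static ssize_t zerocopy_read(int fd, void *addr, size_t chunk_size)
{
	struct tcp_zerocopy_receive zc;
	socklen_t zc_len = sizeof(zc);

	memset(&zc, 0, sizeof(zc));
	zc.address = (__u64)(unsigned long)addr;
	zc.length = chunk_size;

	/* The kernel maps received payload pages at [addr, addr + zc.length). */
	if (getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len) == -1)
		return -1;

	if (zc.length) {
		consume(addr, zc.length);
		/* Release the pages now, instead of leaving the unmap (and
		 * its TLB flush, done under the socket lock) to the next
		 * TCP_ZEROCOPY_RECEIVE call.
		 */
		madvise(addr, zc.length, MADV_DONTNEED);
	}
	return (ssize_t)zc.length;
}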