From 545cd5e5ec5477c325e4098b6fd21213dceda408 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 24 Mar 2017 10:07:53 -0700 Subject: [PATCH 1/8] net: Busy polling should ignore sender CPUs This patch is a cleanup/fix for NAPI IDs following the changes that made it so that sender_cpu and napi_id were doing a better job of sharing the same location in the sk_buff. One issue I found is that we weren't validating the napi_id as being valid before we started trying to setup the busy polling. This change corrects that by using the MIN_NAPI_ID value that is now used in both allocating the NAPI IDs, as well as validating them. Signed-off-by: Alexander Duyck Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/busy_poll.h | 9 +++++++-- net/core/dev.c | 13 +++++++++---- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index c0452de83086e..3fcda9e70c3f7 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -35,6 +35,12 @@ struct napi_struct; extern unsigned int sysctl_net_busy_read __read_mostly; extern unsigned int sysctl_net_busy_poll __read_mostly; +/* 0 - Reserved to indicate value not set + * 1..NR_CPUS - Reserved for sender_cpu + * NR_CPUS+1..~0 - Region available for NAPI IDs + */ +#define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1)) + static inline bool net_busy_loop_on(void) { return sysctl_net_busy_poll; @@ -58,10 +64,9 @@ static inline unsigned long busy_loop_end_time(void) static inline bool sk_can_busy_loop(const struct sock *sk) { - return sk->sk_ll_usec && sk->sk_napi_id && !signal_pending(current); + return sk->sk_ll_usec && !signal_pending(current); } - static inline bool busy_loop_timeout(unsigned long end_time) { unsigned long now = busy_loop_us_clock(); diff --git a/net/core/dev.c b/net/core/dev.c index 7869ae3837ca7..ab337bf5bbf43 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5066,15 +5066,20 @@ bool sk_busy_loop(struct sock *sk, int nonblock) int (*napi_poll)(struct napi_struct *napi, int budget); void *have_poll_lock = NULL; struct napi_struct *napi; + unsigned int napi_id; int rc; restart: + napi_id = READ_ONCE(sk->sk_napi_id); + if (napi_id < MIN_NAPI_ID) + return 0; + rc = false; napi_poll = NULL; rcu_read_lock(); - napi = napi_by_id(sk->sk_napi_id); + napi = napi_by_id(napi_id); if (!napi) goto out; @@ -5143,10 +5148,10 @@ static void napi_hash_add(struct napi_struct *napi) spin_lock(&napi_hash_lock); - /* 0..NR_CPUS+1 range is reserved for sender_cpu use */ + /* 0..NR_CPUS range is reserved for sender_cpu use */ do { - if (unlikely(++napi_gen_id < NR_CPUS + 1)) - napi_gen_id = NR_CPUS + 1; + if (unlikely(++napi_gen_id < MIN_NAPI_ID)) + napi_gen_id = MIN_NAPI_ID; } while (napi_by_id(napi_gen_id)); napi->napi_id = napi_gen_id; From e5907459ce7e2b6bc397007865ad492f10c2aeac Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 24 Mar 2017 10:08:00 -0700 Subject: [PATCH 2/8] tcp: Record Rx hash and NAPI ID in tcp_child_process While working on some recent busy poll changes we found that child sockets were being instantiated without NAPI ID being set. In our first attempt to fix it, it was suggested that we should just pull programming the NAPI ID into the function itself since all callers will need to have it set. In addition to the NAPI ID change I have dropped the code that was populating the Rx hash since it was actually being populated in tcp_get_cookie_sock. Reported-by: Sridhar Samudrala Signed-off-by: Alexander Duyck Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 -- net/ipv4/tcp_minisocks.c | 4 ++++ net/ipv6/tcp_ipv6.c | 2 -- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7482b5d118619..20cbd2f07f281 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1409,8 +1409,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) if (!nsk) goto discard; if (nsk != sk) { - sock_rps_save_rxhash(nsk, skb); - sk_mark_napi_id(nsk, skb); if (tcp_child_process(sk, nsk, skb)) { rsk = nsk; goto reset; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1e217948be629..8f6373b0cd772 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -26,6 +26,7 @@ #include #include #include +#include int sysctl_tcp_abort_on_overflow __read_mostly; @@ -799,6 +800,9 @@ int tcp_child_process(struct sock *parent, struct sock *child, int ret = 0; int state = child->sk_state; + /* record NAPI ID of child */ + sk_mark_napi_id(child, skb); + tcp_segs_in(tcp_sk(child), skb); if (!sock_owned_by_user(child)) { ret = tcp_rcv_state_process(child, skb); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 031a8c019f7a7..8e42e8f54b705 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1293,8 +1293,6 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) goto discard; if (nsk != sk) { - sock_rps_save_rxhash(nsk, skb); - sk_mark_napi_id(nsk, skb); if (tcp_child_process(sk, nsk, skb)) goto reset; if (opt_skb) From d2e64dbbe95b2b51eb723134274de1d3f30bce80 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 24 Mar 2017 10:08:06 -0700 Subject: [PATCH 3/8] net: Only define skb_mark_napi_id in one spot instead of two Instead of defining two versions of skb_mark_napi_id I think it is more readable to just match the format of the sk_mark_napi_id functions and just wrap the contents of the function instead of defining two versions of the function. This way we can save a few lines of code since we only need 2 of the ifdef/endif but needed 5 for the extra function declaration. Signed-off-by: Alexander Duyck Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/busy_poll.h | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 3fcda9e70c3f7..b82d6ba70a148 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -76,14 +76,6 @@ static inline bool busy_loop_timeout(unsigned long end_time) bool sk_busy_loop(struct sock *sk, int nonblock); -/* used in the NIC receive handler to mark the skb */ -static inline void skb_mark_napi_id(struct sk_buff *skb, - struct napi_struct *napi) -{ - skb->napi_id = napi->napi_id; -} - - #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) { @@ -100,11 +92,6 @@ static inline bool sk_can_busy_loop(struct sock *sk) return false; } -static inline void skb_mark_napi_id(struct sk_buff *skb, - struct napi_struct *napi) -{ -} - static inline bool busy_loop_timeout(unsigned long end_time) { return true; @@ -117,6 +104,15 @@ static inline bool sk_busy_loop(struct sock *sk, int nonblock) #endif /* CONFIG_NET_RX_BUSY_POLL */ +/* used in the NIC receive handler to mark the skb */ +static inline void skb_mark_napi_id(struct sk_buff *skb, + struct napi_struct *napi) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + skb->napi_id = napi->napi_id; +#endif +} + /* used in the protocol hanlder to propagate the napi_id to the socket */ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) { From 2b5cd0dfa384242f78a396b90087368c9440cc9a Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 24 Mar 2017 10:08:12 -0700 Subject: [PATCH 4/8] net: Change return type of sk_busy_loop from bool to void checking the return value of sk_busy_loop. As there are only a few consumers of that data, and the data being checked for can be replaced with a check for !skb_queue_empty() we might as well just pull the code out of sk_busy_loop and place it in the spots that actually need it. Signed-off-by: Alexander Duyck Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/busy_poll.h | 5 ++--- net/core/datagram.c | 8 ++++++-- net/core/dev.c | 25 +++++++++++-------------- net/sctp/socket.c | 9 ++++++--- 4 files changed, 25 insertions(+), 22 deletions(-) diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index b82d6ba70a148..c55760f4820fd 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -74,7 +74,7 @@ static inline bool busy_loop_timeout(unsigned long end_time) return time_after(now, end_time); } -bool sk_busy_loop(struct sock *sk, int nonblock); +void sk_busy_loop(struct sock *sk, int nonblock); #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) @@ -97,9 +97,8 @@ static inline bool busy_loop_timeout(unsigned long end_time) return true; } -static inline bool sk_busy_loop(struct sock *sk, int nonblock) +static inline void sk_busy_loop(struct sock *sk, int nonblock) { - return false; } #endif /* CONFIG_NET_RX_BUSY_POLL */ diff --git a/net/core/datagram.c b/net/core/datagram.c index ea633342ab0d0..4608aa245410c 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -256,8 +256,12 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, } spin_unlock_irqrestore(&queue->lock, cpu_flags); - } while (sk_can_busy_loop(sk) && - sk_busy_loop(sk, flags & MSG_DONTWAIT)); + + if (!sk_can_busy_loop(sk)) + break; + + sk_busy_loop(sk, flags & MSG_DONTWAIT); + } while (!skb_queue_empty(&sk->sk_receive_queue)); error = -EAGAIN; diff --git a/net/core/dev.c b/net/core/dev.c index ab337bf5bbf43..af70eb6ba6827 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5060,21 +5060,19 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) do_softirq(); } -bool sk_busy_loop(struct sock *sk, int nonblock) +void sk_busy_loop(struct sock *sk, int nonblock) { unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0; int (*napi_poll)(struct napi_struct *napi, int budget); void *have_poll_lock = NULL; struct napi_struct *napi; unsigned int napi_id; - int rc; restart: napi_id = READ_ONCE(sk->sk_napi_id); if (napi_id < MIN_NAPI_ID) - return 0; + return; - rc = false; napi_poll = NULL; rcu_read_lock(); @@ -5085,7 +5083,8 @@ bool sk_busy_loop(struct sock *sk, int nonblock) preempt_disable(); for (;;) { - rc = 0; + int work = 0; + local_bh_disable(); if (!napi_poll) { unsigned long val = READ_ONCE(napi->state); @@ -5103,12 +5102,12 @@ bool sk_busy_loop(struct sock *sk, int nonblock) have_poll_lock = netpoll_poll_lock(napi); napi_poll = napi->poll; } - rc = napi_poll(napi, BUSY_POLL_BUDGET); - trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); + work = napi_poll(napi, BUSY_POLL_BUDGET); + trace_napi_poll(napi, work, BUSY_POLL_BUDGET); count: - if (rc > 0) + if (work > 0) __NET_ADD_STATS(sock_net(sk), - LINUX_MIB_BUSYPOLLRXPACKETS, rc); + LINUX_MIB_BUSYPOLLRXPACKETS, work); local_bh_enable(); if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) || @@ -5121,9 +5120,9 @@ bool sk_busy_loop(struct sock *sk, int nonblock) preempt_enable(); rcu_read_unlock(); cond_resched(); - rc = !skb_queue_empty(&sk->sk_receive_queue); - if (rc || busy_loop_timeout(end_time)) - return rc; + if (!skb_queue_empty(&sk->sk_receive_queue) || + busy_loop_timeout(end_time)) + return; goto restart; } cpu_relax(); @@ -5131,10 +5130,8 @@ bool sk_busy_loop(struct sock *sk, int nonblock) if (napi_poll) busy_poll_stop(napi, have_poll_lock); preempt_enable(); - rc = !skb_queue_empty(&sk->sk_receive_queue); out: rcu_read_unlock(); - return rc; } EXPORT_SYMBOL(sk_busy_loop); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 72cc3ecf6516d..ccc08fc39722a 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -7518,9 +7518,12 @@ struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags, if (sk->sk_shutdown & RCV_SHUTDOWN) break; - if (sk_can_busy_loop(sk) && - sk_busy_loop(sk, noblock)) - continue; + if (sk_can_busy_loop(sk)) { + sk_busy_loop(sk, noblock); + + if (!skb_queue_empty(&sk->sk_receive_queue)) + continue; + } /* User doesn't want to wait. */ error = -EAGAIN; From 37056719bba500d0d2b8216fdf641e5507ec9a0e Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 24 Mar 2017 10:08:18 -0700 Subject: [PATCH 5/8] net: Track start of busy loop instead of when it should end This patch flips the logic we were using to determine if the busy polling has timed out. The main motivation for this is that we will need to support two different possible timeout values in the future and by recording the start time rather than when we would want to end we can focus on making the end_time specific to the task be it epoll or socket based polling. Signed-off-by: Alexander Duyck Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- fs/select.c | 16 +++++----- include/net/busy_poll.h | 68 +++++++++++++++++++++++------------------ net/core/dev.c | 6 ++-- 3 files changed, 49 insertions(+), 41 deletions(-) diff --git a/fs/select.c b/fs/select.c index e2112270d75a5..9287d3a96e355 100644 --- a/fs/select.c +++ b/fs/select.c @@ -409,7 +409,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) int retval, i, timed_out = 0; u64 slack = 0; unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; - unsigned long busy_end = 0; + unsigned long busy_start = 0; rcu_read_lock(); retval = max_select_fd(n, fds); @@ -512,11 +512,11 @@ int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { - if (!busy_end) { - busy_end = busy_loop_end_time(); + if (!busy_start) { + busy_start = busy_loop_current_time(); continue; } - if (!busy_loop_timeout(busy_end)) + if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; @@ -800,7 +800,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, int timed_out = 0, count = 0; u64 slack = 0; unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; - unsigned long busy_end = 0; + unsigned long busy_start = 0; /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { @@ -853,11 +853,11 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { - if (!busy_end) { - busy_end = busy_loop_end_time(); + if (!busy_start) { + busy_start = busy_loop_current_time(); continue; } - if (!busy_loop_timeout(busy_end)) + if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index c55760f4820fd..72c82f2ea5366 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -46,62 +46,70 @@ static inline bool net_busy_loop_on(void) return sysctl_net_busy_poll; } -static inline u64 busy_loop_us_clock(void) +static inline bool sk_can_busy_loop(const struct sock *sk) { - return local_clock() >> 10; + return sk->sk_ll_usec && !signal_pending(current); } -static inline unsigned long sk_busy_loop_end_time(struct sock *sk) -{ - return busy_loop_us_clock() + ACCESS_ONCE(sk->sk_ll_usec); -} +void sk_busy_loop(struct sock *sk, int nonblock); -/* in poll/select we use the global sysctl_net_ll_poll value */ -static inline unsigned long busy_loop_end_time(void) +#else /* CONFIG_NET_RX_BUSY_POLL */ +static inline unsigned long net_busy_loop_on(void) { - return busy_loop_us_clock() + ACCESS_ONCE(sysctl_net_busy_poll); + return 0; } -static inline bool sk_can_busy_loop(const struct sock *sk) +static inline bool sk_can_busy_loop(struct sock *sk) { - return sk->sk_ll_usec && !signal_pending(current); + return false; } -static inline bool busy_loop_timeout(unsigned long end_time) +static inline void sk_busy_loop(struct sock *sk, int nonblock) { - unsigned long now = busy_loop_us_clock(); - - return time_after(now, end_time); } -void sk_busy_loop(struct sock *sk, int nonblock); +#endif /* CONFIG_NET_RX_BUSY_POLL */ -#else /* CONFIG_NET_RX_BUSY_POLL */ -static inline unsigned long net_busy_loop_on(void) +static inline unsigned long busy_loop_current_time(void) { +#ifdef CONFIG_NET_RX_BUSY_POLL + return (unsigned long)(local_clock() >> 10); +#else return 0; +#endif } -static inline unsigned long busy_loop_end_time(void) +/* in poll/select we use the global sysctl_net_ll_poll value */ +static inline bool busy_loop_timeout(unsigned long start_time) { - return 0; -} +#ifdef CONFIG_NET_RX_BUSY_POLL + unsigned long bp_usec = READ_ONCE(sysctl_net_busy_poll); -static inline bool sk_can_busy_loop(struct sock *sk) -{ - return false; -} + if (bp_usec) { + unsigned long end_time = start_time + bp_usec; + unsigned long now = busy_loop_current_time(); -static inline bool busy_loop_timeout(unsigned long end_time) -{ + return time_after(now, end_time); + } +#endif return true; } -static inline void sk_busy_loop(struct sock *sk, int nonblock) +static inline bool sk_busy_loop_timeout(struct sock *sk, + unsigned long start_time) { -} +#ifdef CONFIG_NET_RX_BUSY_POLL + unsigned long bp_usec = READ_ONCE(sk->sk_ll_usec); -#endif /* CONFIG_NET_RX_BUSY_POLL */ + if (bp_usec) { + unsigned long end_time = start_time + bp_usec; + unsigned long now = busy_loop_current_time(); + + return time_after(now, end_time); + } +#endif + return true; +} /* used in the NIC receive handler to mark the skb */ static inline void skb_mark_napi_id(struct sk_buff *skb, diff --git a/net/core/dev.c b/net/core/dev.c index af70eb6ba6827..2d1b5613b7fdb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5062,7 +5062,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) void sk_busy_loop(struct sock *sk, int nonblock) { - unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0; + unsigned long start_time = nonblock ? 0 : busy_loop_current_time(); int (*napi_poll)(struct napi_struct *napi, int budget); void *have_poll_lock = NULL; struct napi_struct *napi; @@ -5111,7 +5111,7 @@ void sk_busy_loop(struct sock *sk, int nonblock) local_bh_enable(); if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) || - busy_loop_timeout(end_time)) + sk_busy_loop_timeout(sk, start_time)) break; if (unlikely(need_resched())) { @@ -5121,7 +5121,7 @@ void sk_busy_loop(struct sock *sk, int nonblock) rcu_read_unlock(); cond_resched(); if (!skb_queue_empty(&sk->sk_receive_queue) || - busy_loop_timeout(end_time)) + sk_busy_loop_timeout(sk, start_time)) return; goto restart; } From 7db6b048da3b9f84fe1d22fb29ff7e7c2ec6c0e5 Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Fri, 24 Mar 2017 10:08:24 -0700 Subject: [PATCH 6/8] net: Commonize busy polling code to focus on napi_id instead of socket Move the core functionality in sk_busy_loop() to napi_busy_loop() and make it independent of sk. This enables re-using this function in epoll busy loop implementation. Signed-off-by: Sridhar Samudrala Signed-off-by: Alexander Duyck Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/busy_poll.h | 20 +++++++++++++++----- net/core/dev.c | 21 ++++++++------------- net/core/sock.c | 11 +++++++++++ 3 files changed, 34 insertions(+), 18 deletions(-) diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 72c82f2ea5366..8ffd434676b7a 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -51,7 +51,11 @@ static inline bool sk_can_busy_loop(const struct sock *sk) return sk->sk_ll_usec && !signal_pending(current); } -void sk_busy_loop(struct sock *sk, int nonblock); +bool sk_busy_loop_end(void *p, unsigned long start_time); + +void napi_busy_loop(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg); #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) @@ -64,10 +68,6 @@ static inline bool sk_can_busy_loop(struct sock *sk) return false; } -static inline void sk_busy_loop(struct sock *sk, int nonblock) -{ -} - #endif /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long busy_loop_current_time(void) @@ -111,6 +111,16 @@ static inline bool sk_busy_loop_timeout(struct sock *sk, return true; } +static inline void sk_busy_loop(struct sock *sk, int nonblock) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + unsigned int napi_id = READ_ONCE(sk->sk_napi_id); + + if (napi_id >= MIN_NAPI_ID) + napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk); +#endif +} + /* used in the NIC receive handler to mark the skb */ static inline void skb_mark_napi_id(struct sk_buff *skb, struct napi_struct *napi) diff --git a/net/core/dev.c b/net/core/dev.c index 2d1b5613b7fdb..ef9fe60ee294b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5060,19 +5060,16 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) do_softirq(); } -void sk_busy_loop(struct sock *sk, int nonblock) +void napi_busy_loop(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg) { - unsigned long start_time = nonblock ? 0 : busy_loop_current_time(); + unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); void *have_poll_lock = NULL; struct napi_struct *napi; - unsigned int napi_id; restart: - napi_id = READ_ONCE(sk->sk_napi_id); - if (napi_id < MIN_NAPI_ID) - return; - napi_poll = NULL; rcu_read_lock(); @@ -5106,12 +5103,11 @@ void sk_busy_loop(struct sock *sk, int nonblock) trace_napi_poll(napi, work, BUSY_POLL_BUDGET); count: if (work > 0) - __NET_ADD_STATS(sock_net(sk), + __NET_ADD_STATS(dev_net(napi->dev), LINUX_MIB_BUSYPOLLRXPACKETS, work); local_bh_enable(); - if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) || - sk_busy_loop_timeout(sk, start_time)) + if (!loop_end || loop_end(loop_end_arg, start_time)) break; if (unlikely(need_resched())) { @@ -5120,8 +5116,7 @@ void sk_busy_loop(struct sock *sk, int nonblock) preempt_enable(); rcu_read_unlock(); cond_resched(); - if (!skb_queue_empty(&sk->sk_receive_queue) || - sk_busy_loop_timeout(sk, start_time)) + if (loop_end(loop_end_arg, start_time)) return; goto restart; } @@ -5133,7 +5128,7 @@ void sk_busy_loop(struct sock *sk, int nonblock) out: rcu_read_unlock(); } -EXPORT_SYMBOL(sk_busy_loop); +EXPORT_SYMBOL(napi_busy_loop); #endif /* CONFIG_NET_RX_BUSY_POLL */ diff --git a/net/core/sock.c b/net/core/sock.c index 1b9030ee6f4ba..4b762f2a3552b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3237,3 +3237,14 @@ static int __init proto_init(void) subsys_initcall(proto_init); #endif /* PROC_FS */ + +#ifdef CONFIG_NET_RX_BUSY_POLL +bool sk_busy_loop_end(void *p, unsigned long start_time) +{ + struct sock *sk = p; + + return !skb_queue_empty(&sk->sk_receive_queue) || + sk_busy_loop_timeout(sk, start_time); +} +EXPORT_SYMBOL(sk_busy_loop_end); +#endif /* CONFIG_NET_RX_BUSY_POLL */ From bf3b9f6372c45b0fbf24d86b8794910d20170017 Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Fri, 24 Mar 2017 10:08:30 -0700 Subject: [PATCH 7/8] epoll: Add busy poll support to epoll with socket fds. This patch adds busy poll support to epoll. The implementation is meant to be opportunistic in that it will take the NAPI ID from the last socket that is added to the ready list that contains a valid NAPI ID and it will use that for busy polling until the ready list goes empty. Once the ready list goes empty the NAPI ID is reset and busy polling is disabled until a new socket is added to the ready list. In addition when we insert a new socket into the epoll we record the NAPI ID and assume we are going to receive events on it. If that doesn't occur it will be evicted as the active NAPI ID and we will resume normal behavior. An application can use SO_INCOMING_CPU or SO_REUSEPORT_ATTACH_C/EBPF socket options to spread the incoming connections to specific worker threads based on the incoming queue. This enables epoll for each worker thread to have only sockets that receive packets from a single queue. So when an application calls epoll_wait() and there are no events available to report, busy polling is done on the associated queue to pull the packets. Signed-off-by: Sridhar Samudrala Signed-off-by: Alexander Duyck Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- fs/eventpoll.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 341251421ced0..5420767c9b686 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -42,6 +42,7 @@ #include #include #include +#include /* * LOCKING: @@ -224,6 +225,11 @@ struct eventpoll { /* used to optimize loop detection check */ int visited; struct list_head visited_list_link; + +#ifdef CONFIG_NET_RX_BUSY_POLL + /* used to track busy poll napi_id */ + unsigned int napi_id; +#endif }; /* Wait structure used by the poll hooks */ @@ -384,6 +390,77 @@ static inline int ep_events_available(struct eventpoll *ep) return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR; } +#ifdef CONFIG_NET_RX_BUSY_POLL +static bool ep_busy_loop_end(void *p, unsigned long start_time) +{ + struct eventpoll *ep = p; + + return ep_events_available(ep) || busy_loop_timeout(start_time); +} +#endif /* CONFIG_NET_RX_BUSY_POLL */ + +/* + * Busy poll if globally on and supporting sockets found && no events, + * busy loop will return if need_resched or ep_events_available. + * + * we must do our busy polling with irqs enabled + */ +static void ep_busy_loop(struct eventpoll *ep, int nonblock) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + unsigned int napi_id = READ_ONCE(ep->napi_id); + + if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) + napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep); +#endif +} + +static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + if (ep->napi_id) + ep->napi_id = 0; +#endif +} + +/* + * Set epoll busy poll NAPI ID from sk. + */ +static inline void ep_set_busy_poll_napi_id(struct epitem *epi) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + struct eventpoll *ep; + unsigned int napi_id; + struct socket *sock; + struct sock *sk; + int err; + + if (!net_busy_loop_on()) + return; + + sock = sock_from_file(epi->ffd.file, &err); + if (!sock) + return; + + sk = sock->sk; + if (!sk) + return; + + napi_id = READ_ONCE(sk->sk_napi_id); + ep = epi->ep; + + /* Non-NAPI IDs can be rejected + * or + * Nothing to do if we already have this ID + */ + if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id) + return; + + /* record NAPI ID for use in next busy poll */ + ep->napi_id = napi_id; +#endif +} + /** * ep_call_nested - Perform a bound (possibly) nested call, by checking * that the recursion limit is not exceeded, and that @@ -1022,6 +1099,8 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k spin_lock_irqsave(&ep->lock, flags); + ep_set_busy_poll_napi_id(epi); + /* * If the event mask does not contain any poll(2) event, we consider the * descriptor to be disabled. This condition is likely the effect of the @@ -1363,6 +1442,9 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, /* We have to drop the new item inside our item list to keep track of it */ spin_lock_irqsave(&ep->lock, flags); + /* record NAPI ID of new item if present */ + ep_set_busy_poll_napi_id(epi); + /* If the file is already "ready" we drop it inside the ready list */ if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); @@ -1637,9 +1719,20 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, } fetch_events: + + if (!ep_events_available(ep)) + ep_busy_loop(ep, timed_out); + spin_lock_irqsave(&ep->lock, flags); if (!ep_events_available(ep)) { + /* + * Busy poll timed out. Drop NAPI ID for now, we can add + * it back in when we have moved a socket with a valid NAPI + * ID onto the ready list. + */ + ep_reset_busy_poll_napi_id(ep); + /* * We don't have any available event to return to the caller. * We need to sleep here, and we will be wake up by From 6d4339028b350efbf87c61e6d9e113e5373545c9 Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Fri, 24 Mar 2017 10:08:36 -0700 Subject: [PATCH 8/8] net: Introduce SO_INCOMING_NAPI_ID This socket option returns the NAPI ID associated with the queue on which the last frame is received. This information can be used by the apps to split the incoming flows among the threads based on the Rx queue on which they are received. If the NAPI ID actually represents a sender_cpu then the value is ignored and 0 is returned. Signed-off-by: Sridhar Samudrala Signed-off-by: Alexander Duyck Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- arch/alpha/include/uapi/asm/socket.h | 2 ++ arch/avr32/include/uapi/asm/socket.h | 2 ++ arch/frv/include/uapi/asm/socket.h | 2 ++ arch/ia64/include/uapi/asm/socket.h | 2 ++ arch/m32r/include/uapi/asm/socket.h | 2 ++ arch/mips/include/uapi/asm/socket.h | 1 + arch/mn10300/include/uapi/asm/socket.h | 2 ++ arch/parisc/include/uapi/asm/socket.h | 2 ++ arch/powerpc/include/uapi/asm/socket.h | 2 ++ arch/s390/include/uapi/asm/socket.h | 2 ++ arch/sparc/include/uapi/asm/socket.h | 2 ++ arch/xtensa/include/uapi/asm/socket.h | 2 ++ include/uapi/asm-generic/socket.h | 2 ++ net/core/sock.c | 12 ++++++++++++ 14 files changed, 37 insertions(+) diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 089db42c1b402..1bb8cac61a284 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -101,4 +101,6 @@ #define SO_MEMINFO 55 +#define SO_INCOMING_NAPI_ID 56 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h index 6eabcbd2f82a3..f824eeb0f2e4c 100644 --- a/arch/avr32/include/uapi/asm/socket.h +++ b/arch/avr32/include/uapi/asm/socket.h @@ -94,4 +94,6 @@ #define SO_MEMINFO 55 +#define SO_INCOMING_NAPI_ID 56 + #endif /* _UAPI__ASM_AVR32_SOCKET_H */ diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h index bd497f8356b98..a8ad9bebfc47e 100644 --- a/arch/frv/include/uapi/asm/socket.h +++ b/arch/frv/include/uapi/asm/socket.h @@ -94,5 +94,7 @@ #define SO_MEMINFO 55 +#define SO_INCOMING_NAPI_ID 56 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h index f1bb546861687..6af3253e42095 100644 --- a/arch/ia64/include/uapi/asm/socket.h +++ b/arch/ia64/include/uapi/asm/socket.h @@ -103,4 +103,6 @@ #define SO_MEMINFO 55 +#define SO_INCOMING_NAPI_ID 56 + #endif /* _ASM_IA64_SOCKET_H */ diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h index 459c46076f6f8..e98b6bb897c0d 100644 --- a/arch/m32r/include/uapi/asm/socket.h +++ b/arch/m32r/include/uapi/asm/socket.h @@ -94,4 +94,6 @@ #define SO_MEMINFO 55 +#define SO_INCOMING_NAPI_ID 56 + #endif /* _ASM_M32R_SOCKET_H */ diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 688c18dd62ef6..ae2b62e39d4db 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -112,5 +112,6 @@ #define SO_MEMINFO 55 +#define SO_INCOMING_NAPI_ID 56 #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h index 312d2c457a047..e4ac1843ee017 100644 --- a/arch/mn10300/include/uapi/asm/socket.h +++ b/arch/mn10300/include/uapi/asm/socket.h @@ -94,4 +94,6 @@ #define SO_MEMINFO 55 +#define SO_INCOMING_NAPI_ID 56 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index b98ec38f20833..f754c793e82a4 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -93,4 +93,6 @@ #define SO_MEMINFO 0x4030 +#define SO_INCOMING_NAPI_ID 0x4031 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h index 099a889240f67..5f84af7dcb2e5 100644 --- a/arch/powerpc/include/uapi/asm/socket.h +++ b/arch/powerpc/include/uapi/asm/socket.h @@ -101,4 +101,6 @@ #define SO_MEMINFO 55 +#define SO_INCOMING_NAPI_ID 56 + #endif /* _ASM_POWERPC_SOCKET_H */ diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h index 6199bb34f7fa6..25ac4960e7075 100644 --- a/arch/s390/include/uapi/asm/socket.h +++ b/arch/s390/include/uapi/asm/socket.h @@ -100,4 +100,6 @@ #define SO_MEMINFO 55 +#define SO_INCOMING_NAPI_ID 56 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 12cd8c2ec4228..b05513acd589b 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -90,6 +90,8 @@ #define SO_MEMINFO 0x0039 +#define SO_INCOMING_NAPI_ID 0x003a + /* Security levels - as per NRL IPv6 - don't actually do anything */ #define SO_SECURITY_AUTHENTICATION 0x5001 #define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002 diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h index d0b85f6c14840..786606c81edd0 100644 --- a/arch/xtensa/include/uapi/asm/socket.h +++ b/arch/xtensa/include/uapi/asm/socket.h @@ -105,4 +105,6 @@ #define SO_MEMINFO 55 +#define SO_INCOMING_NAPI_ID 56 + #endif /* _XTENSA_SOCKET_H */ diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 8313702c1eae5..c98a52fb572a4 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -96,4 +96,6 @@ #define SO_MEMINFO 55 +#define SO_INCOMING_NAPI_ID 56 + #endif /* __ASM_GENERIC_SOCKET_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 4b762f2a3552b..1a58a9dc6888c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1328,6 +1328,18 @@ int sock_getsockopt(struct socket *sock, int level, int optname, goto lenout; } + +#ifdef CONFIG_NET_RX_BUSY_POLL + case SO_INCOMING_NAPI_ID: + v.val = READ_ONCE(sk->sk_napi_id); + + /* aggregate non-NAPI IDs down to 0 */ + if (v.val < MIN_NAPI_ID) + v.val = 0; + + break; +#endif + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7).