From 217f6974368188fd8bd7804bf5a036aa5762c5e4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Nov 2016 10:15:11 -0800 Subject: [PATCH 1/5] net: busy-poll: allow preemption in sk_busy_loop() After commit 4cd13c21b207 ("softirq: Let ksoftirqd do its job"), sk_busy_loop() needs a bit of care : softirqs might be delayed since we do not allow preemption yet. This patch adds preemptiom points in sk_busy_loop(), and makes sure no unnecessary cache line dirtying or atomic operations are done while looping. A new flag is added into napi->state : NAPI_STATE_IN_BUSY_POLL This prevents napi_complete_done() from clearing NAPIF_STATE_SCHED, so that sk_busy_loop() does not have to grab it again. Similarly, netpoll_poll_lock() is done one time. This gives about 10 to 20 % improvement in various busy polling tests, especially when many threads are busy polling in configurations with large number of NIC queues. This should allow experimenting with bigger delays without hurting overall latencies. Tested: On a 40Gb mlx4 NIC, 32 RX/TX queues. echo 70 >/proc/sys/net/core/busy_read for i in `seq 1 40`; do echo -n $i: ; ./super_netperf $i -H lpaa24 -t UDP_RR -- -N -n; done Before: After: 1: 90072 92819 2: 157289 184007 3: 235772 213504 4: 344074 357513 5: 394755 458267 6: 461151 487819 7: 549116 625963 8: 544423 716219 9: 720460 738446 10: 794686 837612 11: 915998 923960 12: 937507 925107 13: 1019677 971506 14: 1046831 1113650 15: 1114154 1148902 16: 1105221 1179263 17: 1266552 1299585 18: 1258454 1383817 19: 1341453 1312194 20: 1363557 1488487 21: 1387979 1501004 22: 1417552 1601683 23: 1550049 1642002 24: 1568876 1601915 25: 1560239 1683607 26: 1640207 1745211 27: 1706540 1723574 28: 1638518 1722036 29: 1734309 1757447 30: 1782007 1855436 31: 1724806 1888539 32: 1717716 1944297 33: 1778716 1869118 34: 1805738 1983466 35: 1815694 2020758 36: 1893059 2035632 37: 1843406 2034653 38: 1888830 2086580 39: 1972827 2143567 40: 1877729 2181851 Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Adam Belay Cc: Tariq Toukan Cc: Yuval Mintz Cc: Ariel Elior Signed-off-by: David S. Miller --- include/linux/netdevice.h | 10 ++++ net/core/dev.c | 102 ++++++++++++++++++++++++++++++-------- 2 files changed, 92 insertions(+), 20 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 86bacf6a64f08..e71de66e37929 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -334,6 +334,16 @@ enum { NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */ NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */ + NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */ +}; + +enum { + NAPIF_STATE_SCHED = (1UL << NAPI_STATE_SCHED), + NAPIF_STATE_DISABLE = (1UL << NAPI_STATE_DISABLE), + NAPIF_STATE_NPSVC = (1UL << NAPI_STATE_NPSVC), + NAPIF_STATE_HASHED = (1UL << NAPI_STATE_HASHED), + NAPIF_STATE_NO_BUSY_POLL = (1UL << NAPI_STATE_NO_BUSY_POLL), + NAPIF_STATE_IN_BUSY_POLL = (1UL << NAPI_STATE_IN_BUSY_POLL), }; enum gro_result { diff --git a/net/core/dev.c b/net/core/dev.c index 6deba68ad9e48..369dcc8efc019 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4902,6 +4902,12 @@ void __napi_complete(struct napi_struct *n) { BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); + /* Some drivers call us directly, instead of calling + * napi_complete_done(). + */ + if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state))) + return; + list_del_init(&n->poll_list); smp_mb__before_atomic(); clear_bit(NAPI_STATE_SCHED, &n->state); @@ -4913,10 +4919,13 @@ void napi_complete_done(struct napi_struct *n, int work_done) unsigned long flags; /* - * don't let napi dequeue from the cpu poll list - * just in case its running on a different cpu + * 1) Don't let napi dequeue from the cpu poll list + * just in case its running on a different cpu. + * 2) If we are busy polling, do nothing here, we have + * the guarantee we will be called later. */ - if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) + if (unlikely(n->state & (NAPIF_STATE_NPSVC | + NAPIF_STATE_IN_BUSY_POLL))) return; if (n->gro_list) { @@ -4956,13 +4965,41 @@ static struct napi_struct *napi_by_id(unsigned int napi_id) } #if defined(CONFIG_NET_RX_BUSY_POLL) + #define BUSY_POLL_BUDGET 8 + +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) +{ + int rc; + + clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); + + local_bh_disable(); + + /* All we really want here is to re-enable device interrupts. + * Ideally, a new ndo_busy_poll_stop() could avoid another round. + */ + rc = napi->poll(napi, BUSY_POLL_BUDGET); + netpoll_poll_unlock(have_poll_lock); + if (rc == BUSY_POLL_BUDGET) + __napi_schedule(napi); + local_bh_enable(); + if (local_softirq_pending()) + do_softirq(); +} + bool sk_busy_loop(struct sock *sk, int nonblock) { unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0; + int (*napi_poll)(struct napi_struct *napi, int budget); int (*busy_poll)(struct napi_struct *dev); + void *have_poll_lock = NULL; struct napi_struct *napi; - int rc = false; + int rc; + +restart: + rc = false; + napi_poll = NULL; rcu_read_lock(); @@ -4973,24 +5010,33 @@ bool sk_busy_loop(struct sock *sk, int nonblock) /* Note: ndo_busy_poll method is optional in linux-4.5 */ busy_poll = napi->dev->netdev_ops->ndo_busy_poll; - do { + preempt_disable(); + for (;;) { rc = 0; local_bh_disable(); if (busy_poll) { rc = busy_poll(napi); - } else if (napi_schedule_prep(napi)) { - void *have = netpoll_poll_lock(napi); - - if (test_bit(NAPI_STATE_SCHED, &napi->state)) { - rc = napi->poll(napi, BUSY_POLL_BUDGET); - trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); - if (rc == BUSY_POLL_BUDGET) { - napi_complete_done(napi, rc); - napi_schedule(napi); - } - } - netpoll_poll_unlock(have); + goto count; } + if (!napi_poll) { + unsigned long val = READ_ONCE(napi->state); + + /* If multiple threads are competing for this napi, + * we avoid dirtying napi->state as much as we can. + */ + if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | + NAPIF_STATE_IN_BUSY_POLL)) + goto count; + if (cmpxchg(&napi->state, val, + val | NAPIF_STATE_IN_BUSY_POLL | + NAPIF_STATE_SCHED) != val) + goto count; + have_poll_lock = netpoll_poll_lock(napi); + napi_poll = napi->poll; + } + rc = napi_poll(napi, BUSY_POLL_BUDGET); + trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); +count: if (rc > 0) __NET_ADD_STATS(sock_net(sk), LINUX_MIB_BUSYPOLLRXPACKETS, rc); @@ -4999,10 +5045,26 @@ bool sk_busy_loop(struct sock *sk, int nonblock) if (rc == LL_FLUSH_FAILED) break; /* permanent failure */ - cpu_relax(); - } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && - !need_resched() && !busy_loop_timeout(end_time)); + if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) || + busy_loop_timeout(end_time)) + break; + if (unlikely(need_resched())) { + if (napi_poll) + busy_poll_stop(napi, have_poll_lock); + preempt_enable(); + rcu_read_unlock(); + cond_resched(); + rc = !skb_queue_empty(&sk->sk_receive_queue); + if (rc || busy_loop_timeout(end_time)) + return rc; + goto restart; + } + cpu_relax_lowlatency(); + } + if (napi_poll) + busy_poll_stop(napi, have_poll_lock); + preempt_enable(); rc = !skb_queue_empty(&sk->sk_receive_queue); out: rcu_read_unlock(); From 21cb84c48ca0619181106f0f44f3802a989de024 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Nov 2016 10:15:12 -0800 Subject: [PATCH 2/5] net: busy-poll: remove need_resched() from sk_can_busy_loop() Now sk_busy_loop() can schedule by itself, we can remove need_resched() check from sk_can_busy_loop() Also add a const to its struct sock parameter. Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Adam Belay Cc: Tariq Toukan Cc: Yuval Mintz Cc: Ariel Elior Signed-off-by: David S. Miller --- include/net/busy_poll.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 2fbeb1313c0f4..965e52b9b5a3d 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -58,10 +58,9 @@ static inline unsigned long busy_loop_end_time(void) return busy_loop_us_clock() + ACCESS_ONCE(sysctl_net_busy_poll); } -static inline bool sk_can_busy_loop(struct sock *sk) +static inline bool sk_can_busy_loop(const struct sock *sk) { - return sk->sk_ll_usec && sk->sk_napi_id && - !need_resched() && !signal_pending(current); + return sk->sk_ll_usec && sk->sk_napi_id && !signal_pending(current); } From 364b6055738b4c752c30ccaaf25c624e69d76195 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Nov 2016 10:15:13 -0800 Subject: [PATCH 3/5] net: busy-poll: return busypolling status to drivers NAPI drivers use napi_complete_done() or napi_complete() when they drained RX ring and right before re-enabling device interrupts. In busy polling, we can avoid interrupts being delivered since we are polling RX ring in a controlled loop. Drivers can chose to use napi_complete_done() return value to reduce interrupts overhead while busy polling is active. This is optional, legacy drivers should work fine even if not updated. Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Adam Belay Cc: Tariq Toukan Cc: Yuval Mintz Cc: Ariel Elior Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 ++++--- net/core/dev.c | 10 ++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e71de66e37929..bcddf951ccee9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -463,16 +463,17 @@ static inline bool napi_reschedule(struct napi_struct *napi) return false; } -void __napi_complete(struct napi_struct *n); -void napi_complete_done(struct napi_struct *n, int work_done); +bool __napi_complete(struct napi_struct *n); +bool napi_complete_done(struct napi_struct *n, int work_done); /** * napi_complete - NAPI processing complete * @n: NAPI context * * Mark NAPI processing as complete. * Consider using napi_complete_done() instead. + * Return false if device should avoid rearming interrupts. */ -static inline void napi_complete(struct napi_struct *n) +static inline bool napi_complete(struct napi_struct *n) { return napi_complete_done(n, 0); } diff --git a/net/core/dev.c b/net/core/dev.c index 369dcc8efc019..edba9efeb2e93 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4898,7 +4898,7 @@ void __napi_schedule_irqoff(struct napi_struct *n) } EXPORT_SYMBOL(__napi_schedule_irqoff); -void __napi_complete(struct napi_struct *n) +bool __napi_complete(struct napi_struct *n) { BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); @@ -4906,15 +4906,16 @@ void __napi_complete(struct napi_struct *n) * napi_complete_done(). */ if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state))) - return; + return false; list_del_init(&n->poll_list); smp_mb__before_atomic(); clear_bit(NAPI_STATE_SCHED, &n->state); + return true; } EXPORT_SYMBOL(__napi_complete); -void napi_complete_done(struct napi_struct *n, int work_done) +bool napi_complete_done(struct napi_struct *n, int work_done) { unsigned long flags; @@ -4926,7 +4927,7 @@ void napi_complete_done(struct napi_struct *n, int work_done) */ if (unlikely(n->state & (NAPIF_STATE_NPSVC | NAPIF_STATE_IN_BUSY_POLL))) - return; + return false; if (n->gro_list) { unsigned long timeout = 0; @@ -4948,6 +4949,7 @@ void napi_complete_done(struct napi_struct *n, int work_done) __napi_complete(n); local_irq_restore(flags); } + return true; } EXPORT_SYMBOL(napi_complete_done); From 2e713283751f494596655d9125c168aeb913f71d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Nov 2016 10:15:14 -0800 Subject: [PATCH 4/5] net/mlx4_en: use napi_complete_done() return value Do not rearm interrupts if we are busy polling. mlx4 uses separate CQ for TX and RX, so number of TX interrupts does not change, unfortunately. Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Adam Belay Cc: Tariq Toukan Cc: Yuval Mintz Cc: Ariel Elior Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 2cc91002064f9..22f08f9ef4645 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -1137,8 +1137,8 @@ int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget) done = 0; } /* Done for now */ - napi_complete_done(napi, done); - mlx4_en_arm_cq(priv, cq); + if (napi_complete_done(napi, done)) + mlx4_en_arm_cq(priv, cq); return done; } From 80f1c21c53dc5906944eac52d643f63e82ad1673 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Nov 2016 10:15:15 -0800 Subject: [PATCH 5/5] bnx2x: switch to napi_complete_done() Switch from napi_complete() to napi_complete_done() for better GRO support (gro_flush_timeout) and core NAPI features. Do not rearm interrupts if we are busy polling, to reduce bus and interrupts overhead. Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Adam Belay Cc: Tariq Toukan Cc: Yuval Mintz Cc: Ariel Elior Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index ed42c10096855..3fd36b421d51b 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -3248,13 +3248,14 @@ static int bnx2x_poll(struct napi_struct *napi, int budget) rmb(); if (!(bnx2x_has_rx_work(fp) || bnx2x_has_tx_work(fp))) { - napi_complete(napi); - /* Re-enable interrupts */ - DP(NETIF_MSG_RX_STATUS, - "Update index to %d\n", fp->fp_hc_idx); - bnx2x_ack_sb(bp, fp->igu_sb_id, USTORM_ID, - le16_to_cpu(fp->fp_hc_idx), - IGU_INT_ENABLE, 1); + if (napi_complete_done(napi, rx_work_done)) { + /* Re-enable interrupts */ + DP(NETIF_MSG_RX_STATUS, + "Update index to %d\n", fp->fp_hc_idx); + bnx2x_ack_sb(bp, fp->igu_sb_id, USTORM_ID, + le16_to_cpu(fp->fp_hc_idx), + IGU_INT_ENABLE, 1); + } } else { rx_work_done = budget; }