Merge branch 'busypoll-preemption-and-other-optimizations'
Eric Dumazet says:

====================
net: busy-poll: allow preemption and other optimizations

It is time to add preemption points to sk_busy_loop() and improve
its scalability.

Also, napi_complete() and friends can now tell drivers when it is safe
not to re-enable device interrupts, saving some overhead under
heavy busy polling.

mlx4 and bnx2x are changed accordingly, to show how this busy polling
status can be exploited by drivers.

Next steps will implement Zach Brown's suggestion, where NAPI polling
would be enabled all the time for some chosen queues.
This is needed for efficient epoll() support anyway.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
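
For illustration only, here is a minimal sketch of the driver-side pattern this series enables, assuming hypothetical foo_* helpers (the real conversions are in the bnx2x and mlx4 diffs below). napi_complete_done() now returns false while sk_busy_loop() owns the NAPI instance, in which case the driver should leave its device interrupt masked:

	/* Sketch: a NAPI poll handler using the new bool return value of
	 * napi_complete_done(). struct foo_ring, foo_clean_rx() and
	 * foo_enable_irq() are hypothetical placeholders, not a real driver.
	 */
	static int foo_poll(struct napi_struct *napi, int budget)
	{
		struct foo_ring *ring = container_of(napi, struct foo_ring, napi);
		int work_done = foo_clean_rx(ring, budget);

		if (work_done < budget &&
		    napi_complete_done(napi, work_done)) {
			/* NAPI really completed: safe to re-arm the device IRQ.
			 * Under busy polling, napi_complete_done() returns false
			 * and the interrupt stays masked.
			 */
			foo_enable_irq(ring);
		}
		return work_done;
	}
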
David S. Miller committed Nov 16, 2016
2 parents 2874aa2 + 80f1c21 commit fc3f914
Showing 5 changed files with 113 additions and 38 deletions.
15 changes: 8 additions & 7 deletions drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -3248,13 +3248,14 @@ static int bnx2x_poll(struct napi_struct *napi, int budget)
 			rmb();
 
 			if (!(bnx2x_has_rx_work(fp) || bnx2x_has_tx_work(fp))) {
-				napi_complete(napi);
-				/* Re-enable interrupts */
-				DP(NETIF_MSG_RX_STATUS,
-				   "Update index to %d\n", fp->fp_hc_idx);
-				bnx2x_ack_sb(bp, fp->igu_sb_id, USTORM_ID,
-					     le16_to_cpu(fp->fp_hc_idx),
-					     IGU_INT_ENABLE, 1);
+				if (napi_complete_done(napi, rx_work_done)) {
+					/* Re-enable interrupts */
+					DP(NETIF_MSG_RX_STATUS,
+					   "Update index to %d\n", fp->fp_hc_idx);
+					bnx2x_ack_sb(bp, fp->igu_sb_id, USTORM_ID,
+						     le16_to_cpu(fp->fp_hc_idx),
+						     IGU_INT_ENABLE, 1);
+				}
 			} else {
 				rx_work_done = budget;
 			}
4 changes: 2 additions & 2 deletions drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -1137,8 +1137,8 @@ int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget)
 		done = 0;
 	}
 	/* Done for now */
-	napi_complete_done(napi, done);
-	mlx4_en_arm_cq(priv, cq);
+	if (napi_complete_done(napi, done))
+		mlx4_en_arm_cq(priv, cq);
 	return done;
 }

17 changes: 14 additions & 3 deletions include/linux/netdevice.h
@@ -334,6 +334,16 @@ enum {
 	NAPI_STATE_NPSVC,	/* Netpoll - don't dequeue from poll_list */
 	NAPI_STATE_HASHED,	/* In NAPI hash (busy polling possible) */
 	NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
+	NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
 };
 
+enum {
+	NAPIF_STATE_SCHED	 = (1UL << NAPI_STATE_SCHED),
+	NAPIF_STATE_DISABLE	 = (1UL << NAPI_STATE_DISABLE),
+	NAPIF_STATE_NPSVC	 = (1UL << NAPI_STATE_NPSVC),
+	NAPIF_STATE_HASHED	 = (1UL << NAPI_STATE_HASHED),
+	NAPIF_STATE_NO_BUSY_POLL = (1UL << NAPI_STATE_NO_BUSY_POLL),
+	NAPIF_STATE_IN_BUSY_POLL = (1UL << NAPI_STATE_IN_BUSY_POLL),
+};
+
 enum gro_result {
@@ -453,16 +463,17 @@ static inline bool napi_reschedule(struct napi_struct *napi)
 	return false;
 }
 
-void __napi_complete(struct napi_struct *n);
-void napi_complete_done(struct napi_struct *n, int work_done);
+bool __napi_complete(struct napi_struct *n);
+bool napi_complete_done(struct napi_struct *n, int work_done);
 /**
  *	napi_complete - NAPI processing complete
  *	@n: NAPI context
  *
  * Mark NAPI processing as complete.
  * Consider using napi_complete_done() instead.
+ * Return false if device should avoid rearming interrupts.
  */
-static inline void napi_complete(struct napi_struct *n)
+static inline bool napi_complete(struct napi_struct *n)
 {
 	return napi_complete_done(n, 0);
 }
5 changes: 2 additions & 3 deletions include/net/busy_poll.h
@@ -58,10 +58,9 @@ static inline unsigned long busy_loop_end_time(void)
 	return busy_loop_us_clock() + ACCESS_ONCE(sysctl_net_busy_poll);
 }
 
-static inline bool sk_can_busy_loop(struct sock *sk)
+static inline bool sk_can_busy_loop(const struct sock *sk)
 {
-	return sk->sk_ll_usec && sk->sk_napi_id &&
-	       !need_resched() && !signal_pending(current);
+	return sk->sk_ll_usec && sk->sk_napi_id && !signal_pending(current);
 }


110 changes: 87 additions & 23 deletions net/core/dev.c
@@ -4898,26 +4898,36 @@ void __napi_schedule_irqoff(struct napi_struct *n)
 }
 EXPORT_SYMBOL(__napi_schedule_irqoff);
 
-void __napi_complete(struct napi_struct *n)
+bool __napi_complete(struct napi_struct *n)
 {
 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
 
+	/* Some drivers call us directly, instead of calling
+	 * napi_complete_done().
+	 */
+	if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
+		return false;
+
 	list_del_init(&n->poll_list);
 	smp_mb__before_atomic();
 	clear_bit(NAPI_STATE_SCHED, &n->state);
+	return true;
 }
 EXPORT_SYMBOL(__napi_complete);
 
-void napi_complete_done(struct napi_struct *n, int work_done)
+bool napi_complete_done(struct napi_struct *n, int work_done)
 {
 	unsigned long flags;
 
 	/*
-	 * don't let napi dequeue from the cpu poll list
-	 * just in case its running on a different cpu
+	 * 1) Don't let napi dequeue from the cpu poll list
+	 *    just in case its running on a different cpu.
+	 * 2) If we are busy polling, do nothing here, we have
+	 *    the guarantee we will be called later.
 	 */
-	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
-		return;
+	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
+				 NAPIF_STATE_IN_BUSY_POLL)))
+		return false;
 
 	if (n->gro_list) {
 		unsigned long timeout = 0;
@@ -4939,6 +4949,7 @@ void napi_complete_done(struct napi_struct *n, int work_done)
 		__napi_complete(n);
 		local_irq_restore(flags);
 	}
+	return true;
 }
 EXPORT_SYMBOL(napi_complete_done);

@@ -4956,13 +4967,41 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
 }
 
 #if defined(CONFIG_NET_RX_BUSY_POLL)
+
 #define BUSY_POLL_BUDGET 8
+
+static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
+{
+	int rc;
+
+	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
+
+	local_bh_disable();
+
+	/* All we really want here is to re-enable device interrupts.
+	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
+	 */
+	rc = napi->poll(napi, BUSY_POLL_BUDGET);
+	netpoll_poll_unlock(have_poll_lock);
+	if (rc == BUSY_POLL_BUDGET)
+		__napi_schedule(napi);
+	local_bh_enable();
+	if (local_softirq_pending())
+		do_softirq();
+}
+
 bool sk_busy_loop(struct sock *sk, int nonblock)
 {
 	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
+	int (*napi_poll)(struct napi_struct *napi, int budget);
 	int (*busy_poll)(struct napi_struct *dev);
+	void *have_poll_lock = NULL;
 	struct napi_struct *napi;
-	int rc = false;
+	int rc;
+
+restart:
+	rc = false;
+	napi_poll = NULL;
 
 	rcu_read_lock();

@@ -4973,24 +5012,33 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
 	/* Note: ndo_busy_poll method is optional in linux-4.5 */
 	busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
 
-	do {
+	preempt_disable();
+	for (;;) {
 		rc = 0;
 		local_bh_disable();
 		if (busy_poll) {
 			rc = busy_poll(napi);
-		} else if (napi_schedule_prep(napi)) {
-			void *have = netpoll_poll_lock(napi);
-
-			if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
-				rc = napi->poll(napi, BUSY_POLL_BUDGET);
-				trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
-				if (rc == BUSY_POLL_BUDGET) {
-					napi_complete_done(napi, rc);
-					napi_schedule(napi);
-				}
-			}
-			netpoll_poll_unlock(have);
+			goto count;
+		}
+		if (!napi_poll) {
+			unsigned long val = READ_ONCE(napi->state);
+
+			/* If multiple threads are competing for this napi,
+			 * we avoid dirtying napi->state as much as we can.
+			 */
+			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
+				   NAPIF_STATE_IN_BUSY_POLL))
+				goto count;
+			if (cmpxchg(&napi->state, val,
+				    val | NAPIF_STATE_IN_BUSY_POLL |
+					  NAPIF_STATE_SCHED) != val)
+				goto count;
+			have_poll_lock = netpoll_poll_lock(napi);
+			napi_poll = napi->poll;
 		}
+		rc = napi_poll(napi, BUSY_POLL_BUDGET);
+		trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
+count:
 		if (rc > 0)
 			__NET_ADD_STATS(sock_net(sk),
 					LINUX_MIB_BUSYPOLLRXPACKETS, rc);
@@ -4999,10 +5047,26 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
 		if (rc == LL_FLUSH_FAILED)
 			break; /* permanent failure */
 
-		cpu_relax();
-	} while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
-		 !need_resched() && !busy_loop_timeout(end_time));
+		if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
+		    busy_loop_timeout(end_time))
+			break;
 
+		if (unlikely(need_resched())) {
+			if (napi_poll)
+				busy_poll_stop(napi, have_poll_lock);
+			preempt_enable();
+			rcu_read_unlock();
+			cond_resched();
+			rc = !skb_queue_empty(&sk->sk_receive_queue);
+			if (rc || busy_loop_timeout(end_time))
+				return rc;
+			goto restart;
+		}
+		cpu_relax_lowlatency();
+	}
+	if (napi_poll)
+		busy_poll_stop(napi, have_poll_lock);
+	preempt_enable();
 	rc = !skb_queue_empty(&sk->sk_receive_queue);
 out:
 	rcu_read_unlock();
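
For context, a simplified sketch of how a socket receive path is expected to invoke the loop above; this caller pattern predates the series (it exists, roughly in this form, in the TCP receive path) and is shown here only to illustrate where sk_can_busy_loop() and sk_busy_loop() plug in. The nonblock flag comes from the recvmsg flags:

	/* Sketch: busy poll only while the receive queue is empty and the
	 * socket is in a state where more packets can still arrive.
	 */
	if (sk_can_busy_loop(sk) &&
	    skb_queue_empty(&sk->sk_receive_queue) &&
	    sk->sk_state == TCP_ESTABLISHED)
		sk_busy_loop(sk, nonblock);
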
