Skip to content

Commit

Permalink
net: poll/select low latency socket support
Browse files Browse the repository at this point in the history
select/poll busy-poll support.

Split sysctl value into two separate ones, one for read and one for poll.
Updated Documentation/sysctl/net.txt accordingly.

Add a new poll flag POLL_LL. When this flag is set, sock_poll will call
sk_poll_ll if possible. sock_poll sets this flag in its return value
to indicate to select/poll when a socket that can busy poll is found.

When poll/select have nothing to report, call the low-level
sock_poll again until we are out of time or we find something.

Once the system call finds something, it stops setting POLL_LL, so it can
return the result to the user ASAP.

Signed-off-by: Eliezer Tamir <eliezer.tamir@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
Eliezer Tamir authored and David S. Miller committed Jun 25, 2013
1 parent e4f2379 commit 2d48d67
Show file tree
Hide file tree
Showing 7 changed files with 91 additions and 22 deletions.
18 changes: 16 additions & 2 deletions Documentation/sysctl/net.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,27 @@ The maximum number of packets that kernel can handle on a NAPI interrupt,
it's a Per-CPU variable.
Default: 64

low_latency_poll
low_latency_read
----------------
Low latency busy poll timeout. (needs CONFIG_NET_LL_RX_POLL)
Low latency busy poll timeout for socket reads. (needs CONFIG_NET_LL_RX_POLL)
Approximate time in us to spin waiting for packets on the device queue.
This sets the default value of the SO_LL socket option.
Can be set or overridden per socket by setting socket option SO_LL.
Recommended value is 50. May increase power usage.
Default: 0 (off)

low_latency_poll
----------------
Low latency busy poll timeout for poll and select. (needs CONFIG_NET_LL_RX_POLL)
Approximate time in us to spin waiting for packets on the device queue.
Recommended value depends on the number of sockets you poll on.
For several sockets use 50, for several hundred use 100.
For more than that you probably want to use epoll.
Note that only sockets with SO_LL set will be busy polled, so you want to either
selectively set SO_LL on those sockets or set net.core.low_latency_read globally.
May increase power usage.
Default: 0 (off)

rmem_default
------------

Expand Down
34 changes: 29 additions & 5 deletions fs/select.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <linux/rcupdate.h>
#include <linux/hrtimer.h>
#include <linux/sched/rt.h>
#include <net/ll_poll.h>

#include <asm/uaccess.h>

Expand Down Expand Up @@ -384,9 +385,10 @@ static int max_select_fd(unsigned long n, fd_set_bits *fds)
#define POLLEX_SET (POLLPRI)

static inline void wait_key_set(poll_table *wait, unsigned long in,
unsigned long out, unsigned long bit)
unsigned long out, unsigned long bit,
unsigned int ll_flag)
{
wait->_key = POLLEX_SET;
wait->_key = POLLEX_SET | ll_flag;
if (in & bit)
wait->_key |= POLLIN_SET;
if (out & bit)
Expand All @@ -400,6 +402,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
poll_table *wait;
int retval, i, timed_out = 0;
unsigned long slack = 0;
unsigned int ll_flag = POLL_LL;
u64 ll_time = ll_end_time();

rcu_read_lock();
retval = max_select_fd(n, fds);
Expand All @@ -422,6 +426,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
retval = 0;
for (;;) {
unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
bool can_ll = false;

inp = fds->in; outp = fds->out; exp = fds->ex;
rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
Expand Down Expand Up @@ -449,7 +454,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
f_op = f.file->f_op;
mask = DEFAULT_POLLMASK;
if (f_op && f_op->poll) {
wait_key_set(wait, in, out, bit);
wait_key_set(wait, in, out,
bit, ll_flag);
mask = (*f_op->poll)(f.file, wait);
}
fdput(f);
Expand All @@ -468,6 +474,11 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
retval++;
wait->_qproc = NULL;
}
if (mask & POLL_LL)
can_ll = true;
/* got something, stop busy polling */
if (retval)
ll_flag = 0;
}
}
if (res_in)
Expand All @@ -486,6 +497,9 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
break;
}

if (can_ll && can_poll_ll(ll_time))
continue;

/*
* If this is the first loop and we have a timeout
* given, then we convert to ktime_t and set the to
Expand Down Expand Up @@ -717,7 +731,8 @@ struct poll_list {
* pwait poll_table will be used by the fd-provided poll handler for waiting,
* if pwait->_qproc is non-NULL.
*/
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
bool *can_ll, unsigned int ll_flag)
{
unsigned int mask;
int fd;
Expand All @@ -731,7 +746,10 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
mask = DEFAULT_POLLMASK;
if (f.file->f_op && f.file->f_op->poll) {
pwait->_key = pollfd->events|POLLERR|POLLHUP;
pwait->_key |= ll_flag;
mask = f.file->f_op->poll(f.file, pwait);
if (mask & POLL_LL)
*can_ll = true;
}
/* Mask out unneeded events. */
mask &= pollfd->events | POLLERR | POLLHUP;
Expand All @@ -750,6 +768,8 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
ktime_t expire, *to = NULL;
int timed_out = 0, count = 0;
unsigned long slack = 0;
unsigned int ll_flag = POLL_LL;
u64 ll_time = ll_end_time();

/* Optimise the no-wait case */
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
Expand All @@ -762,6 +782,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list,

for (;;) {
struct poll_list *walk;
bool can_ll = false;

for (walk = list; walk != NULL; walk = walk->next) {
struct pollfd * pfd, * pfd_end;
Expand All @@ -776,9 +797,10 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
* this. They'll get immediately deregistered
* when we break out and return.
*/
if (do_pollfd(pfd, pt)) {
if (do_pollfd(pfd, pt, &can_ll, ll_flag)) {
count++;
pt->_qproc = NULL;
ll_flag = 0;
}
}
}
Expand All @@ -795,6 +817,8 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
if (count || timed_out)
break;

if (can_ll && can_poll_ll(ll_time))
continue;
/*
* If this is the first loop and we have a timeout
* given, then we convert to ktime_t and set the to
Expand Down
35 changes: 22 additions & 13 deletions include/net/ll_poll.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#ifdef CONFIG_NET_LL_RX_POLL

struct napi_struct;
extern unsigned int sysctl_net_ll_read __read_mostly;
extern unsigned int sysctl_net_ll_poll __read_mostly;

/* return values from ndo_ll_poll */
Expand All @@ -38,17 +39,18 @@ extern unsigned int sysctl_net_ll_poll __read_mostly;

/* we can use sched_clock() because we don't care much about precision
* we only care that the average is bounded
* we don't mind a ~2.5% imprecision so <<10 instead of *1000
* sk->sk_ll_usec is a u_int so this can't overflow
*/
static inline u64 ll_end_time(struct sock *sk)
static inline u64 ll_sk_end_time(struct sock *sk)
{
u64 end_time = ACCESS_ONCE(sk->sk_ll_usec);

/* we don't mind a ~2.5% imprecision
* sk->sk_ll_usec is a u_int so this can't overflow
*/
end_time = (end_time << 10) + sched_clock();
return ((u64)ACCESS_ONCE(sk->sk_ll_usec) << 10) + sched_clock();
}

return end_time;
/* in poll/select we use the global sysctl_net_ll_poll value */
static inline u64 ll_end_time(void)
{
return ((u64)ACCESS_ONCE(sysctl_net_ll_poll) << 10) + sched_clock();
}

static inline bool sk_valid_ll(struct sock *sk)
Expand All @@ -62,10 +64,13 @@ static inline bool can_poll_ll(u64 end_time)
return !time_after64(sched_clock(), end_time);
}

/* when used in sock_poll() nonblock is known at compile time to be true
* so the loop and end_time will be optimized out
*/
static inline bool sk_poll_ll(struct sock *sk, int nonblock)
{
u64 end_time = nonblock ? 0 : ll_sk_end_time(sk);
const struct net_device_ops *ops;
u64 end_time = ll_end_time(sk);
struct napi_struct *napi;
int rc = false;

Expand All @@ -84,7 +89,6 @@ static inline bool sk_poll_ll(struct sock *sk, int nonblock)
goto out;

do {

rc = ops->ndo_ll_poll(napi);

if (rc == LL_FLUSH_FAILED)
Expand All @@ -95,8 +99,8 @@ static inline bool sk_poll_ll(struct sock *sk, int nonblock)
NET_ADD_STATS_BH(sock_net(sk),
LINUX_MIB_LOWLATENCYRXPACKETS, rc);

} while (skb_queue_empty(&sk->sk_receive_queue)
&& can_poll_ll(end_time) && !nonblock);
} while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
can_poll_ll(end_time));

rc = !skb_queue_empty(&sk->sk_receive_queue);
out:
Expand All @@ -118,7 +122,12 @@ static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb)

#else /* CONFIG_NET_LL_RX_POLL */

static inline u64 ll_end_time(struct sock *sk)
/*
 * Stub for the per-socket end-time helper, under the name ll_sk_end_time.
 * Always returns 0.
 */
static inline u64 ll_sk_end_time(struct sock *sk)
{
	return 0;
}

/*
 * Same stub under the sk_ll_end_time spelling, kept so that any existing
 * reference to this name still resolves.
 * NOTE(review): no caller of this spelling is visible here - confirm before
 * removing it.
 */
static inline u64 sk_ll_end_time(struct sock *sk)
{
	return 0;
}

/* stub: always reports a zero busy-poll end time to poll/select */
static inline u64 ll_end_time(void)
{
	const u64 no_deadline = 0;

	return no_deadline;
}
Expand Down
2 changes: 2 additions & 0 deletions include/uapi/asm-generic/poll.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@

#define POLLFREE 0x4000 /* currently only for epoll */

#define POLL_LL 0x8000

struct pollfd {
int fd;
short events;
Expand Down
2 changes: 1 addition & 1 deletion net/core/sock.c
Original file line number Diff line number Diff line change
Expand Up @@ -2307,7 +2307,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)

#ifdef CONFIG_NET_LL_RX_POLL
sk->sk_napi_id = 0;
sk->sk_ll_usec = sysctl_net_ll_poll;
sk->sk_ll_usec = sysctl_net_ll_read;
#endif

/*
Expand Down
8 changes: 8 additions & 0 deletions net/core/sysctl_net_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,14 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
{
.procname = "low_latency_read",
.data = &sysctl_net_ll_read,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec
},
#
#endif
#endif /* CONFIG_NET */
{
Expand Down
14 changes: 13 additions & 1 deletion net/socket.c
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@
#include <net/ll_poll.h>

#ifdef CONFIG_NET_LL_RX_POLL
unsigned int sysctl_net_ll_read __read_mostly;
unsigned int sysctl_net_ll_poll __read_mostly;
#endif

Expand Down Expand Up @@ -1147,13 +1148,24 @@ EXPORT_SYMBOL(sock_create_lite);
/* No kernel lock held - perfect */
static unsigned int sock_poll(struct file *file, poll_table *wait)
{
unsigned int ll_flag = 0;
struct socket *sock;

/*
* We can't return errors to poll, so it's either yes or no.
*/
sock = file->private_data;
return sock->ops->poll(file, sock, wait);

if (sk_valid_ll(sock->sk)) {
/* this socket can poll_ll so tell the system call */
ll_flag = POLL_LL;

/* once, only if requested by syscall */
if (wait && (wait->_key & POLL_LL))
sk_poll_ll(sock->sk, 1);
}

return ll_flag | sock->ops->poll(file, sock, wait);
}

static int sock_mmap(struct file *file, struct vm_area_struct *vma)
Expand Down

0 comments on commit 2d48d67

Please sign in to comment.