Skip to content

Commit

Permalink
net: sock_def_readable() and friends RCU conversion
Browse files Browse the repository at this point in the history
sk_callback_lock rwlock actually protects sk->sk_sleep pointer, so we
need two atomic operations (and associated dirtying) per incoming
packet.

RCU conversion is pretty much needed :

1) Add a new structure, called "struct socket_wq" to hold all fields
that will need rcu_read_lock() protection (currently: a
wait_queue_head_t and a struct fasync_struct pointer).

[Future patch will add a list anchor for wakeup coalescing]

2) Attach one of such structure to each "struct socket" created in
sock_alloc_inode().

3) Respect RCU grace period when freeing a "struct socket_wq"

4) Change sk_sleep pointer in "struct sock" by sk_wq, pointer to "struct
socket_wq"

5) Change sk_sleep() function to use new sk->sk_wq instead of
sk->sk_sleep

6) Change sk_has_sleeper() to wq_has_sleeper() that must be used inside
a rcu_read_lock() section.

7) Change all sk_has_sleeper() callers to :
  - Use rcu_read_lock() instead of read_lock(&sk->sk_callback_lock)
  - Use wq_has_sleeper() to eventually wakeup tasks.
  - Use rcu_read_unlock() instead of read_unlock(&sk->sk_callback_lock)

8) sock_wake_async() is modified to use rcu protection as well.

9) Exceptions :
  macvtap, drivers/net/tun.c, af_unix use integrated "struct socket_wq"
instead of dynamically allocated ones. They dont need rcu freeing.

Some cleanups or followups are probably needed, (possible
sk_callback_lock conversion to a spinlock for example...).

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
Eric Dumazet authored and David S. Miller committed May 1, 2010
1 parent 83d7eb2 commit 4381548
Show file tree
Hide file tree
Showing 16 changed files with 181 additions and 114 deletions.
13 changes: 9 additions & 4 deletions drivers/net/macvtap.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
struct macvtap_queue {
struct sock sk;
struct socket sock;
struct socket_wq wq;
struct macvlan_dev *vlan;
struct file *file;
unsigned int flags;
Expand Down Expand Up @@ -242,12 +243,15 @@ static struct rtnl_link_ops macvtap_link_ops __read_mostly = {

static void macvtap_sock_write_space(struct sock *sk)
{
wait_queue_head_t *wqueue;

if (!sock_writeable(sk) ||
!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
return;

if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
wake_up_interruptible_poll(sk_sleep(sk), POLLOUT | POLLWRNORM | POLLWRBAND);
wqueue = sk_sleep(sk);
if (wqueue && waitqueue_active(wqueue))
wake_up_interruptible_poll(wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
}

static int macvtap_open(struct inode *inode, struct file *file)
Expand All @@ -272,7 +276,8 @@ static int macvtap_open(struct inode *inode, struct file *file)
if (!q)
goto out;

init_waitqueue_head(&q->sock.wait);
q->sock.wq = &q->wq;
init_waitqueue_head(&q->wq.wait);
q->sock.type = SOCK_RAW;
q->sock.state = SS_CONNECTED;
q->sock.file = file;
Expand Down Expand Up @@ -308,7 +313,7 @@ static unsigned int macvtap_poll(struct file *file, poll_table * wait)
goto out;

mask = 0;
poll_wait(file, &q->sock.wait, wait);
poll_wait(file, &q->wq.wait, wait);

if (!skb_queue_empty(&q->sk.sk_receive_queue))
mask |= POLLIN | POLLRDNORM;
Expand Down
21 changes: 12 additions & 9 deletions drivers/net/tun.c
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ struct tun_struct {

struct tap_filter txflt;
struct socket socket;

struct socket_wq wq;
#ifdef TUN_DEBUG
int debug;
#endif
Expand Down Expand Up @@ -323,7 +323,7 @@ static void tun_net_uninit(struct net_device *dev)
/* Inform the methods they need to stop using the dev.
*/
if (tfile) {
wake_up_all(&tun->socket.wait);
wake_up_all(&tun->wq.wait);
if (atomic_dec_and_test(&tfile->count))
__tun_detach(tun);
}
Expand Down Expand Up @@ -398,7 +398,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
/* Notify and wake up reader process */
if (tun->flags & TUN_FASYNC)
kill_fasync(&tun->fasync, SIGIO, POLL_IN);
wake_up_interruptible_poll(&tun->socket.wait, POLLIN |
wake_up_interruptible_poll(&tun->wq.wait, POLLIN |
POLLRDNORM | POLLRDBAND);
return NETDEV_TX_OK;

Expand Down Expand Up @@ -498,7 +498,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)

DBG(KERN_INFO "%s: tun_chr_poll\n", tun->dev->name);

poll_wait(file, &tun->socket.wait, wait);
poll_wait(file, &tun->wq.wait, wait);

if (!skb_queue_empty(&sk->sk_receive_queue))
mask |= POLLIN | POLLRDNORM;
Expand Down Expand Up @@ -773,7 +773,7 @@ static ssize_t tun_do_read(struct tun_struct *tun,

DBG(KERN_INFO "%s: tun_chr_read\n", tun->dev->name);

add_wait_queue(&tun->socket.wait, &wait);
add_wait_queue(&tun->wq.wait, &wait);
while (len) {
current->state = TASK_INTERRUPTIBLE;

Expand Down Expand Up @@ -804,7 +804,7 @@ static ssize_t tun_do_read(struct tun_struct *tun,
}

current->state = TASK_RUNNING;
remove_wait_queue(&tun->socket.wait, &wait);
remove_wait_queue(&tun->wq.wait, &wait);

return ret;
}
Expand Down Expand Up @@ -861,15 +861,17 @@ static struct rtnl_link_ops tun_link_ops __read_mostly = {
static void tun_sock_write_space(struct sock *sk)
{
struct tun_struct *tun;
wait_queue_head_t *wqueue;

if (!sock_writeable(sk))
return;

if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
return;

if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
wake_up_interruptible_sync_poll(sk_sleep(sk), POLLOUT |
wqueue = sk_sleep(sk);
if (wqueue && waitqueue_active(wqueue))
wake_up_interruptible_sync_poll(wqueue, POLLOUT |
POLLWRNORM | POLLWRBAND);

tun = tun_sk(sk)->tun;
Expand Down Expand Up @@ -1039,7 +1041,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
if (!sk)
goto err_free_dev;

init_waitqueue_head(&tun->socket.wait);
tun->socket.wq = &tun->wq;
init_waitqueue_head(&tun->wq.wait);
tun->socket.ops = &tun_socket_ops;
sock_init_data(&tun->socket, sk);
sk->sk_write_space = tun_sock_write_space;
Expand Down
14 changes: 9 additions & 5 deletions include/linux/net.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ typedef enum {
#include <linux/wait.h>
#include <linux/fcntl.h> /* For O_CLOEXEC and O_NONBLOCK */
#include <linux/kmemcheck.h>
#include <linux/rcupdate.h>

struct poll_table_struct;
struct pipe_inode_info;
Expand Down Expand Up @@ -116,6 +117,12 @@ enum sock_shutdown_cmd {
SHUT_RDWR = 2,
};

struct socket_wq {
wait_queue_head_t wait;
struct fasync_struct *fasync_list;
struct rcu_head rcu;
} ____cacheline_aligned_in_smp;

/**
* struct socket - general BSD socket
* @state: socket state (%SS_CONNECTED, etc)
Expand All @@ -135,11 +142,8 @@ struct socket {
kmemcheck_bitfield_end(type);

unsigned long flags;
/*
* Please keep fasync_list & wait fields in the same cache line
*/
struct fasync_struct *fasync_list;
wait_queue_head_t wait;

struct socket_wq *wq;

struct file *file;
struct sock *sk;
Expand Down
20 changes: 11 additions & 9 deletions include/net/af_unix.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ struct unix_skb_parms {
#endif
};

#define UNIXCB(skb) (*(struct unix_skb_parms*)&((skb)->cb))
#define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb))
#define UNIXCREDS(skb) (&UNIXCB((skb)).creds)
#define UNIXSID(skb) (&UNIXCB((skb)).secid)

Expand All @@ -45,21 +45,23 @@ struct unix_skb_parms {
struct unix_sock {
/* WARNING: sk has to be the first member */
struct sock sk;
struct unix_address *addr;
struct dentry *dentry;
struct vfsmount *mnt;
struct unix_address *addr;
struct dentry *dentry;
struct vfsmount *mnt;
struct mutex readlock;
struct sock *peer;
struct sock *other;
struct sock *peer;
struct sock *other;
struct list_head link;
atomic_long_t inflight;
spinlock_t lock;
atomic_long_t inflight;
spinlock_t lock;
unsigned int gc_candidate : 1;
unsigned int gc_maybe_cycle : 1;
wait_queue_head_t peer_wait;
struct socket_wq peer_wq;
};
#define unix_sk(__sk) ((struct unix_sock *)__sk)

#define peer_wait peer_wq.wait

#ifdef CONFIG_SYSCTL
extern int unix_sysctl_register(struct net *net);
extern void unix_sysctl_unregister(struct net *net);
Expand Down
38 changes: 19 additions & 19 deletions include/net/sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ struct sock_common {
* @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
* @sk_lock: synchronizer
* @sk_rcvbuf: size of receive buffer in bytes
* @sk_sleep: sock wait queue
* @sk_wq: sock wait queue and async head
* @sk_dst_cache: destination cache
* @sk_dst_lock: destination cache lock
* @sk_policy: flow policy
Expand Down Expand Up @@ -257,7 +257,7 @@ struct sock {
struct sk_buff *tail;
int len;
} sk_backlog;
wait_queue_head_t *sk_sleep;
struct socket_wq *sk_wq;
struct dst_entry *sk_dst_cache;
#ifdef CONFIG_XFRM
struct xfrm_policy *sk_policy[2];
Expand Down Expand Up @@ -1219,7 +1219,7 @@ static inline void sk_set_socket(struct sock *sk, struct socket *sock)

static inline wait_queue_head_t *sk_sleep(struct sock *sk)
{
return sk->sk_sleep;
return &sk->sk_wq->wait;
}
/* Detach socket from process context.
* Announce socket dead, detach it from wait queue and inode.
Expand All @@ -1233,14 +1233,14 @@ static inline void sock_orphan(struct sock *sk)
write_lock_bh(&sk->sk_callback_lock);
sock_set_flag(sk, SOCK_DEAD);
sk_set_socket(sk, NULL);
sk->sk_sleep = NULL;
sk->sk_wq = NULL;
write_unlock_bh(&sk->sk_callback_lock);
}

static inline void sock_graft(struct sock *sk, struct socket *parent)
{
write_lock_bh(&sk->sk_callback_lock);
sk->sk_sleep = &parent->wait;
rcu_assign_pointer(sk->sk_wq, parent->wq);
parent->sk = sk;
sk_set_socket(sk, parent);
security_sock_graft(sk, parent);
Expand Down Expand Up @@ -1392,12 +1392,12 @@ static inline int sk_has_allocations(const struct sock *sk)
}

/**
* sk_has_sleeper - check if there are any waiting processes
* @sk: socket
* wq_has_sleeper - check if there are any waiting processes
* @sk: struct socket_wq
*
* Returns true if socket has waiting processes
* Returns true if socket_wq has waiting processes
*
* The purpose of the sk_has_sleeper and sock_poll_wait is to wrap the memory
* The purpose of the wq_has_sleeper and sock_poll_wait is to wrap the memory
* barrier call. They were added due to the race found within the tcp code.
*
* Consider following tcp code paths:
Expand All @@ -1410,9 +1410,10 @@ static inline int sk_has_allocations(const struct sock *sk)
* ... ...
* tp->rcv_nxt check sock_def_readable
* ... {
* schedule ...
* if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
* wake_up_interruptible(sk_sleep(sk))
* schedule rcu_read_lock();
* wq = rcu_dereference(sk->sk_wq);
* if (wq && waitqueue_active(&wq->wait))
* wake_up_interruptible(&wq->wait)
* ...
* }
*
Expand All @@ -1421,19 +1422,18 @@ static inline int sk_has_allocations(const struct sock *sk)
* could then endup calling schedule and sleep forever if there are no more
* data on the socket.
*
* The sk_has_sleeper is always called right after a call to read_lock, so we
* can use smp_mb__after_lock barrier.
*/
static inline int sk_has_sleeper(struct sock *sk)
static inline bool wq_has_sleeper(struct socket_wq *wq)
{

/*
* We need to be sure we are in sync with the
* add_wait_queue modifications to the wait queue.
*
* This memory barrier is paired in the sock_poll_wait.
*/
smp_mb__after_lock();
return sk_sleep(sk) && waitqueue_active(sk_sleep(sk));
smp_mb();
return wq && waitqueue_active(&wq->wait);
}

/**
Expand All @@ -1442,7 +1442,7 @@ static inline int sk_has_sleeper(struct sock *sk)
* @wait_address: socket wait queue
* @p: poll_table
*
* See the comments in the sk_has_sleeper function.
* See the comments in the wq_has_sleeper function.
*/
static inline void sock_poll_wait(struct file *filp,
wait_queue_head_t *wait_address, poll_table *p)
Expand All @@ -1453,7 +1453,7 @@ static inline void sock_poll_wait(struct file *filp,
* We need to be sure we are in sync with the
* socket flags modification.
*
* This memory barrier is paired in the sk_has_sleeper.
* This memory barrier is paired in the wq_has_sleeper.
*/
smp_mb();
}
Expand Down
22 changes: 14 additions & 8 deletions net/atm/common.c
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,13 @@ static void vcc_sock_destruct(struct sock *sk)

static void vcc_def_wakeup(struct sock *sk)
{
read_lock(&sk->sk_callback_lock);
if (sk_has_sleeper(sk))
wake_up(sk_sleep(sk));
read_unlock(&sk->sk_callback_lock);
struct socket_wq *wq;

rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
if (wq_has_sleeper(wq))
wake_up(&wq->wait);
rcu_read_unlock();
}

static inline int vcc_writable(struct sock *sk)
Expand All @@ -106,16 +109,19 @@ static inline int vcc_writable(struct sock *sk)

static void vcc_write_space(struct sock *sk)
{
read_lock(&sk->sk_callback_lock);
struct socket_wq *wq;

rcu_read_lock();

if (vcc_writable(sk)) {
if (sk_has_sleeper(sk))
wake_up_interruptible(sk_sleep(sk));
wq = rcu_dereference(sk->sk_wq);
if (wq_has_sleeper(wq))
wake_up_interruptible(&wq->wait);

sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
}

read_unlock(&sk->sk_callback_lock);
rcu_read_unlock();
}

static struct proto vcc_proto = {
Expand Down
Loading

0 comments on commit 4381548

Please sign in to comment.