Skip to content

Commit

Permalink
netlink: mmaped netlink: ring setup
Browse files Browse the repository at this point in the history
Add support for mmap'ed RX and TX ring setup and teardown based on the
af_packet.c code. The following patches will use this to add the real
mmap'ed receive and transmit functionality.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
Patrick McHardy authored and David S. Miller committed Apr 19, 2013
1 parent cf0a018 commit ccdfcc3
Show file tree
Hide file tree
Showing 4 changed files with 327 additions and 2 deletions.
32 changes: 32 additions & 0 deletions include/uapi/linux/netlink.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef _UAPI__LINUX_NETLINK_H
#define _UAPI__LINUX_NETLINK_H

#include <linux/kernel.h>
#include <linux/socket.h> /* for __kernel_sa_family_t */
#include <linux/types.h>

Expand Down Expand Up @@ -105,11 +106,42 @@ struct nlmsgerr {
#define NETLINK_PKTINFO 3
#define NETLINK_BROADCAST_ERROR 4
#define NETLINK_NO_ENOBUFS 5
#define NETLINK_RX_RING 6
#define NETLINK_TX_RING 7

struct nl_pktinfo {
__u32 group;
};

struct nl_mmap_req {
unsigned int nm_block_size;
unsigned int nm_block_nr;
unsigned int nm_frame_size;
unsigned int nm_frame_nr;
};

/*
 * Header structure for mmap'ed netlink frames. Its aligned size
 * (NL_MMAP_HDRLEN) is the minimum frame size accepted at ring setup.
 * NOTE(review): the ring-setup code does not read or write any of these
 * fields yet; the per-field notes below are assumptions to confirm against
 * the RX/TX patches that follow.
 */
struct nl_mmap_hdr {
/* presumably one of enum nl_mmap_status - TODO confirm */
unsigned int nm_status;
/* presumably the length of the message stored in the frame - TODO confirm */
unsigned int nm_len;
/* presumably the destination/source multicast group - TODO confirm */
__u32 nm_group;
/* credentials */
__u32 nm_pid;
__u32 nm_uid;
__u32 nm_gid;
};

/*
 * Frame status values, numbered 0..4 in declaration order.
 * NOTE(review): presumably stored in nl_mmap_hdr.nm_status to hand frames
 * between kernel and userspace; the ring-setup code does not use these
 * states, so their exact semantics must be confirmed against the RX/TX
 * implementation.
 */
enum nl_mmap_status {
NL_MMAP_STATUS_UNUSED,
NL_MMAP_STATUS_RESERVED,
NL_MMAP_STATUS_VALID,
NL_MMAP_STATUS_COPY,
NL_MMAP_STATUS_SKIP,
};

/* Alignment unit for mmap'ed frames: identical to the netlink message alignment. */
#define NL_MMAP_MSG_ALIGNMENT NLMSG_ALIGNTO
/* Round sz up to a multiple of NL_MMAP_MSG_ALIGNMENT. */
#define NL_MMAP_MSG_ALIGN(sz) __ALIGN_KERNEL(sz, NL_MMAP_MSG_ALIGNMENT)
/* Size of struct nl_mmap_hdr after alignment; the smallest frame size a ring may use. */
#define NL_MMAP_HDRLEN NL_MMAP_MSG_ALIGN(sizeof(struct nl_mmap_hdr))

#define NET_MAJOR 36 /* Major 36 is reserved for networking */

enum {
Expand Down
9 changes: 9 additions & 0 deletions net/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,15 @@ menuconfig NET

if NET

config NETLINK_MMAP
bool "Netlink: mmaped IO"
help
This option enables support for memory mapped netlink IO. This
reduces overhead by avoiding copying data between kernel- and
userspace.

If unsure, say N.

config WANT_COMPAT_NETLINK_MESSAGES
bool
help
Expand Down
268 changes: 266 additions & 2 deletions net/netlink/af_netlink.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
#include <linux/types.h>
#include <linux/audit.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>

#include <net/net_namespace.h>
#include <net/sock.h>
Expand Down Expand Up @@ -107,6 +108,234 @@ static inline struct hlist_head *nl_portid_hashfn(struct nl_portid_hash *hash, u
return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask];
}

#ifdef CONFIG_NETLINK_MMAP
/*
 * Translate a kernel virtual address inside a ring block to its struct page.
 * Blocks may come from either vmalloc space or the page allocator, so the
 * lookup helper is chosen according to where @addr lives.
 */
static __pure struct page *pgvec_to_page(const void *addr)
{
	return is_vmalloc_addr(addr) ? vmalloc_to_page(addr)
				     : virt_to_page(addr);
}

/*
 * Release a block vector: every non-NULL block, then the vector itself.
 *
 * @pg_vec: array of @len block addresses (NULL slots are skipped)
 * @order:  page order used for blocks that came from the page allocator
 * @len:    number of slots in @pg_vec
 */
static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len)
{
	unsigned int idx;

	for (idx = 0; idx < len; idx++) {
		void *block = pg_vec[idx];

		if (block == NULL)
			continue;

		/* Free with the allocator the block originally came from. */
		if (is_vmalloc_addr(block))
			vfree(block);
		else
			free_pages((unsigned long)block, order);
	}
	kfree(pg_vec);
}

/*
 * Allocate one zeroed ring block of 2^@order pages.
 *
 * Three attempts are made, in order:
 *   1. the page allocator without retrying (__GFP_NORETRY),
 *   2. vzalloc(),
 *   3. the page allocator again with __GFP_NORETRY cleared.
 *
 * Returns the block address, or NULL if all three attempts fail.
 */
static void *alloc_one_pg_vec_page(unsigned long order)
{
	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO |
			  __GFP_NOWARN | __GFP_NORETRY;
	void *block;

	block = (void *)__get_free_pages(gfp_flags, order);
	if (block == NULL)
		block = vzalloc((1 << order) * PAGE_SIZE);
	if (block == NULL)
		block = (void *)__get_free_pages(gfp_flags & ~__GFP_NORETRY,
						 order);
	return block;
}

/*
 * Allocate the block vector for a ring.
 *
 * @nlk:   owning netlink socket (currently unused)
 * @req:   ring geometry; only nm_block_nr is read here
 * @order: page order of each block
 *
 * Returns an array of req->nm_block_nr block addresses, or NULL if the
 * array or any block could not be allocated. On failure everything
 * allocated so far is released again. The caller owns the result and frees
 * it with free_pg_vec().
 */
static void **alloc_pg_vec(struct netlink_sock *nlk,
			   struct nl_mmap_req *req, unsigned int order)
{
	unsigned int block_nr = req->nm_block_nr;
	unsigned int i;
	void **pg_vec;

	/* Zeroed, so free_pg_vec() can skip slots that were never filled. */
	pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL);
	if (pg_vec == NULL)
		return NULL;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i] = alloc_one_pg_vec_page(order);
		if (pg_vec[i] == NULL)
			goto err1;
	}

	return pg_vec;
err1:
	free_pg_vec(pg_vec, order, block_nr);
	return NULL;
}

/*
 * Install, replace-with-nothing, or tear down the RX or TX ring of a socket.
 *
 * @sk:      netlink socket
 * @req:     requested geometry; nm_block_nr == 0 means "no ring".
 *           Note that req->nm_block_nr is overwritten (swapped with the
 *           previous ring's block count) when the new state is committed.
 * @closing: true when called from socket destruction; skips the
 *           mapped/pending busy checks
 * @tx_ring: true for the TX ring, false for the RX ring
 *
 * Returns 0 on success, -EBUSY if the ring is mapped, has pending frames or
 * already exists, -EINVAL for an inconsistent geometry, -ENOMEM if the
 * blocks cannot be allocated.
 */
static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
bool closing, bool tx_ring)
{
struct netlink_sock *nlk = nlk_sk(sk);
struct netlink_ring *ring;
struct sk_buff_head *queue;
void **pg_vec = NULL;
unsigned int order = 0;
int err;

/* Each ring is paired with the socket queue of the same direction. */
ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

if (!closing) {
/* Early, unlocked rejection; the mapped count is re-checked under the lock. */
if (atomic_read(&nlk->mapped))
return -EBUSY;
if (atomic_read(&ring->pending))
return -EBUSY;
}

if (req->nm_block_nr) {
/* An existing ring must be torn down before a new one is set up. */
if (ring->pg_vec != NULL)
return -EBUSY;

/* The signed cast also rejects sizes with the top bit set. */
if ((int)req->nm_block_size <= 0)
return -EINVAL;
if (!IS_ALIGNED(req->nm_block_size, PAGE_SIZE))
return -EINVAL;
if (req->nm_frame_size < NL_MMAP_HDRLEN)
return -EINVAL;
if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT))
return -EINVAL;

/*
 * NOTE(review): ring->frames_per_block is written here, before the
 * remaining checks and the allocation have succeeded and outside
 * pg_vec_lock, so a rejected request leaves it modified. Also, the
 * multiplication below is not checked for wrap-around - confirm
 * whether that matters for the field's type.
 */
ring->frames_per_block = req->nm_block_size /
req->nm_frame_size;
if (ring->frames_per_block == 0)
return -EINVAL;
if (ring->frames_per_block * req->nm_block_nr !=
req->nm_frame_nr)
return -EINVAL;

order = get_order(req->nm_block_size);
pg_vec = alloc_pg_vec(nlk, req, order);
if (pg_vec == NULL)
return -ENOMEM;
} else {
/* No blocks requested: the frame count must be zero as well. */
if (req->nm_frame_nr)
return -EINVAL;
}

err = -EBUSY;
mutex_lock(&nlk->pg_vec_lock);
if (closing || atomic_read(&nlk->mapped) == 0) {
err = 0;
spin_lock_bh(&queue->lock);

ring->frame_max = req->nm_frame_nr - 1;
ring->head = 0;
ring->frame_size = req->nm_frame_size;
ring->pg_vec_pages = req->nm_block_size / PAGE_SIZE;

/*
 * Exchange new and old state: after these swaps the locals (and
 * req->nm_block_nr) describe the PREVIOUS ring, which is freed
 * below once the locks are dropped.
 */
swap(ring->pg_vec_len, req->nm_block_nr);
swap(ring->pg_vec_order, order);
swap(ring->pg_vec, pg_vec);

/* Drop everything queued in this direction while the queue lock is held. */
__skb_queue_purge(queue);
spin_unlock_bh(&queue->lock);

/* Only reachable with mapped != 0 on the closing path. */
WARN_ON(atomic_read(&nlk->mapped));
}
mutex_unlock(&nlk->pg_vec_lock);

/*
 * Frees the old ring on success, or the freshly allocated and never
 * installed vector if the commit was refused with -EBUSY.
 */
if (pg_vec)
free_pg_vec(pg_vec, order, req->nm_block_nr);
return err;
}

/*
 * VMA open callback: account for one more mapping of the socket's rings by
 * incrementing the socket's mapped counter. Nothing is done if the socket
 * behind the file no longer has a struct sock attached.
 */
static void netlink_mm_open(struct vm_area_struct *vma)
{
	struct socket *sock = vma->vm_file->private_data;
	struct sock *sk = sock->sk;

	if (!sk)
		return;

	atomic_inc(&nlk_sk(sk)->mapped);
}

/*
 * VMA close callback: drop one mapping reference by decrementing the
 * socket's mapped counter. Nothing is done if the socket behind the file no
 * longer has a struct sock attached.
 */
static void netlink_mm_close(struct vm_area_struct *vma)
{
	struct socket *sock = vma->vm_file->private_data;
	struct sock *sk = sock->sk;

	if (!sk)
		return;

	atomic_dec(&nlk_sk(sk)->mapped);
}

/*
 * VMA callbacks installed by netlink_mmap(). They keep the socket's mapped
 * counter in step with VMA open/close events; netlink_set_ring() refuses to
 * change a ring while that counter is non-zero.
 */
static const struct vm_operations_struct netlink_mmap_ops = {
.open = netlink_mm_open,
.close = netlink_mm_close,
};

/*
 * mmap() handler for netlink sockets: map all configured rings into @vma.
 *
 * The RX ring (if any) is mapped first, immediately followed by the TX ring
 * (if any). The mapping must start at page offset 0 and its length must be
 * exactly the combined size of the configured rings.
 *
 * @file: file backing the socket (unused here)
 * @sock: netlink socket being mapped
 * @vma:  user mapping to populate
 *
 * Returns 0 on success, -EINVAL if the offset is non-zero, no ring is
 * configured or the size does not match, or the error reported by
 * vm_insert_page().
 */
static int netlink_mmap(struct file *file, struct socket *sock,
			struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	struct netlink_ring *ring;
	unsigned long start, size, expected;
	unsigned int i;
	int err = -EINVAL;

	if (vma->vm_pgoff)
		return -EINVAL;

	/* Keep the ring layout stable against netlink_set_ring(). */
	mutex_lock(&nlk->pg_vec_lock);

	/* Total bytes occupied by all rings that are currently set up. */
	expected = 0;
	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
		if (ring->pg_vec == NULL)
			continue;
		expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE;
	}

	if (expected == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected)
		goto out;

	/* Insert every page of every block, in ring order, back to back. */
	start = vma->vm_start;
	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
		if (ring->pg_vec == NULL)
			continue;

		for (i = 0; i < ring->pg_vec_len; i++) {
			struct page *page;
			void *kaddr = ring->pg_vec[i];
			unsigned int pg_num;

			for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) {
				page = pgvec_to_page(kaddr);
				err = vm_insert_page(vma, start, page);
				if (err < 0)
					goto out;
				start += PAGE_SIZE;
				kaddr += PAGE_SIZE;
			}
		}
	}

	/* Count this mapping; later VMA open/close events adjust the count. */
	atomic_inc(&nlk->mapped);
	vma->vm_ops = &netlink_mmap_ops;
	err = 0;
out:
	mutex_unlock(&nlk->pg_vec_lock);
	/* Propagate failures instead of unconditionally reporting success. */
	return err;
}
#else /* CONFIG_NETLINK_MMAP */
#define netlink_mmap sock_no_mmap
#endif /* CONFIG_NETLINK_MMAP */

static void netlink_destroy_callback(struct netlink_callback *cb)
{
kfree_skb(cb->skb);
Expand Down Expand Up @@ -146,6 +375,18 @@ static void netlink_sock_destruct(struct sock *sk)
}

skb_queue_purge(&sk->sk_receive_queue);
#ifdef CONFIG_NETLINK_MMAP
if (1) {
struct nl_mmap_req req;

memset(&req, 0, sizeof(req));
if (nlk->rx_ring.pg_vec)
netlink_set_ring(sk, &req, true, false);
memset(&req, 0, sizeof(req));
if (nlk->tx_ring.pg_vec)
netlink_set_ring(sk, &req, true, true);
}
#endif /* CONFIG_NETLINK_MMAP */

if (!sock_flag(sk, SOCK_DEAD)) {
printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
Expand Down Expand Up @@ -409,6 +650,9 @@ static int __netlink_create(struct net *net, struct socket *sock,
mutex_init(nlk->cb_mutex);
}
init_waitqueue_head(&nlk->wait);
#ifdef CONFIG_NETLINK_MMAP
mutex_init(&nlk->pg_vec_lock);
#endif

sk->sk_destruct = netlink_sock_destruct;
sk->sk_protocol = protocol;
Expand Down Expand Up @@ -1211,7 +1455,8 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
if (level != SOL_NETLINK)
return -ENOPROTOOPT;

if (optlen >= sizeof(int) &&
if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING &&
optlen >= sizeof(int) &&
get_user(val, (unsigned int __user *)optval))
return -EFAULT;

Expand Down Expand Up @@ -1260,6 +1505,25 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
}
err = 0;
break;
#ifdef CONFIG_NETLINK_MMAP
case NETLINK_RX_RING:
case NETLINK_TX_RING: {
struct nl_mmap_req req;

/* Rings might consume more memory than queue limits, require
* CAP_NET_ADMIN.
*/
if (!capable(CAP_NET_ADMIN))
return -EPERM;
if (optlen < sizeof(req))
return -EINVAL;
if (copy_from_user(&req, optval, sizeof(req)))
return -EFAULT;
err = netlink_set_ring(sk, &req, false,
optname == NETLINK_TX_RING);
break;
}
#endif /* CONFIG_NETLINK_MMAP */
default:
err = -ENOPROTOOPT;
}
Expand Down Expand Up @@ -2093,7 +2357,7 @@ static const struct proto_ops netlink_ops = {
.getsockopt = netlink_getsockopt,
.sendmsg = netlink_sendmsg,
.recvmsg = netlink_recvmsg,
.mmap = sock_no_mmap,
.mmap = netlink_mmap,
.sendpage = sock_no_sendpage,
};

Expand Down
Loading

0 comments on commit ccdfcc3

Please sign in to comment.