IB/uverbs: Export ib_umem_get()/ib_umem_release() to modules
Export ib_umem_get()/ib_umem_release() and put low-level drivers in
control of when to call ib_umem_get() to pin and DMA map userspace memory,
rather than always calling it in ib_uverbs_reg_mr() before calling the
low-level driver's reg_user_mr method.

Also move these functions to be in the ib_core module instead of
ib_uverbs, so that driver modules using them do not depend on
ib_uverbs.

This has a number of advantages:
 - It is better design from the standpoint of making generic code a
   library that can be used or overridden by device-specific code as
   the details of specific devices dictate.
 - Drivers that do not need to pin userspace memory regions do not
   need to take the performance hit of calling ib_umem_get().  For
   example, although I have not tried to implement it in this patch,
   the ipath driver should be able to avoid pinning memory and just
   use copy_{to,from}_user() to access userspace memory regions.
 - Buffers that need special mapping treatment can be identified by
   the low-level driver.  For example, it may be possible to solve
   some Altix-specific memory ordering issues with mthca CQs in
   userspace by mapping CQ buffers with extra flags.
 - Drivers that need to pin and DMA map userspace memory for things
   other than memory regions can use ib_umem_get() directly, instead
   of hacks using extra parameters to their reg_phys_mr method.  For
   example, the mlx4 driver that is pending being merged needs to pin
   and DMA map QP and CQ buffers, but it does not need to create a
   memory key for these buffers.  So the cleanest solution is for mlx4
   to call ib_umem_get() in the create_qp and create_cq methods.

Signed-off-by: Roland Dreier <rolandd@cisco.com>
Roland Dreier committed May 9, 2007
1 parent 36f021b commit f7c6a7b
Showing 20 changed files with 355 additions and 202 deletions.
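Before the per-file diff, here is a minimal sketch of the driver-side calling convention this commit enables. The ib_umem_get()/ib_umem_release() signatures are taken from the umem.c changes below; everything prefixed mydrv_ is a hypothetical stand-in, and the HCA-specific steps (creating the memory key, programming translation entries) are elided.

#include <linux/err.h>
#include <linux/slab.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_verbs.h>

/* Hypothetical driver MR wrapper -- not part of this commit. */
struct mydrv_mr {
	struct ib_mr	ibmr;
	struct ib_umem *umem;
};

static struct ib_mr *mydrv_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
					u64 virt_addr, int acc_flags,
					struct ib_udata *udata)
{
	struct mydrv_mr *mr;

	mr = kmalloc(sizeof *mr, GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	/* The driver, not ib_uverbs, now decides when to pin and DMA map. */
	mr->umem = ib_umem_get(pd->uobject->context, start, length, acc_flags);
	if (IS_ERR(mr->umem)) {
		int err = PTR_ERR(mr->umem);

		kfree(mr);
		return ERR_PTR(err);
	}

	/* ... create the memory key and program the HCA from mr->umem ... */

	return &mr->ibmr;
}

static int mydrv_dereg_mr(struct ib_mr *ibmr)
{
	struct mydrv_mr *mr = container_of(ibmr, struct mydrv_mr, ibmr);

	/* ... destroy the HCA memory key ... */
	ib_umem_release(mr->umem);
	kfree(mr);
	return 0;
}

A driver such as the pending mlx4 can call ib_umem_get() the same way from its create_qp and create_cq methods, with no memory key involved at all.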
5 changes: 5 additions & 0 deletions drivers/infiniband/Kconfig
@@ -29,6 +29,11 @@ config INFINIBAND_USER_ACCESS
 	  libibverbs, libibcm and a hardware driver library from
 	  <http://www.openib.org>.
 
+config INFINIBAND_USER_MEM
+	bool
+	depends on INFINIBAND_USER_ACCESS != n
+	default y
+
 config INFINIBAND_ADDR_TRANS
 	bool
 	depends on INFINIBAND && INET
4 changes: 2 additions & 2 deletions drivers/infiniband/core/Makefile
@@ -9,6 +9,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
 
 ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \
 				device.o fmr_pool.o cache.o
+ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
 
 ib_mad-y :=			mad.o smi.o agent.o mad_rmpp.o
 
@@ -28,5 +29,4 @@ ib_umad-y := user_mad.o
 
 ib_ucm-y :=			ucm.o
 
-ib_uverbs-y :=			uverbs_main.o uverbs_cmd.o uverbs_mem.o \
-				uverbs_marshall.o
+ib_uverbs-y :=			uverbs_main.o uverbs_cmd.o uverbs_marshall.o
2 changes: 2 additions & 0 deletions drivers/infiniband/core/device.c
@@ -613,6 +613,8 @@ static void __exit ib_core_cleanup(void)
 {
 	ib_cache_cleanup();
 	ib_sysfs_cleanup();
+	/* Make sure that any pending umem accounting work is done. */
+	flush_scheduled_work();
 }
 
 module_init(ib_core_init);
drivers/infiniband/core/{uverbs_mem.c → umem.c}
@@ -64,35 +64,56 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
 	}
 }
 
-int ib_umem_get(struct ib_device *dev, struct ib_umem *mem,
-		void *addr, size_t size, int write)
+/**
+ * ib_umem_get - Pin and DMA map userspace memory.
+ * @context: userspace context to pin memory for
+ * @addr: userspace virtual address to start at
+ * @size: length of region to pin
+ * @access: IB_ACCESS_xxx flags for memory being pinned
+ */
+struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
+			    size_t size, int access)
 {
+	struct ib_umem *umem;
 	struct page **page_list;
 	struct ib_umem_chunk *chunk;
 	unsigned long locked;
 	unsigned long lock_limit;
 	unsigned long cur_base;
 	unsigned long npages;
-	int ret = 0;
+	int ret;
 	int off;
 	int i;
 
 	if (!can_do_mlock())
-		return -EPERM;
+		return ERR_PTR(-EPERM);
 
-	page_list = (struct page **) __get_free_page(GFP_KERNEL);
-	if (!page_list)
-		return -ENOMEM;
+	umem = kmalloc(sizeof *umem, GFP_KERNEL);
+	if (!umem)
+		return ERR_PTR(-ENOMEM);
 
-	mem->user_base = (unsigned long) addr;
-	mem->length    = size;
-	mem->offset    = (unsigned long) addr & ~PAGE_MASK;
-	mem->page_size = PAGE_SIZE;
-	mem->writable  = write;
+	umem->context   = context;
+	umem->length    = size;
+	umem->offset    = addr & ~PAGE_MASK;
+	umem->page_size = PAGE_SIZE;
+	/*
+	 * We ask for writable memory if any access flags other than
+	 * "remote read" are set.  "Local write" and "remote write"
+	 * obviously require write access.  "Remote atomic" can do
+	 * things like fetch and add, which will modify memory, and
+	 * "MW bind" can change permissions by binding a window.
+	 */
+	umem->writable  = !!(access & ~IB_ACCESS_REMOTE_READ);
 
-	INIT_LIST_HEAD(&mem->chunk_list);
+	INIT_LIST_HEAD(&umem->chunk_list);
+
+	page_list = (struct page **) __get_free_page(GFP_KERNEL);
+	if (!page_list) {
+		kfree(umem);
+		return ERR_PTR(-ENOMEM);
+	}
 
-	npages = PAGE_ALIGN(size + mem->offset) >> PAGE_SHIFT;
+	npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT;
 
 	down_write(&current->mm->mmap_sem);
 
@@ -104,13 +125,13 @@ int ib_umem_get(struct ib_device *dev, struct ib_umem *mem,
 		goto out;
 	}
 
-	cur_base = (unsigned long) addr & PAGE_MASK;
+	cur_base = addr & PAGE_MASK;
 
 	while (npages) {
 		ret = get_user_pages(current, current->mm, cur_base,
 				     min_t(int, npages,
 					   PAGE_SIZE / sizeof (struct page *)),
-				     1, !write, page_list, NULL);
+				     1, !umem->writable, page_list, NULL);
 
 		if (ret < 0)
 			goto out;
@@ -136,7 +157,7 @@ int ib_umem_get(struct ib_device *dev, struct ib_umem *mem,
 			chunk->page_list[i].length = PAGE_SIZE;
 		}
 
-		chunk->nmap = ib_dma_map_sg(dev,
+		chunk->nmap = ib_dma_map_sg(context->device,
 					    &chunk->page_list[0],
 					    chunk->nents,
 					    DMA_BIDIRECTIONAL);
@@ -151,33 +172,25 @@ int ib_umem_get(struct ib_device *dev, struct ib_umem *mem,
 
 			ret -= chunk->nents;
 			off += chunk->nents;
-			list_add_tail(&chunk->list, &mem->chunk_list);
+			list_add_tail(&chunk->list, &umem->chunk_list);
 		}
 
 		ret = 0;
 	}
 
 out:
-	if (ret < 0)
-		__ib_umem_release(dev, mem, 0);
-	else
+	if (ret < 0) {
+		__ib_umem_release(context->device, umem, 0);
+		kfree(umem);
+	} else
 		current->mm->locked_vm = locked;
 
 	up_write(&current->mm->mmap_sem);
 	free_page((unsigned long) page_list);
 
-	return ret;
-}
-
-void ib_umem_release(struct ib_device *dev, struct ib_umem *umem)
-{
-	__ib_umem_release(dev, umem, 1);
-
-	down_write(&current->mm->mmap_sem);
-	current->mm->locked_vm -=
-		PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;
-	up_write(&current->mm->mmap_sem);
+	return ret < 0 ? ERR_PTR(ret) : umem;
 }
+EXPORT_SYMBOL(ib_umem_get);
 
 static void ib_umem_account(struct work_struct *_work)
 {
@@ -191,35 +204,70 @@ static void ib_umem_account(struct work_struct *_work)
 	kfree(work);
 }
 
-void ib_umem_release_on_close(struct ib_device *dev, struct ib_umem *umem)
+/**
+ * ib_umem_release - release memory pinned with ib_umem_get
+ * @umem: umem struct to release
+ */
+void ib_umem_release(struct ib_umem *umem)
 {
 	struct ib_umem_account_work *work;
+	struct ib_ucontext *context = umem->context;
 	struct mm_struct *mm;
+	unsigned long diff;
 
-	__ib_umem_release(dev, umem, 1);
+	__ib_umem_release(umem->context->device, umem, 1);
 
 	mm = get_task_mm(current);
 	if (!mm)
 		return;
 
+	diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;
+	kfree(umem);
+
 	/*
 	 * We may be called with the mm's mmap_sem already held.  This
 	 * can happen when a userspace munmap() is the call that drops
 	 * the last reference to our file and calls our release
 	 * method.  If there are memory regions to destroy, we'll end
-	 * up here and not be able to take the mmap_sem.  Therefore we
-	 * defer the vm_locked accounting to the system workqueue.
+	 * up here and not be able to take the mmap_sem.  In that case
+	 * we defer the vm_locked accounting to the system workqueue.
 	 */
+	if (context->closing && !down_write_trylock(&mm->mmap_sem)) {
+		work = kmalloc(sizeof *work, GFP_KERNEL);
+		if (!work) {
+			mmput(mm);
+			return;
+		}
 
-	work = kmalloc(sizeof *work, GFP_KERNEL);
-	if (!work) {
-		mmput(mm);
-		return;
-	}
+		INIT_WORK(&work->work, ib_umem_account);
+		work->mm   = mm;
+		work->diff = diff;
 
-	INIT_WORK(&work->work, ib_umem_account);
-	work->mm   = mm;
-	work->diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;
+		schedule_work(&work->work);
+		return;
+	} else
+		down_write(&mm->mmap_sem);
 
-	schedule_work(&work->work);
+	current->mm->locked_vm -= diff;
+	up_write(&mm->mmap_sem);
+	mmput(mm);
 }
+EXPORT_SYMBOL(ib_umem_release);
+
+int ib_umem_page_count(struct ib_umem *umem)
+{
+	struct ib_umem_chunk *chunk;
+	int shift;
+	int i;
+	int n;
+
+	shift = ilog2(umem->page_size);
+
+	n = 0;
+	list_for_each_entry(chunk, &umem->chunk_list, list)
+		for (i = 0; i < chunk->nmap; ++i)
+			n += sg_dma_len(&chunk->page_list[i]) >> shift;
+
+	return n;
+}
+EXPORT_SYMBOL(ib_umem_page_count);
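The exported trio above is all a driver needs to turn pinned user memory into hardware translation entries: ib_umem_page_count() sizes the table, and the chunk list supplies the DMA addresses. A sketch of that consumer pattern, assuming a hypothetical mydrv_build_page_list() helper (it mirrors what mthca-style drivers do when writing their MTT entries):

#include <linux/log2.h>
#include <linux/slab.h>
#include <rdma/ib_umem.h>

/* Hypothetical consumer of a pinned umem -- not part of this commit. */
static int mydrv_build_page_list(struct ib_umem *umem, u64 **pages_ret,
				 int *npages_ret)
{
	struct ib_umem_chunk *chunk;
	u64 *pages;
	int shift = ilog2(umem->page_size);
	int n = 0;
	int i, j;

	pages = kmalloc(ib_umem_page_count(umem) * sizeof *pages, GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/* Walk every DMA-mapped scatterlist entry in every chunk. */
	list_for_each_entry(chunk, &umem->chunk_list, list)
		for (i = 0; i < chunk->nmap; ++i) {
			int len = sg_dma_len(&chunk->page_list[i]) >> shift;

			for (j = 0; j < len; ++j)
				pages[n++] = sg_dma_address(&chunk->page_list[i]) +
					     umem->page_size * j;
		}

	*pages_ret  = pages;
	*npages_ret = n;
	return 0;
}

The caller would hand pages[] to its hardware-specific table writer and kfree() it afterwards.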
6 changes: 1 addition & 5 deletions drivers/infiniband/core/uverbs.h
@@ -45,6 +45,7 @@
 #include <linux/completion.h>
 
 #include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
 #include <rdma/ib_user_verbs.h>
 
 /*
@@ -163,11 +164,6 @@ void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr);
 void ib_uverbs_event_handler(struct ib_event_handler *handler,
 			     struct ib_event *event);
 
-int ib_umem_get(struct ib_device *dev, struct ib_umem *mem,
-		void *addr, size_t size, int write);
-void ib_umem_release(struct ib_device *dev, struct ib_umem *umem);
-void ib_umem_release_on_close(struct ib_device *dev, struct ib_umem *umem);
-
 #define IB_UVERBS_DECLARE_CMD(name)					\
 	ssize_t ib_uverbs_##name(struct ib_uverbs_file *file,		\
 				 const char __user *buf, int in_len,	\
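For driver authors, the net effect of the changes shown above is a small public API. The new <rdma/ib_umem.h> header itself is among the files truncated from this listing, so treat the following as a sketch inferred from the umem.c code shown earlier, not the verbatim header:

#include <linux/list.h>
#include <linux/scatterlist.h>

/* Sketch of the driver-visible umem API, inferred from umem.c above. */
struct ib_umem {
	struct ib_ucontext *context;
	size_t		    length;
	int		    offset;
	int		    page_size;
	int		    writable;
	struct list_head    chunk_list;
};

struct ib_umem_chunk {
	struct list_head   list;
	int		   nents;	/* scatterlist entries in page_list */
	int		   nmap;	/* entries actually DMA mapped */
	struct scatterlist page_list[0];
};

struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
			    size_t size, int access);
void ib_umem_release(struct ib_umem *umem);
int ib_umem_page_count(struct ib_umem *umem);

Since these now live in ib_core rather than ib_uverbs, a driver module that pins memory pulls in only the core module, which is exactly the dependency change the commit message describes.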
