Skip to content

Commit

Permalink
RDMA/cxgb3: Fix severe limit on userspace memory registration size
Browse files Browse the repository at this point in the history
Currently, iw_cxgb3 is severely limited on the amount of userspace
memory that can be registered in in a single memory region, which
causes big problems for applications that expect to be able to
register 100s of MB.

The problem is that the driver uses a single kmalloc()ed buffer to
hold the physical buffer list (PBL) for the entire memory region
during registration, which means that 8 bytes of contiguous memory are
required for each page of memory being registered.  For example, a 64
MB registration will require 128 KB of contiguous memory with 4 KB
pages, and it unlikely that such an allocation will succeed on a busy
system.

This is purely a driver problem: the temporary page list buffer is not
needed by the hardware, so we can fix this by writing the PBL to the
hardware in page-sized chunks rather than all at once.  We do this by
splitting the memory registration operation up into several steps:

 - Allocate PBL space in adapter memory for the full registration
 - Copy PBL to adapter memory in chunks
 - Allocate STag and enable memory region

This also allows several other cleanups to the __cxio_tpt_op()
interface and related parts of the driver.

This change leaves the reregister memory region and memory window
operations broken, but they already didn't work due to other
longstanding bugs, so fixing them will be left to a later patch.

Signed-off-by: Roland Dreier <rolandd@cisco.com>
  • Loading branch information
Roland Dreier committed May 6, 2008
1 parent 0e99133 commit 273748c
Show file tree
Hide file tree
Showing 5 changed files with 155 additions and 94 deletions.
90 changes: 45 additions & 45 deletions drivers/infiniband/hw/cxgb3/cxio_hal.c
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,7 @@ static int cxio_hal_destroy_ctrl_qp(struct cxio_rdev *rdev_p)
* caller aquires the ctrl_qp lock before the call
*/
static int cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr,
u32 len, void *data, int completion)
u32 len, void *data)
{
u32 i, nr_wqe, copy_len;
u8 *copy_data;
Expand Down Expand Up @@ -624,7 +624,7 @@ static int cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr,
flag = 0;
if (i == (nr_wqe - 1)) {
/* last WQE */
flag = completion ? T3_COMPLETION_FLAG : 0;
flag = T3_COMPLETION_FLAG;
if (len % 32)
utx_len = len / 32 + 1;
else
Expand Down Expand Up @@ -683,21 +683,20 @@ static int cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr,
return 0;
}

/* IN: stag key, pdid, perm, zbva, to, len, page_size, pbl, and pbl_size
* OUT: stag index, actual pbl_size, pbl_addr allocated.
/* IN: stag key, pdid, perm, zbva, to, len, page_size, pbl_size and pbl_addr
* OUT: stag index
* TBD: shared memory region support
*/
static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry,
u32 *stag, u8 stag_state, u32 pdid,
enum tpt_mem_type type, enum tpt_mem_perm perm,
u32 zbva, u64 to, u32 len, u8 page_size, __be64 *pbl,
u32 *pbl_size, u32 *pbl_addr)
u32 zbva, u64 to, u32 len, u8 page_size,
u32 pbl_size, u32 pbl_addr)
{
int err;
struct tpt_entry tpt;
u32 stag_idx;
u32 wptr;
int rereg = (*stag != T3_STAG_UNSET);

stag_state = stag_state > 0;
stag_idx = (*stag) >> 8;
Expand All @@ -711,30 +710,8 @@ static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry,
PDBG("%s stag_state 0x%0x type 0x%0x pdid 0x%0x, stag_idx 0x%x\n",
__func__, stag_state, type, pdid, stag_idx);

if (reset_tpt_entry)
cxio_hal_pblpool_free(rdev_p, *pbl_addr, *pbl_size << 3);
else if (!rereg) {
*pbl_addr = cxio_hal_pblpool_alloc(rdev_p, *pbl_size << 3);
if (!*pbl_addr) {
return -ENOMEM;
}
}

mutex_lock(&rdev_p->ctrl_qp.lock);

/* write PBL first if any - update pbl only if pbl list exist */
if (pbl) {

PDBG("%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n",
__func__, *pbl_addr, rdev_p->rnic_info.pbl_base,
*pbl_size);
err = cxio_hal_ctrl_qp_write_mem(rdev_p,
(*pbl_addr >> 5),
(*pbl_size << 3), pbl, 0);
if (err)
goto ret;
}

/* write TPT entry */
if (reset_tpt_entry)
memset(&tpt, 0, sizeof(tpt));
Expand All @@ -749,23 +726,23 @@ static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry,
V_TPT_ADDR_TYPE((zbva ? TPT_ZBTO : TPT_VATO)) |
V_TPT_PAGE_SIZE(page_size));
tpt.rsvd_pbl_addr = reset_tpt_entry ? 0 :
cpu_to_be32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, *pbl_addr)>>3));
cpu_to_be32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, pbl_addr)>>3));
tpt.len = cpu_to_be32(len);
tpt.va_hi = cpu_to_be32((u32) (to >> 32));
tpt.va_low_or_fbo = cpu_to_be32((u32) (to & 0xFFFFFFFFULL));
tpt.rsvd_bind_cnt_or_pstag = 0;
tpt.rsvd_pbl_size = reset_tpt_entry ? 0 :
cpu_to_be32(V_TPT_PBL_SIZE((*pbl_size) >> 2));
cpu_to_be32(V_TPT_PBL_SIZE(pbl_size >> 2));
}
err = cxio_hal_ctrl_qp_write_mem(rdev_p,
stag_idx +
(rdev_p->rnic_info.tpt_base >> 5),
sizeof(tpt), &tpt, 1);
sizeof(tpt), &tpt);

/* release the stag index to free pool */
if (reset_tpt_entry)
cxio_hal_put_stag(rdev_p->rscp, stag_idx);
ret:

wptr = rdev_p->ctrl_qp.wptr;
mutex_unlock(&rdev_p->ctrl_qp.lock);
if (!err)
Expand All @@ -776,44 +753,67 @@ static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry,
return err;
}

int cxio_write_pbl(struct cxio_rdev *rdev_p, __be64 *pbl,
u32 pbl_addr, u32 pbl_size)
{
u32 wptr;
int err;

PDBG("%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n",
__func__, pbl_addr, rdev_p->rnic_info.pbl_base,
pbl_size);

mutex_lock(&rdev_p->ctrl_qp.lock);
err = cxio_hal_ctrl_qp_write_mem(rdev_p, pbl_addr >> 5, pbl_size << 3,
pbl);
wptr = rdev_p->ctrl_qp.wptr;
mutex_unlock(&rdev_p->ctrl_qp.lock);
if (err)
return err;

if (wait_event_interruptible(rdev_p->ctrl_qp.waitq,
SEQ32_GE(rdev_p->ctrl_qp.rptr,
wptr)))
return -ERESTARTSYS;

return 0;
}

int cxio_register_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid,
enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len,
u8 page_size, __be64 *pbl, u32 *pbl_size,
u32 *pbl_addr)
u8 page_size, u32 pbl_size, u32 pbl_addr)
{
*stag = T3_STAG_UNSET;
return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm,
zbva, to, len, page_size, pbl, pbl_size, pbl_addr);
zbva, to, len, page_size, pbl_size, pbl_addr);
}

int cxio_reregister_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid,
enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len,
u8 page_size, __be64 *pbl, u32 *pbl_size,
u32 *pbl_addr)
u8 page_size, u32 pbl_size, u32 pbl_addr)
{
return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm,
zbva, to, len, page_size, pbl, pbl_size, pbl_addr);
zbva, to, len, page_size, pbl_size, pbl_addr);
}

int cxio_dereg_mem(struct cxio_rdev *rdev_p, u32 stag, u32 pbl_size,
u32 pbl_addr)
{
return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, NULL,
&pbl_size, &pbl_addr);
return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0,
pbl_size, pbl_addr);
}

int cxio_allocate_window(struct cxio_rdev *rdev_p, u32 * stag, u32 pdid)
{
u32 pbl_size = 0;
*stag = T3_STAG_UNSET;
return __cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_MW, 0, 0, 0ULL, 0, 0,
NULL, &pbl_size, NULL);
0, 0);
}

int cxio_deallocate_window(struct cxio_rdev *rdev_p, u32 stag)
{
return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, NULL,
NULL, NULL);
return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0,
0, 0);
}

int cxio_rdma_init(struct cxio_rdev *rdev_p, struct t3_rdma_init_attr *attr)
Expand Down
8 changes: 4 additions & 4 deletions drivers/infiniband/hw/cxgb3/cxio_hal.h
Original file line number Diff line number Diff line change
Expand Up @@ -154,14 +154,14 @@ int cxio_create_qp(struct cxio_rdev *rdev, u32 kernel_domain, struct t3_wq *wq,
int cxio_destroy_qp(struct cxio_rdev *rdev, struct t3_wq *wq,
struct cxio_ucontext *uctx);
int cxio_peek_cq(struct t3_wq *wr, struct t3_cq *cq, int opcode);
int cxio_write_pbl(struct cxio_rdev *rdev_p, __be64 *pbl,
u32 pbl_addr, u32 pbl_size);
int cxio_register_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid,
enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len,
u8 page_size, __be64 *pbl, u32 *pbl_size,
u32 *pbl_addr);
u8 page_size, u32 pbl_size, u32 pbl_addr);
int cxio_reregister_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid,
enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len,
u8 page_size, __be64 *pbl, u32 *pbl_size,
u32 *pbl_addr);
u8 page_size, u32 pbl_size, u32 pbl_addr);
int cxio_dereg_mem(struct cxio_rdev *rdev, u32 stag, u32 pbl_size,
u32 pbl_addr);
int cxio_allocate_window(struct cxio_rdev *rdev, u32 * stag, u32 pdid);
Expand Down
75 changes: 49 additions & 26 deletions drivers/infiniband/hw/cxgb3/iwch_mem.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,46 +35,48 @@
#include <rdma/ib_verbs.h>

#include "cxio_hal.h"
#include "cxio_resource.h"
#include "iwch.h"
#include "iwch_provider.h"

int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php,
struct iwch_mr *mhp,
int shift,
__be64 *page_list)
static void iwch_finish_mem_reg(struct iwch_mr *mhp, u32 stag)
{
u32 stag;
u32 mmid;

mhp->attr.state = 1;
mhp->attr.stag = stag;
mmid = stag >> 8;
mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
insert_handle(mhp->rhp, &mhp->rhp->mmidr, mhp, mmid);
PDBG("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp);
}

int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php,
struct iwch_mr *mhp, int shift)
{
u32 stag;

if (cxio_register_phys_mem(&rhp->rdev,
&stag, mhp->attr.pdid,
mhp->attr.perms,
mhp->attr.zbva,
mhp->attr.va_fbo,
mhp->attr.len,
shift-12,
page_list,
&mhp->attr.pbl_size, &mhp->attr.pbl_addr))
shift - 12,
mhp->attr.pbl_size, mhp->attr.pbl_addr))
return -ENOMEM;
mhp->attr.state = 1;
mhp->attr.stag = stag;
mmid = stag >> 8;
mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
insert_handle(rhp, &rhp->mmidr, mhp, mmid);
PDBG("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp);

iwch_finish_mem_reg(mhp, stag);

return 0;
}

int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php,
struct iwch_mr *mhp,
int shift,
__be64 *page_list,
int npages)
{
u32 stag;
u32 mmid;


/* We could support this... */
if (npages > mhp->attr.pbl_size)
Expand All @@ -87,19 +89,40 @@ int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php,
mhp->attr.zbva,
mhp->attr.va_fbo,
mhp->attr.len,
shift-12,
page_list,
&mhp->attr.pbl_size, &mhp->attr.pbl_addr))
shift - 12,
mhp->attr.pbl_size, mhp->attr.pbl_addr))
return -ENOMEM;
mhp->attr.state = 1;
mhp->attr.stag = stag;
mmid = stag >> 8;
mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
insert_handle(rhp, &rhp->mmidr, mhp, mmid);
PDBG("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp);

iwch_finish_mem_reg(mhp, stag);

return 0;
}

int iwch_alloc_pbl(struct iwch_mr *mhp, int npages)
{
mhp->attr.pbl_addr = cxio_hal_pblpool_alloc(&mhp->rhp->rdev,
npages << 3);

if (!mhp->attr.pbl_addr)
return -ENOMEM;

mhp->attr.pbl_size = npages;

return 0;
}

void iwch_free_pbl(struct iwch_mr *mhp)
{
cxio_hal_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
mhp->attr.pbl_size << 3);
}

int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset)
{
return cxio_write_pbl(&mhp->rhp->rdev, pages,
mhp->attr.pbl_addr + (offset << 3), npages);
}

int build_phys_page_list(struct ib_phys_buf *buffer_list,
int num_phys_buf,
u64 *iova_start,
Expand Down
Loading

0 comments on commit 273748c

Please sign in to comment.