Skip to content

Commit

Permalink
RDMA/erdma: Implement hierarchical MTT
Browse files Browse the repository at this point in the history
Hierarchical MTT allows large MR registration without the need for
contiguous physical addresses. This commit adds hierarchical MTT
support to erdma.

Signed-off-by: Cheng Xu <chengyou@linux.alibaba.com>
Link: https://lore.kernel.org/r/20230817102151.75964-4-chengyou@linux.alibaba.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
  • Loading branch information
Cheng Xu authored and Leon Romanovsky committed Aug 19, 2023
1 parent 7244b4a commit ed10435
Show file tree
Hide file tree
Showing 3 changed files with 194 additions and 24 deletions.
14 changes: 12 additions & 2 deletions drivers/infiniband/hw/erdma/erdma_hw.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,7 @@ struct erdma_cmdq_create_cq_req {

/* regmr/deregmr cfg0 */
#define ERDMA_CMD_MR_VALID_MASK BIT(31)
#define ERDMA_CMD_MR_VERSION_MASK GENMASK(30, 28)
#define ERDMA_CMD_MR_KEY_MASK GENMASK(27, 20)
#define ERDMA_CMD_MR_MPT_IDX_MASK GENMASK(19, 0)

Expand All @@ -258,6 +259,7 @@ struct erdma_cmdq_create_cq_req {

/* regmr cfg2 */
#define ERDMA_CMD_REGMR_PAGESIZE_MASK GENMASK(31, 27)
#define ERDMA_CMD_REGMR_MTT_PAGESIZE_MASK GENMASK(26, 24)
#define ERDMA_CMD_REGMR_MTT_LEVEL_MASK GENMASK(21, 20)
#define ERDMA_CMD_REGMR_MTT_CNT_MASK GENMASK(19, 0)

Expand All @@ -268,7 +270,14 @@ struct erdma_cmdq_reg_mr_req {
u64 start_va;
u32 size;
u32 cfg2;
u64 phy_addr[4];
union {
u64 phy_addr[4];
struct {
u64 rsvd;
u32 size_h;
u32 mtt_cnt_h;
};
};
};

struct erdma_cmdq_dereg_mr_req {
Expand Down Expand Up @@ -309,7 +318,7 @@ struct erdma_cmdq_modify_qp_req {
/* create qp mtt_cfg */
#define ERDMA_CMD_CREATE_QP_PAGE_OFFSET_MASK GENMASK(31, 12)
#define ERDMA_CMD_CREATE_QP_MTT_CNT_MASK GENMASK(11, 1)
#define ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK BIT(0)
#define ERDMA_CMD_CREATE_QP_MTT_LEVEL_MASK BIT(0)

/* create qp db cfg */
#define ERDMA_CMD_CREATE_QP_SQDB_CFG_MASK GENMASK(31, 16)
Expand Down Expand Up @@ -364,6 +373,7 @@ struct erdma_cmdq_reflush_req {

enum {
ERDMA_DEV_CAP_FLAGS_ATOMIC = 1 << 7,
ERDMA_DEV_CAP_FLAGS_MTT_VA = 1 << 5,
ERDMA_DEV_CAP_FLAGS_EXTEND_DB = 1 << 3,
};

Expand Down
200 changes: 180 additions & 20 deletions drivers/infiniband/hw/erdma/erdma_verbs.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,13 @@ static void assemble_qbuf_mtt_for_cmd(struct erdma_mem *mem, u32 *cfg,

if (mem->mtt_nents > ERDMA_MAX_INLINE_MTT_ENTRIES) {
*addr0 = mtt->buf_dma;
*cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK,
ERDMA_MR_INDIRECT_MTT);
*cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_LEVEL_MASK,
ERDMA_MR_MTT_1LEVEL);
} else {
*addr0 = mtt->buf[0];
memcpy(addr1, mtt->buf + 1, MTT_SIZE(mem->mtt_nents - 1));
*cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK,
ERDMA_MR_INLINE_MTT);
*cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_LEVEL_MASK,
ERDMA_MR_MTT_0LEVEL);
}
}

Expand Down Expand Up @@ -70,8 +70,8 @@ static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp)
req.sq_mtt_cfg =
FIELD_PREP(ERDMA_CMD_CREATE_QP_PAGE_OFFSET_MASK, 0) |
FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_CNT_MASK, 1) |
FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK,
ERDMA_MR_INLINE_MTT);
FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_LEVEL_MASK,
ERDMA_MR_MTT_0LEVEL);
req.rq_mtt_cfg = req.sq_mtt_cfg;

req.rq_buf_addr = qp->kern_qp.rq_buf_dma_addr;
Expand Down Expand Up @@ -140,12 +140,17 @@ static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr)

if (mr->type == ERDMA_MR_TYPE_FRMR ||
mr->mem.page_cnt > ERDMA_MAX_INLINE_MTT_ENTRIES) {
req.phy_addr[0] = mr->mem.mtt->buf_dma;
mtt_level = ERDMA_MR_INDIRECT_MTT;
if (mr->mem.mtt->continuous) {
req.phy_addr[0] = mr->mem.mtt->buf_dma;
mtt_level = ERDMA_MR_MTT_1LEVEL;
} else {
req.phy_addr[0] = sg_dma_address(mr->mem.mtt->sglist);
mtt_level = mr->mem.mtt->level;
}
} else {
memcpy(req.phy_addr, mr->mem.mtt->buf,
MTT_SIZE(mr->mem.page_cnt));
mtt_level = ERDMA_MR_INLINE_MTT;
mtt_level = ERDMA_MR_MTT_0LEVEL;
}

req.cfg0 = FIELD_PREP(ERDMA_CMD_MR_VALID_MASK, mr->valid) |
Expand All @@ -167,6 +172,14 @@ static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr)
req.size = mr->mem.len;
}

if (!mr->mem.mtt->continuous && mr->mem.mtt->level > 1) {
req.cfg0 |= FIELD_PREP(ERDMA_CMD_MR_VERSION_MASK, 1);
req.cfg2 |= FIELD_PREP(ERDMA_CMD_REGMR_MTT_PAGESIZE_MASK,
PAGE_SHIFT - ERDMA_HW_PAGE_SHIFT);
req.size_h = upper_32_bits(mr->mem.len);
req.mtt_cnt_h = mr->mem.page_cnt >> 20;
}

post_cmd:
return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL);
}
Expand Down Expand Up @@ -194,7 +207,7 @@ static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq)

req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK, 1) |
FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_LEVEL_MASK,
ERDMA_MR_INLINE_MTT);
ERDMA_MR_MTT_0LEVEL);

req.first_page_offset = 0;
req.cq_db_info_addr =
Expand All @@ -209,13 +222,13 @@ static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq)
req.qbuf_addr_h = upper_32_bits(mem->mtt->buf[0]);
req.cfg1 |=
FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_LEVEL_MASK,
ERDMA_MR_INLINE_MTT);
ERDMA_MR_MTT_0LEVEL);
} else {
req.qbuf_addr_l = lower_32_bits(mem->mtt->buf_dma);
req.qbuf_addr_h = upper_32_bits(mem->mtt->buf_dma);
req.cfg1 |=
FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_LEVEL_MASK,
ERDMA_MR_INDIRECT_MTT);
ERDMA_MR_MTT_1LEVEL);
}
req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK,
mem->mtt_nents);
Expand Down Expand Up @@ -543,7 +556,6 @@ static struct erdma_mtt *erdma_create_cont_mtt(struct erdma_dev *dev,
size_t size)
{
struct erdma_mtt *mtt;
int ret = -ENOMEM;

mtt = kzalloc(sizeof(*mtt), GFP_KERNEL);
if (!mtt)
Expand All @@ -565,6 +577,104 @@ static struct erdma_mtt *erdma_create_cont_mtt(struct erdma_dev *dev,
err_free_mtt_buf:
kfree(mtt->buf);

err_free_mtt:
kfree(mtt);

return ERR_PTR(-ENOMEM);
}

static void erdma_destroy_mtt_buf_sg(struct erdma_dev *dev,
struct erdma_mtt *mtt)
{
dma_unmap_sg(&dev->pdev->dev, mtt->sglist, mtt->nsg, DMA_TO_DEVICE);
vfree(mtt->sglist);
}

/*
 * Tear down one scatter (non-continuous) MTT level: unmap and free its
 * scatterlist, then free the page-table buffer and the descriptor.
 */
static void erdma_destroy_scatter_mtt(struct erdma_dev *dev,
				      struct erdma_mtt *mtt)
{
	/* Unmap first: the sglist references pages of mtt->buf. */
	erdma_destroy_mtt_buf_sg(dev, mtt);
	vfree(mtt->buf);
	kfree(mtt);
}

/*
 * Fill @mtt's buffer with the DMA addresses of @low_mtt's mapped
 * segments, making @mtt the next (upper) level of the MTT hierarchy.
 */
static void erdma_init_middle_mtt(struct erdma_mtt *mtt,
				  struct erdma_mtt *low_mtt)
{
	struct scatterlist *sg;
	u32 idx = 0, i;

	/* One upper-level entry per DMA segment of the lower level. */
	for_each_sg(low_mtt->sglist, sg, low_mtt->nsg, i)
		mtt->buf[idx++] = sg_dma_address(sg);
}

/*
 * Build and DMA-map a scatterlist describing the pages backing
 * @mtt->buf (a page-aligned vmalloc'ed buffer).
 *
 * On success, mtt->sglist and mtt->nsg (number of mapped DMA segments)
 * are set.  Returns 0 on success, -EINVAL if the buffer is not page
 * aligned, -ENOMEM on allocation or mapping failure.
 */
static int erdma_create_mtt_buf_sg(struct erdma_dev *dev, struct erdma_mtt *mtt)
{
	struct scatterlist *sglist;
	void *buf = mtt->buf;
	u32 npages, i, nsg;
	struct page *pg;

	/* Failed if buf is not page aligned */
	if ((uintptr_t)buf & ~PAGE_MASK)
		return -EINVAL;

	npages = DIV_ROUND_UP(mtt->size, PAGE_SIZE);
	sglist = vzalloc(npages * sizeof(*sglist));
	if (!sglist)
		return -ENOMEM;

	sg_init_table(sglist, npages);
	for (i = 0; i < npages; i++) {
		/* buf is vmalloc'ed: translate it page by page. */
		pg = vmalloc_to_page(buf);
		if (!pg)
			goto err;
		sg_set_page(&sglist[i], pg, PAGE_SIZE, 0);
		buf += PAGE_SIZE;
	}

	/* dma_map_sg() may coalesce entries; 0 means the mapping failed. */
	nsg = dma_map_sg(&dev->pdev->dev, sglist, npages, DMA_TO_DEVICE);
	if (!nsg)
		goto err;

	mtt->sglist = sglist;
	mtt->nsg = nsg;

	return 0;
err:
	vfree(sglist);

	return -ENOMEM;
}

static struct erdma_mtt *erdma_create_scatter_mtt(struct erdma_dev *dev,
size_t size)
{
struct erdma_mtt *mtt;
int ret = -ENOMEM;

mtt = kzalloc(sizeof(*mtt), GFP_KERNEL);
if (!mtt)
return NULL;

mtt->size = ALIGN(size, PAGE_SIZE);
mtt->buf = vzalloc(mtt->size);
mtt->continuous = false;
if (!mtt->buf)
goto err_free_mtt;

ret = erdma_create_mtt_buf_sg(dev, mtt);
if (ret)
goto err_free_mtt_buf;

ibdev_dbg(&dev->ibdev, "create scatter mtt, size:%lu, nsg:%u\n",
mtt->size, mtt->nsg);

return mtt;

err_free_mtt_buf:
vfree(mtt->buf);

err_free_mtt:
kfree(mtt);

Expand All @@ -574,28 +684,77 @@ static struct erdma_mtt *erdma_create_cont_mtt(struct erdma_dev *dev,
/*
 * Create an MTT holding @size bytes of page-table entries.
 *
 * If @force_continuous is set (or the device lacks the MTT_VA
 * capability), a single physically continuous buffer is used.
 * Otherwise a hierarchy of scatter MTTs is built: each upper level
 * indexes the DMA segments of the level below, until the top level
 * fits in a single DMA segment.  At most three levels are supported.
 */
static struct erdma_mtt *erdma_create_mtt(struct erdma_dev *dev, size_t size,
					  bool force_continuous)
{
	struct erdma_mtt *mtt, *tmp_mtt;
	int ret, level = 0;

	ibdev_dbg(&dev->ibdev, "create_mtt, size:%lu, force cont:%d\n", size,
		  force_continuous);

	/* Hierarchical (VA-based) MTT needs device support. */
	if (!(dev->attrs.cap_flags & ERDMA_DEV_CAP_FLAGS_MTT_VA))
		force_continuous = true;

	if (force_continuous)
		return erdma_create_cont_mtt(dev, size);

	mtt = erdma_create_scatter_mtt(dev, size);
	if (IS_ERR(mtt))
		return mtt;
	level = 1;

	/* Converge the MTT: stack levels until the top is one segment. */
	while (mtt->nsg != 1 && level <= 3) {
		tmp_mtt = erdma_create_scatter_mtt(dev, MTT_SIZE(mtt->nsg));
		if (IS_ERR(tmp_mtt)) {
			ret = PTR_ERR(tmp_mtt);
			goto err_free_mtt;
		}
		erdma_init_middle_mtt(tmp_mtt, mtt);
		tmp_mtt->low_level = mtt;
		mtt = tmp_mtt;
		level++;
	}

	/* Could not converge within the supported number of levels. */
	if (level > 3) {
		ret = -ENOMEM;
		goto err_free_mtt;
	}

	mtt->level = level;
	ibdev_dbg(&dev->ibdev, "top mtt: level:%d, dma_addr 0x%llx\n",
		  mtt->level, mtt->sglist[0].dma_address);

	return mtt;
err_free_mtt:
	/* Unwind the partially built hierarchy, top level first. */
	while (mtt) {
		tmp_mtt = mtt->low_level;
		erdma_destroy_scatter_mtt(dev, mtt);
		mtt = tmp_mtt;
	}

	return ERR_PTR(ret);
}

/*
 * Free an MTT created by erdma_create_mtt(): either a single
 * continuous buffer or a multi-level scatter hierarchy.
 */
static void erdma_destroy_mtt(struct erdma_dev *dev, struct erdma_mtt *mtt)
{
	struct erdma_mtt *tmp_mtt;

	if (mtt->continuous) {
		dma_unmap_single(&dev->pdev->dev, mtt->buf_dma, mtt->size,
				 DMA_TO_DEVICE);
		kfree(mtt->buf);
		kfree(mtt);
	} else {
		/* Free every level, walking the low_level links down. */
		while (mtt) {
			tmp_mtt = mtt->low_level;
			erdma_destroy_scatter_mtt(dev, mtt);
			mtt = tmp_mtt;
		}
	}
}

static int get_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem,
u64 start, u64 len, int access, u64 virt,
unsigned long req_page_size, u8 force_indirect_mtt)
unsigned long req_page_size, bool force_continuous)
{
int ret = 0;

Expand All @@ -612,7 +771,8 @@ static int get_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem,
mem->page_offset = start & (mem->page_size - 1);
mem->mtt_nents = ib_umem_num_dma_blocks(mem->umem, mem->page_size);
mem->page_cnt = mem->mtt_nents;
mem->mtt = erdma_create_mtt(dev, MTT_SIZE(mem->page_cnt), true);
mem->mtt = erdma_create_mtt(dev, MTT_SIZE(mem->page_cnt),
force_continuous);
if (IS_ERR(mem->mtt)) {
ret = PTR_ERR(mem->mtt);
goto error_ret;
Expand Down Expand Up @@ -717,7 +877,7 @@ static int init_user_qp(struct erdma_qp *qp, struct erdma_ucontext *uctx,

ret = get_mtt_entries(qp->dev, &qp->user_qp.sq_mem, va,
qp->attrs.sq_size << SQEBB_SHIFT, 0, va,
(SZ_1M - SZ_4K), 1);
(SZ_1M - SZ_4K), true);
if (ret)
return ret;

Expand All @@ -726,7 +886,7 @@ static int init_user_qp(struct erdma_qp *qp, struct erdma_ucontext *uctx,

ret = get_mtt_entries(qp->dev, &qp->user_qp.rq_mem, va + rq_offset,
qp->attrs.rq_size << RQE_SHIFT, 0, va + rq_offset,
(SZ_1M - SZ_4K), 1);
(SZ_1M - SZ_4K), true);
if (ret)
goto put_sq_mtt;

Expand Down Expand Up @@ -998,7 +1158,7 @@ struct ib_mr *erdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
return ERR_PTR(-ENOMEM);

ret = get_mtt_entries(dev, &mr->mem, start, len, access, virt,
SZ_2G - SZ_4K, 0);
SZ_2G - SZ_4K, false);
if (ret)
goto err_out_free;

Expand Down Expand Up @@ -1423,7 +1583,7 @@ static int erdma_init_user_cq(struct erdma_ucontext *ctx, struct erdma_cq *cq,

ret = get_mtt_entries(dev, &cq->user_cq.qbuf_mem, ureq->qbuf_va,
ureq->qbuf_len, 0, ureq->qbuf_va, SZ_64M - SZ_4K,
1);
true);
if (ret)
return ret;

Expand Down
4 changes: 2 additions & 2 deletions drivers/infiniband/hw/erdma/erdma_verbs.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ struct erdma_pd {
#define ERDMA_MR_TYPE_FRMR 1
#define ERDMA_MR_TYPE_DMA 2

#define ERDMA_MR_INLINE_MTT 0
#define ERDMA_MR_INDIRECT_MTT 1
#define ERDMA_MR_MTT_0LEVEL 0
#define ERDMA_MR_MTT_1LEVEL 1

#define ERDMA_MR_ACC_RA BIT(0)
#define ERDMA_MR_ACC_LR BIT(1)
Expand Down

0 comments on commit ed10435

Please sign in to comment.