RDMA/erdma: Refactor the storage structure of MTT entries
Currently our MTT supports only inline MTT entries (0-level MTT) and
indirect MTT entries (1-level MTT), which limits the maximum length of
MRs. In order to implement a multi-level MTT, we first refactor the
MTT storage structure.

Signed-off-by: Cheng Xu <chengyou@linux.alibaba.com>
Link: https://lore.kernel.org/r/20230817102151.75964-3-chengyou@linux.alibaba.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
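
For context: the patch replaces the flat mtt_buf/mtt_type/mtt_entry[] fields of struct erdma_mem with a dedicated struct erdma_mtt object that can later be chained into multiple levels through its low_level pointer. A minimal sketch of the idea, using the field names from the diff below (the helper itself is illustrative, not part of the patch):

/*
 * Illustrative sketch only: page addresses always live in the
 * bottom-level erdma_mtt, so walking the ->low_level chain reaches
 * them regardless of how many levels sit above. This mirrors the
 * loop in erdma_fill_bottom_mtt() in the diff below.
 */
static struct erdma_mtt *erdma_mtt_bottom_level(struct erdma_mtt *mtt)
{
	while (mtt->low_level)
		mtt = mtt->low_level;
	return mtt;
}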
Cheng Xu authored and Leon Romanovsky committed Aug 19, 2023
1 parent d7cfbba commit 7244b4a
Showing 4 changed files with 152 additions and 94 deletions.
4 changes: 2 additions & 2 deletions drivers/infiniband/hw/erdma/erdma_hw.h
@@ -228,7 +228,7 @@ struct erdma_cmdq_ext_db_req {

/* create_cq cfg1 */
#define ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK GENMASK(31, 16)
#define ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK BIT(15)
#define ERDMA_CMD_CREATE_CQ_MTT_LEVEL_MASK BIT(15)
#define ERDMA_CMD_CREATE_CQ_MTT_DB_CFG_MASK BIT(11)
#define ERDMA_CMD_CREATE_CQ_EQN_MASK GENMASK(9, 0)

@@ -258,7 +258,7 @@ struct erdma_cmdq_create_cq_req {

/* regmr cfg2 */
#define ERDMA_CMD_REGMR_PAGESIZE_MASK GENMASK(31, 27)
#define ERDMA_CMD_REGMR_MTT_TYPE_MASK GENMASK(21, 20)
#define ERDMA_CMD_REGMR_MTT_LEVEL_MASK GENMASK(21, 20)
#define ERDMA_CMD_REGMR_MTT_CNT_MASK GENMASK(19, 0)

struct erdma_cmdq_reg_mr_req {
2 changes: 1 addition & 1 deletion drivers/infiniband/hw/erdma/erdma_qp.c
@@ -410,7 +410,7 @@ static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi,
/* Copy SGLs to SQE content to accelerate */
memcpy(get_queue_entry(qp->kern_qp.sq_buf, idx + 1,
qp->attrs.sq_size, SQEBB_SHIFT),
mr->mem.mtt_buf, MTT_SIZE(mr->mem.mtt_nents));
mr->mem.mtt->buf, MTT_SIZE(mr->mem.mtt_nents));
wqe_size = sizeof(struct erdma_reg_mr_sqe) +
MTT_SIZE(mr->mem.mtt_nents);
} else {
214 changes: 128 additions & 86 deletions drivers/infiniband/hw/erdma/erdma_verbs.c
@@ -19,6 +19,23 @@
#include "erdma_cm.h"
#include "erdma_verbs.h"

static void assemble_qbuf_mtt_for_cmd(struct erdma_mem *mem, u32 *cfg,
u64 *addr0, u64 *addr1)
{
struct erdma_mtt *mtt = mem->mtt;

if (mem->mtt_nents > ERDMA_MAX_INLINE_MTT_ENTRIES) {
*addr0 = mtt->buf_dma;
*cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK,
ERDMA_MR_INDIRECT_MTT);
} else {
*addr0 = mtt->buf[0];
memcpy(addr1, mtt->buf + 1, MTT_SIZE(mem->mtt_nents - 1));
*cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK,
ERDMA_MR_INLINE_MTT);
}
}

static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp)
{
struct erdma_dev *dev = to_edev(qp->ibqp.device);
@@ -79,18 +96,16 @@ static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp)

req.sq_mtt_cfg = user_qp->sq_mem.page_offset;
req.sq_mtt_cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_CNT_MASK,
user_qp->sq_mem.mtt_nents) |
FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK,
user_qp->sq_mem.mtt_type);
user_qp->sq_mem.mtt_nents);

req.rq_mtt_cfg = user_qp->rq_mem.page_offset;
req.rq_mtt_cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_CNT_MASK,
user_qp->rq_mem.mtt_nents) |
FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK,
user_qp->rq_mem.mtt_type);
user_qp->rq_mem.mtt_nents);

req.sq_buf_addr = user_qp->sq_mem.mtt_entry[0];
req.rq_buf_addr = user_qp->rq_mem.mtt_entry[0];
assemble_qbuf_mtt_for_cmd(&user_qp->sq_mem, &req.sq_mtt_cfg,
&req.sq_buf_addr, req.sq_mtt_entry);
assemble_qbuf_mtt_for_cmd(&user_qp->rq_mem, &req.rq_mtt_cfg,
&req.rq_buf_addr, req.rq_mtt_entry);

req.sq_db_info_dma_addr = user_qp->sq_db_info_dma_addr;
req.rq_db_info_dma_addr = user_qp->rq_db_info_dma_addr;
@@ -117,13 +132,22 @@ static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp)

static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr)
{
struct erdma_cmdq_reg_mr_req req;
struct erdma_pd *pd = to_epd(mr->ibmr.pd);
u64 *phy_addr;
int i;
struct erdma_cmdq_reg_mr_req req;
u32 mtt_level;

erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, CMDQ_OPCODE_REG_MR);

if (mr->type == ERDMA_MR_TYPE_FRMR ||
mr->mem.page_cnt > ERDMA_MAX_INLINE_MTT_ENTRIES) {
req.phy_addr[0] = mr->mem.mtt->buf_dma;
mtt_level = ERDMA_MR_INDIRECT_MTT;
} else {
memcpy(req.phy_addr, mr->mem.mtt->buf,
MTT_SIZE(mr->mem.page_cnt));
mtt_level = ERDMA_MR_INLINE_MTT;
}

req.cfg0 = FIELD_PREP(ERDMA_CMD_MR_VALID_MASK, mr->valid) |
FIELD_PREP(ERDMA_CMD_MR_KEY_MASK, mr->ibmr.lkey & 0xFF) |
FIELD_PREP(ERDMA_CMD_MR_MPT_IDX_MASK, mr->ibmr.lkey >> 8);
@@ -132,7 +156,7 @@ static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr)
FIELD_PREP(ERDMA_CMD_REGMR_RIGHT_MASK, mr->access);
req.cfg2 = FIELD_PREP(ERDMA_CMD_REGMR_PAGESIZE_MASK,
ilog2(mr->mem.page_size)) |
FIELD_PREP(ERDMA_CMD_REGMR_MTT_TYPE_MASK, mr->mem.mtt_type) |
FIELD_PREP(ERDMA_CMD_REGMR_MTT_LEVEL_MASK, mtt_level) |
FIELD_PREP(ERDMA_CMD_REGMR_MTT_CNT_MASK, mr->mem.page_cnt);

if (mr->type == ERDMA_MR_TYPE_DMA)
@@ -143,16 +167,6 @@ static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr)
req.size = mr->mem.len;
}

if (mr->type == ERDMA_MR_TYPE_FRMR ||
mr->mem.mtt_type == ERDMA_MR_INDIRECT_MTT) {
phy_addr = req.phy_addr;
*phy_addr = mr->mem.mtt_entry[0];
} else {
phy_addr = req.phy_addr;
for (i = 0; i < mr->mem.mtt_nents; i++)
*phy_addr++ = mr->mem.mtt_entry[i];
}

post_cmd:
return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL);
}
@@ -179,7 +193,7 @@ static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq)
req.qbuf_addr_h = upper_32_bits(cq->kern_cq.qbuf_dma_addr);

req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK, 1) |
FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK,
FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_LEVEL_MASK,
ERDMA_MR_INLINE_MTT);

req.first_page_offset = 0;
@@ -191,16 +205,20 @@ static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq)
FIELD_PREP(ERDMA_CMD_CREATE_CQ_PAGESIZE_MASK,
ilog2(mem->page_size) - ERDMA_HW_PAGE_SHIFT);
if (mem->mtt_nents == 1) {
req.qbuf_addr_l = lower_32_bits(*(u64 *)mem->mtt_buf);
req.qbuf_addr_h = upper_32_bits(*(u64 *)mem->mtt_buf);
req.qbuf_addr_l = lower_32_bits(mem->mtt->buf[0]);
req.qbuf_addr_h = upper_32_bits(mem->mtt->buf[0]);
req.cfg1 |=
FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_LEVEL_MASK,
ERDMA_MR_INLINE_MTT);
} else {
req.qbuf_addr_l = lower_32_bits(mem->mtt_entry[0]);
req.qbuf_addr_h = upper_32_bits(mem->mtt_entry[0]);
req.qbuf_addr_l = lower_32_bits(mem->mtt->buf_dma);
req.qbuf_addr_h = upper_32_bits(mem->mtt->buf_dma);
req.cfg1 |=
FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_LEVEL_MASK,
ERDMA_MR_INDIRECT_MTT);
}
req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK,
mem->mtt_nents);
req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK,
mem->mtt_type);

req.first_page_offset = mem->page_offset;
req.cq_db_info_addr = cq->user_cq.db_info_dma_addr;
@@ -508,12 +526,77 @@ static int init_kernel_qp(struct erdma_dev *dev, struct erdma_qp *qp,
return -ENOMEM;
}

static void erdma_fill_bottom_mtt(struct erdma_dev *dev, struct erdma_mem *mem)
{
struct erdma_mtt *mtt = mem->mtt;
struct ib_block_iter biter;
u32 idx = 0;

while (mtt->low_level)
mtt = mtt->low_level;

rdma_umem_for_each_dma_block(mem->umem, &biter, mem->page_size)
mtt->buf[idx++] = rdma_block_iter_dma_address(&biter);
}

static struct erdma_mtt *erdma_create_cont_mtt(struct erdma_dev *dev,
size_t size)
{
struct erdma_mtt *mtt;
int ret = -ENOMEM;

mtt = kzalloc(sizeof(*mtt), GFP_KERNEL);
if (!mtt)
return ERR_PTR(-ENOMEM);

mtt->size = size;
mtt->buf = kzalloc(mtt->size, GFP_KERNEL);
if (!mtt->buf)
goto err_free_mtt;

mtt->continuous = true;
mtt->buf_dma = dma_map_single(&dev->pdev->dev, mtt->buf, mtt->size,
DMA_TO_DEVICE);
if (dma_mapping_error(&dev->pdev->dev, mtt->buf_dma))
goto err_free_mtt_buf;

return mtt;

err_free_mtt_buf:
kfree(mtt->buf);

err_free_mtt:
kfree(mtt);

return ERR_PTR(ret);
}

static struct erdma_mtt *erdma_create_mtt(struct erdma_dev *dev, size_t size,
bool force_continuous)
{
ibdev_dbg(&dev->ibdev, "create_mtt, size:%lu, force cont:%d\n", size,
force_continuous);

if (force_continuous)
return erdma_create_cont_mtt(dev, size);

return ERR_PTR(-EOPNOTSUPP);
}

static void erdma_destroy_mtt(struct erdma_dev *dev, struct erdma_mtt *mtt)
{
if (mtt->continuous) {
dma_unmap_single(&dev->pdev->dev, mtt->buf_dma, mtt->size,
DMA_TO_DEVICE);
kfree(mtt->buf);
kfree(mtt);
}
}

static int get_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem,
u64 start, u64 len, int access, u64 virt,
unsigned long req_page_size, u8 force_indirect_mtt)
{
struct ib_block_iter biter;
uint64_t *phy_addr = NULL;
int ret = 0;

mem->umem = ib_umem_get(&dev->ibdev, start, len, access);
@@ -529,38 +612,13 @@ static int get_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem,
mem->page_offset = start & (mem->page_size - 1);
mem->mtt_nents = ib_umem_num_dma_blocks(mem->umem, mem->page_size);
mem->page_cnt = mem->mtt_nents;

if (mem->page_cnt > ERDMA_MAX_INLINE_MTT_ENTRIES ||
force_indirect_mtt) {
mem->mtt_type = ERDMA_MR_INDIRECT_MTT;
mem->mtt_buf =
alloc_pages_exact(MTT_SIZE(mem->page_cnt), GFP_KERNEL);
if (!mem->mtt_buf) {
ret = -ENOMEM;
goto error_ret;
}
phy_addr = mem->mtt_buf;
} else {
mem->mtt_type = ERDMA_MR_INLINE_MTT;
phy_addr = mem->mtt_entry;
}

rdma_umem_for_each_dma_block(mem->umem, &biter, mem->page_size) {
*phy_addr = rdma_block_iter_dma_address(&biter);
phy_addr++;
mem->mtt = erdma_create_mtt(dev, MTT_SIZE(mem->page_cnt), true);
if (IS_ERR(mem->mtt)) {
ret = PTR_ERR(mem->mtt);
goto error_ret;
}

if (mem->mtt_type == ERDMA_MR_INDIRECT_MTT) {
mem->mtt_entry[0] =
dma_map_single(&dev->pdev->dev, mem->mtt_buf,
MTT_SIZE(mem->page_cnt), DMA_TO_DEVICE);
if (dma_mapping_error(&dev->pdev->dev, mem->mtt_entry[0])) {
free_pages_exact(mem->mtt_buf, MTT_SIZE(mem->page_cnt));
mem->mtt_buf = NULL;
ret = -ENOMEM;
goto error_ret;
}
}
erdma_fill_bottom_mtt(dev, mem);

return 0;

@@ -575,11 +633,8 @@ static int get_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem,

static void put_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem)
{
if (mem->mtt_buf) {
dma_unmap_single(&dev->pdev->dev, mem->mtt_entry[0],
MTT_SIZE(mem->page_cnt), DMA_TO_DEVICE);
free_pages_exact(mem->mtt_buf, MTT_SIZE(mem->page_cnt));
}
if (mem->mtt)
erdma_destroy_mtt(dev, mem->mtt);

if (mem->umem) {
ib_umem_release(mem->umem);
@@ -875,33 +930,20 @@ struct ib_mr *erdma_ib_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,

mr->mem.page_size = PAGE_SIZE; /* update it later. */
mr->mem.page_cnt = max_num_sg;
mr->mem.mtt_type = ERDMA_MR_INDIRECT_MTT;
mr->mem.mtt_buf =
alloc_pages_exact(MTT_SIZE(mr->mem.page_cnt), GFP_KERNEL);
if (!mr->mem.mtt_buf) {
ret = -ENOMEM;
mr->mem.mtt = erdma_create_mtt(dev, MTT_SIZE(max_num_sg), true);
if (IS_ERR(mr->mem.mtt)) {
ret = PTR_ERR(mr->mem.mtt);
goto out_remove_stag;
}

mr->mem.mtt_entry[0] =
dma_map_single(&dev->pdev->dev, mr->mem.mtt_buf,
MTT_SIZE(mr->mem.page_cnt), DMA_TO_DEVICE);
if (dma_mapping_error(&dev->pdev->dev, mr->mem.mtt_entry[0])) {
ret = -ENOMEM;
goto out_free_mtt;
}

ret = regmr_cmd(dev, mr);
if (ret)
goto out_dma_unmap;
goto out_destroy_mtt;

return &mr->ibmr;

out_dma_unmap:
dma_unmap_single(&dev->pdev->dev, mr->mem.mtt_entry[0],
MTT_SIZE(mr->mem.page_cnt), DMA_TO_DEVICE);
out_free_mtt:
free_pages_exact(mr->mem.mtt_buf, MTT_SIZE(mr->mem.page_cnt));
out_destroy_mtt:
erdma_destroy_mtt(dev, mr->mem.mtt);

out_remove_stag:
erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX],
@@ -920,7 +962,7 @@ static int erdma_set_page(struct ib_mr *ibmr, u64 addr)
if (mr->mem.mtt_nents >= mr->mem.page_cnt)
return -1;

*((u64 *)mr->mem.mtt_buf + mr->mem.mtt_nents) = addr;
mr->mem.mtt->buf[mr->mem.mtt_nents] = addr;
mr->mem.mtt_nents++;

return 0;
26 changes: 21 additions & 5 deletions drivers/infiniband/hw/erdma/erdma_verbs.h
@@ -65,7 +65,7 @@ struct erdma_pd {
* MemoryRegion definition.
*/
#define ERDMA_MAX_INLINE_MTT_ENTRIES 4
#define MTT_SIZE(mtt_cnt) (mtt_cnt << 3) /* per mtt entry takes 8 Bytes. */
#define MTT_SIZE(mtt_cnt) ((mtt_cnt) << 3) /* per mtt entry takes 8 Bytes. */
#define ERDMA_MR_MAX_MTT_CNT 524288
#define ERDMA_MTT_ENTRY_SIZE 8

@@ -90,19 +90,35 @@ static inline u8 to_erdma_access_flags(int access)
(access & IB_ACCESS_REMOTE_ATOMIC ? ERDMA_MR_ACC_RA : 0);
}

/* Hierarchical storage structure for MTT entries */
struct erdma_mtt {
u64 *buf;
size_t size;

bool continuous;
union {
dma_addr_t buf_dma;
struct {
struct scatterlist *sglist;
u32 nsg;
u32 level;
};
};

struct erdma_mtt *low_level;
};

struct erdma_mem {
struct ib_umem *umem;
void *mtt_buf;
u32 mtt_type;
struct erdma_mtt *mtt;

u32 page_size;
u32 page_offset;
u32 page_cnt;
u32 mtt_nents;

u64 va;
u64 len;

u64 mtt_entry[ERDMA_MAX_INLINE_MTT_ENTRIES];
};

struct erdma_mr {
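Taken together, the new helpers give every erdma_mem a single create/fill/destroy lifecycle for its MTT, as used by get_mtt_entries()/put_mtt_entries() above. A condensed sketch of that flow, with error handling trimmed and covering only the continuous case this patch supports (the example_* functions are illustrative, not part of the patch):

/*
 * Sketch: build and tear down the MTT for an already-pinned umem.
 * MTT_SIZE() converts an entry count into bytes (8 bytes per entry).
 */
static int example_build_mem_mtt(struct erdma_dev *dev, struct erdma_mem *mem)
{
	/* One entry per DMA block of the umem; always continuous for now. */
	mem->mtt = erdma_create_mtt(dev, MTT_SIZE(mem->page_cnt), true);
	if (IS_ERR(mem->mtt))
		return PTR_ERR(mem->mtt);

	/* Write the DMA block addresses into the bottom-level buffer. */
	erdma_fill_bottom_mtt(dev, mem);
	return 0;
}

static void example_release_mem_mtt(struct erdma_dev *dev, struct erdma_mem *mem)
{
	/* Unmaps and frees the continuous buffer (see erdma_destroy_mtt()). */
	erdma_destroy_mtt(dev, mem->mtt);
}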
