Skip to content

Commit

Permalink
IB/mlx4: Handle new FW requirement for send request prefetching
Browse files Browse the repository at this point in the history
New ConnectX firmware introduces FW command interface revision 2,
which requires that for each QP, a chunk of send queue entries (the
"headroom") is kept marked as invalid, so that the HCA doesn't get
confused if it prefetches entries that haven't been posted yet.  Add
code to the driver to do this, and also update the user ABI so that
userspace can request that the prefetcher be turned off for userspace
QPs (we just leave the prefetcher on for all kernel QPs).

Unfortunately, marking send queue entries this way is confuses older
firmware, so we change the driver to allow only FW command interface
revisions 2.  This means that users will have to update their firmware
to work with the new driver, but the firmware is changing quickly and
the old firmware has lots of other bugs anyway, so this shouldn't be too
big a deal.

Based on a patch from Jack Morgenstein <jackm@dev.mellanox.co.il>.

Signed-off-by: Roland Dreier <rolandd@cisco.com>
  • Loading branch information
Roland Dreier committed Jun 18, 2007
1 parent 42c059e commit 0e6e741
Show file tree
Hide file tree
Showing 5 changed files with 83 additions and 38 deletions.
4 changes: 2 additions & 2 deletions drivers/infiniband/hw/mlx4/cq.c
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
wq = &(*cur_qp)->sq;
wqe_ctr = be16_to_cpu(cqe->wqe_index);
wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
wc->wr_id = wq->wrid[wq->tail & (wq->max - 1)];
wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
++wq->tail;
} else if ((*cur_qp)->ibqp.srq) {
srq = to_msrq((*cur_qp)->ibqp.srq);
Expand All @@ -364,7 +364,7 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
mlx4_ib_free_srq_wqe(srq, wqe_ctr);
} else {
wq = &(*cur_qp)->rq;
wc->wr_id = wq->wrid[wq->tail & (wq->max - 1)];
wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
++wq->tail;
}

Expand Down
5 changes: 4 additions & 1 deletion drivers/infiniband/hw/mlx4/mlx4_ib.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,8 @@ struct mlx4_ib_mr {
struct mlx4_ib_wq {
u64 *wrid;
spinlock_t lock;
int max;
int wqe_cnt;
int max_post;
int max_gs;
int offset;
int wqe_shift;
Expand All @@ -113,6 +114,7 @@ struct mlx4_ib_qp {

u32 doorbell_qpn;
__be32 sq_signal_bits;
int sq_spare_wqes;
struct mlx4_ib_wq sq;

struct ib_umem *umem;
Expand All @@ -123,6 +125,7 @@ struct mlx4_ib_qp {
u8 alt_port;
u8 atomic_rd_en;
u8 resp_depth;
u8 sq_no_prefetch;
u8 state;
};

Expand Down
101 changes: 71 additions & 30 deletions drivers/infiniband/hw/mlx4/qp.c
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,20 @@ static void *get_send_wqe(struct mlx4_ib_qp *qp, int n)
return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift));
}

/*
* Stamp a SQ WQE so that it is invalid if prefetched by marking the
* first four bytes of every 64 byte chunk with 0xffffffff, except for
* the very first chunk of the WQE.
*/
static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n)
{
u32 *wqe = get_send_wqe(qp, n);
int i;

for (i = 16; i < 1 << (qp->sq.wqe_shift - 2); i += 16)
wqe[i] = 0xffffffff;
}

static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
{
struct ib_event event;
Expand Down Expand Up @@ -201,18 +215,18 @@ static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
if (cap->max_recv_wr)
return -EINVAL;

qp->rq.max = qp->rq.max_gs = 0;
qp->rq.wqe_cnt = qp->rq.max_gs = 0;
} else {
/* HW requires >= 1 RQ entry with >= 1 gather entry */
if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge))
return -EINVAL;

qp->rq.max = roundup_pow_of_two(max(1U, cap->max_recv_wr));
qp->rq.wqe_cnt = roundup_pow_of_two(max(1U, cap->max_recv_wr));
qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge));
qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg));
}

cap->max_recv_wr = qp->rq.max;
cap->max_recv_wr = qp->rq.max_post = qp->rq.wqe_cnt;
cap->max_recv_sge = qp->rq.max_gs;

return 0;
Expand All @@ -236,8 +250,6 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
return -EINVAL;

qp->sq.max = cap->max_send_wr ? roundup_pow_of_two(cap->max_send_wr) : 1;

qp->sq.wqe_shift = ilog2(roundup_pow_of_two(max(cap->max_send_sge *
sizeof (struct mlx4_wqe_data_seg),
cap->max_inline_data +
Expand All @@ -246,18 +258,25 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
qp->sq.max_gs = ((1 << qp->sq.wqe_shift) - send_wqe_overhead(type)) /
sizeof (struct mlx4_wqe_data_seg);

qp->buf_size = (qp->rq.max << qp->rq.wqe_shift) +
(qp->sq.max << qp->sq.wqe_shift);
/*
* We need to leave 2 KB + 1 WQE of headroom in the SQ to
* allow HW to prefetch.
*/
qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr + qp->sq_spare_wqes);

qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
(qp->sq.wqe_cnt << qp->sq.wqe_shift);
if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
qp->rq.offset = 0;
qp->sq.offset = qp->rq.max << qp->rq.wqe_shift;
qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
} else {
qp->rq.offset = qp->sq.max << qp->sq.wqe_shift;
qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
qp->sq.offset = 0;
}

cap->max_send_wr = qp->sq.max;
cap->max_send_sge = qp->sq.max_gs;
cap->max_send_wr = qp->sq.max_post = qp->sq.wqe_cnt - qp->sq_spare_wqes;
cap->max_send_sge = qp->sq.max_gs;
cap->max_inline_data = (1 << qp->sq.wqe_shift) - send_wqe_overhead(type) -
sizeof (struct mlx4_wqe_inline_seg);

Expand All @@ -267,11 +286,11 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
static int set_user_sq_size(struct mlx4_ib_qp *qp,
struct mlx4_ib_create_qp *ucmd)
{
qp->sq.max = 1 << ucmd->log_sq_bb_count;
qp->sq.wqe_cnt = 1 << ucmd->log_sq_bb_count;
qp->sq.wqe_shift = ucmd->log_sq_stride;

qp->buf_size = (qp->rq.max << qp->rq.wqe_shift) +
(qp->sq.max << qp->sq.wqe_shift);
qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
(qp->sq.wqe_cnt << qp->sq.wqe_shift);

return 0;
}
Expand Down Expand Up @@ -307,6 +326,8 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
goto err;
}

qp->sq_no_prefetch = ucmd.sq_no_prefetch;

err = set_user_sq_size(qp, &ucmd);
if (err)
goto err;
Expand Down Expand Up @@ -334,6 +355,8 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
goto err_mtt;
}
} else {
qp->sq_no_prefetch = 0;

err = set_kernel_sq_size(dev, &init_attr->cap, init_attr->qp_type, qp);
if (err)
goto err;
Expand All @@ -360,8 +383,8 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
if (err)
goto err_mtt;

qp->sq.wrid = kmalloc(qp->sq.max * sizeof (u64), GFP_KERNEL);
qp->rq.wrid = kmalloc(qp->rq.max * sizeof (u64), GFP_KERNEL);
qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), GFP_KERNEL);
qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), GFP_KERNEL);

if (!qp->sq.wrid || !qp->rq.wrid) {
err = -ENOMEM;
Expand Down Expand Up @@ -743,14 +766,17 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
context->mtu_msgmax = (attr->path_mtu << 5) | 31;
}

if (qp->rq.max)
context->rq_size_stride = ilog2(qp->rq.max) << 3;
if (qp->rq.wqe_cnt)
context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3;
context->rq_size_stride |= qp->rq.wqe_shift - 4;

if (qp->sq.max)
context->sq_size_stride = ilog2(qp->sq.max) << 3;
if (qp->sq.wqe_cnt)
context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3;
context->sq_size_stride |= qp->sq.wqe_shift - 4;

if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
context->sq_size_stride |= !!qp->sq_no_prefetch << 7;

if (qp->ibqp.uobject)
context->usr_page = cpu_to_be32(to_mucontext(ibqp->uobject->context)->uar.index);
else
Expand Down Expand Up @@ -884,16 +910,19 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,

/*
* Before passing a kernel QP to the HW, make sure that the
* ownership bits of the send queue are set so that the
* hardware doesn't start processing stale work requests.
* ownership bits of the send queue are set and the SQ
* headroom is stamped so that the hardware doesn't start
* processing stale work requests.
*/
if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
struct mlx4_wqe_ctrl_seg *ctrl;
int i;

for (i = 0; i < qp->sq.max; ++i) {
for (i = 0; i < qp->sq.wqe_cnt; ++i) {
ctrl = get_send_wqe(qp, i);
ctrl->owner_opcode = cpu_to_be32(1 << 31);

stamp_send_wqe(qp, i);
}
}

Expand Down Expand Up @@ -1124,15 +1153,15 @@ static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq
struct mlx4_ib_cq *cq;

cur = wq->head - wq->tail;
if (likely(cur + nreq < wq->max))
if (likely(cur + nreq < wq->max_post))
return 0;

cq = to_mcq(ib_cq);
spin_lock(&cq->lock);
cur = wq->head - wq->tail;
spin_unlock(&cq->lock);

return cur + nreq >= wq->max;
return cur + nreq >= wq->max_post;
}

int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
Expand Down Expand Up @@ -1165,8 +1194,8 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
goto out;
}

ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.max - 1));
qp->sq.wrid[ind & (qp->sq.max - 1)] = wr->wr_id;
ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;

ctrl->srcrb_flags =
(wr->send_flags & IB_SEND_SIGNALED ?
Expand Down Expand Up @@ -1301,7 +1330,16 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
}

ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
(ind & qp->sq.max ? cpu_to_be32(1 << 31) : 0);
(ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);

/*
* We can improve latency by not stamping the last
* send queue WQE until after ringing the doorbell, so
* only stamp here if there are still more WQEs to post.
*/
if (wr->next)
stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
(qp->sq.wqe_cnt - 1));

++ind;
}
Expand All @@ -1324,6 +1362,9 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
* and reach the HCA out of order.
*/
mmiowb();

stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
(qp->sq.wqe_cnt - 1));
}

spin_unlock_irqrestore(&qp->rq.lock, flags);
Expand All @@ -1344,7 +1385,7 @@ int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,

spin_lock_irqsave(&qp->rq.lock, flags);

ind = qp->rq.head & (qp->rq.max - 1);
ind = qp->rq.head & (qp->rq.wqe_cnt - 1);

for (nreq = 0; wr; ++nreq, wr = wr->next) {
if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.send_cq)) {
Expand Down Expand Up @@ -1375,7 +1416,7 @@ int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,

qp->rq.wrid[ind] = wr->wr_id;

ind = (ind + 1) & (qp->rq.max - 1);
ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
}

out:
Expand Down
9 changes: 5 additions & 4 deletions drivers/infiniband/hw/mlx4/user.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
* Increment this value if any changes that break userspace ABI
* compatibility are made.
*/
#define MLX4_IB_UVERBS_ABI_VERSION 2
#define MLX4_IB_UVERBS_ABI_VERSION 3

/*
* Make sure that all structs defined in this file remain laid out so
Expand Down Expand Up @@ -87,9 +87,10 @@ struct mlx4_ib_create_srq_resp {
struct mlx4_ib_create_qp {
__u64 buf_addr;
__u64 db_addr;
__u8 log_sq_bb_count;
__u8 log_sq_stride;
__u8 reserved[6];
__u8 log_sq_bb_count;
__u8 log_sq_stride;
__u8 sq_no_prefetch;
__u8 reserved[5];
};

#endif /* MLX4_IB_USER_H */
2 changes: 1 addition & 1 deletion drivers/net/mlx4/fw.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
#include "icm.h"

enum {
MLX4_COMMAND_INTERFACE_REV = 1
MLX4_COMMAND_INTERFACE_REV = 2,
};

extern void __buggy_use_of_MLX4_GET(void);
Expand Down

0 comments on commit 0e6e741

Please sign in to comment.