Skip to content

Commit

Permalink
IB/ipath: Fix RC and UC error handling
Browse files Browse the repository at this point in the history
When errors are detected in RC, the QP should transition to the
IB_QPS_ERR state, not the IB_QPS_SQE state. Also, when the error is on
the responder side, the receive work completion error was incorrect
(remote vs. local).

Signed-off-by: Ralph Campbell <ralph.campbell@qlogic.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
  • Loading branch information
Ralph Campbell authored and Roland Dreier committed May 13, 2008
1 parent dd37818 commit 53dc1ca
Show file tree
Hide file tree
Showing 5 changed files with 132 additions and 224 deletions.
54 changes: 8 additions & 46 deletions drivers/infiniband/hw/ipath/ipath_qp.c
Original file line number Diff line number Diff line change
Expand Up @@ -374,13 +374,14 @@ static void ipath_reset_qp(struct ipath_qp *qp, enum ib_qp_type type)
}

/**
* ipath_error_qp - put a QP into an error state
* @qp: the QP to put into an error state
* ipath_error_qp - put a QP into the error state
* @qp: the QP to put into the error state
* @err: the receive completion error to signal if a RWQE is active
*
* Flushes both send and receive work queues.
* Returns true if last WQE event should be generated.
* The QP s_lock should be held and interrupts disabled.
* If we are already in error state, just return.
*/

int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err)
Expand All @@ -389,8 +390,10 @@ int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err)
struct ib_wc wc;
int ret = 0;

ipath_dbg("QP%d/%d in error state (%d)\n",
qp->ibqp.qp_num, qp->remote_qpn, err);
if (qp->state == IB_QPS_ERR)
goto bail;

qp->state = IB_QPS_ERR;

spin_lock(&dev->pending_lock);
if (!list_empty(&qp->timerwait))
Expand Down Expand Up @@ -460,6 +463,7 @@ int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err)
} else if (qp->ibqp.event_handler)
ret = 1;

bail:
return ret;
}

Expand Down Expand Up @@ -1025,48 +1029,6 @@ int ipath_init_qp_table(struct ipath_ibdev *idev, int size)
return ret;
}

/**
* ipath_sqerror_qp - put a QP's send queue into an error state
* @qp: QP who's send queue will be put into an error state
* @wc: the WC responsible for putting the QP in this state
*
* Flushes the send work queue.
* The QP s_lock should be held and interrupts disabled.
*/

void ipath_sqerror_qp(struct ipath_qp *qp, struct ib_wc *wc)
{
struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);

ipath_dbg("Send queue error on QP%d/%d: err: %d\n",
qp->ibqp.qp_num, qp->remote_qpn, wc->status);

spin_lock(&dev->pending_lock);
if (!list_empty(&qp->timerwait))
list_del_init(&qp->timerwait);
if (!list_empty(&qp->piowait))
list_del_init(&qp->piowait);
spin_unlock(&dev->pending_lock);

ipath_cq_enter(to_icq(qp->ibqp.send_cq), wc, 1);
if (++qp->s_last >= qp->s_size)
qp->s_last = 0;

wc->status = IB_WC_WR_FLUSH_ERR;

while (qp->s_last != qp->s_head) {
wqe = get_swqe_ptr(qp, qp->s_last);
wc->wr_id = wqe->wr.wr_id;
wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
ipath_cq_enter(to_icq(qp->ibqp.send_cq), wc, 1);
if (++qp->s_last >= qp->s_size)
qp->s_last = 0;
}
qp->s_cur = qp->s_tail = qp->s_head;
qp->state = IB_QPS_SQE;
}

/**
* ipath_get_credit - flush the send work queue of a QP
* @qp: the qp who's send work queue to flush
Expand Down
127 changes: 41 additions & 86 deletions drivers/infiniband/hw/ipath/ipath_rc.c
Original file line number Diff line number Diff line change
Expand Up @@ -771,27 +771,14 @@ static void reset_psn(struct ipath_qp *qp, u32 psn)
*
* The QP s_lock should be held and interrupts disabled.
*/
void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc)
void ipath_restart_rc(struct ipath_qp *qp, u32 psn)
{
struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
struct ipath_ibdev *dev;

if (qp->s_retry == 0) {
wc->wr_id = wqe->wr.wr_id;
wc->status = IB_WC_RETRY_EXC_ERR;
wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
wc->vendor_err = 0;
wc->byte_len = 0;
wc->qp = &qp->ibqp;
wc->imm_data = 0;
wc->src_qp = qp->remote_qpn;
wc->wc_flags = 0;
wc->pkey_index = 0;
wc->slid = qp->remote_ah_attr.dlid;
wc->sl = qp->remote_ah_attr.sl;
wc->dlid_path_bits = 0;
wc->port_num = 0;
ipath_sqerror_qp(qp, wc);
ipath_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
goto bail;
}
qp->s_retry--;
Expand All @@ -804,6 +791,8 @@ void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc)
spin_lock(&dev->pending_lock);
if (!list_empty(&qp->timerwait))
list_del_init(&qp->timerwait);
if (!list_empty(&qp->piowait))
list_del_init(&qp->piowait);
spin_unlock(&dev->pending_lock);

if (wqe->wr.opcode == IB_WR_RDMA_READ)
Expand Down Expand Up @@ -845,6 +834,7 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
{
struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
struct ib_wc wc;
enum ib_wc_status status;
struct ipath_swqe *wqe;
int ret = 0;
u32 ack_psn;
Expand Down Expand Up @@ -909,7 +899,7 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
*/
update_last_psn(qp, wqe->psn - 1);
/* Retry this request. */
ipath_restart_rc(qp, wqe->psn, &wc);
ipath_restart_rc(qp, wqe->psn);
/*
* No need to process the ACK/NAK since we are
* restarting an earlier request.
Expand Down Expand Up @@ -937,20 +927,15 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
/* Post a send completion queue entry if requested. */
if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
(wqe->wr.send_flags & IB_SEND_SIGNALED)) {
memset(&wc, 0, sizeof wc);
wc.wr_id = wqe->wr.wr_id;
wc.status = IB_WC_SUCCESS;
wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
wc.vendor_err = 0;
wc.byte_len = wqe->length;
wc.imm_data = 0;
wc.qp = &qp->ibqp;
wc.src_qp = qp->remote_qpn;
wc.wc_flags = 0;
wc.pkey_index = 0;
wc.slid = qp->remote_ah_attr.dlid;
wc.sl = qp->remote_ah_attr.sl;
wc.dlid_path_bits = 0;
wc.port_num = 0;
ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
}
qp->s_retry = qp->s_retry_cnt;
Expand Down Expand Up @@ -1012,7 +997,7 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
if (qp->s_last == qp->s_tail)
goto bail;
if (qp->s_rnr_retry == 0) {
wc.status = IB_WC_RNR_RETRY_EXC_ERR;
status = IB_WC_RNR_RETRY_EXC_ERR;
goto class_b;
}
if (qp->s_rnr_retry_cnt < 7)
Expand Down Expand Up @@ -1050,37 +1035,25 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
* RDMA READ response which terminates the RDMA
* READ.
*/
ipath_restart_rc(qp, psn, &wc);
ipath_restart_rc(qp, psn);
break;

case 1: /* Invalid Request */
wc.status = IB_WC_REM_INV_REQ_ERR;
status = IB_WC_REM_INV_REQ_ERR;
dev->n_other_naks++;
goto class_b;

case 2: /* Remote Access Error */
wc.status = IB_WC_REM_ACCESS_ERR;
status = IB_WC_REM_ACCESS_ERR;
dev->n_other_naks++;
goto class_b;

case 3: /* Remote Operation Error */
wc.status = IB_WC_REM_OP_ERR;
status = IB_WC_REM_OP_ERR;
dev->n_other_naks++;
class_b:
wc.wr_id = wqe->wr.wr_id;
wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
wc.vendor_err = 0;
wc.byte_len = 0;
wc.qp = &qp->ibqp;
wc.imm_data = 0;
wc.src_qp = qp->remote_qpn;
wc.wc_flags = 0;
wc.pkey_index = 0;
wc.slid = qp->remote_ah_attr.dlid;
wc.sl = qp->remote_ah_attr.sl;
wc.dlid_path_bits = 0;
wc.port_num = 0;
ipath_sqerror_qp(qp, &wc);
ipath_send_complete(qp, wqe, status);
ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
break;

default:
Expand Down Expand Up @@ -1126,8 +1099,8 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
int header_in_data)
{
struct ipath_swqe *wqe;
enum ib_wc_status status;
unsigned long flags;
struct ib_wc wc;
int diff;
u32 pad;
u32 aeth;
Expand Down Expand Up @@ -1159,6 +1132,7 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
if (unlikely(qp->s_last == qp->s_tail))
goto ack_done;
wqe = get_swqe_ptr(qp, qp->s_last);
status = IB_WC_SUCCESS;

switch (opcode) {
case OP(ACKNOWLEDGE):
Expand Down Expand Up @@ -1200,7 +1174,7 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
/* no AETH, no ACK */
if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
dev->n_rdma_seq++;
ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
ipath_restart_rc(qp, qp->s_last_psn + 1);
goto ack_done;
}
if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
Expand Down Expand Up @@ -1261,7 +1235,7 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
/* ACKs READ req. */
if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
dev->n_rdma_seq++;
ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
ipath_restart_rc(qp, qp->s_last_psn + 1);
goto ack_done;
}
if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
Expand Down Expand Up @@ -1291,31 +1265,16 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
goto ack_done;
}

ack_done:
spin_unlock_irqrestore(&qp->s_lock, flags);
goto bail;

ack_op_err:
wc.status = IB_WC_LOC_QP_OP_ERR;
status = IB_WC_LOC_QP_OP_ERR;
goto ack_err;

ack_len_err:
wc.status = IB_WC_LOC_LEN_ERR;
status = IB_WC_LOC_LEN_ERR;
ack_err:
wc.wr_id = wqe->wr.wr_id;
wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
wc.vendor_err = 0;
wc.byte_len = 0;
wc.imm_data = 0;
wc.qp = &qp->ibqp;
wc.src_qp = qp->remote_qpn;
wc.wc_flags = 0;
wc.pkey_index = 0;
wc.slid = qp->remote_ah_attr.dlid;
wc.sl = qp->remote_ah_attr.sl;
wc.dlid_path_bits = 0;
wc.port_num = 0;
ipath_sqerror_qp(qp, &wc);
ipath_send_complete(qp, wqe, status);
ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
ack_done:
spin_unlock_irqrestore(&qp->s_lock, flags);
bail:
return;
Expand Down Expand Up @@ -1523,13 +1482,12 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
return 0;
}

static void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err)
void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err)
{
unsigned long flags;
int lastwqe;

spin_lock_irqsave(&qp->s_lock, flags);
qp->state = IB_QPS_ERR;
lastwqe = ipath_error_qp(qp, err);
spin_unlock_irqrestore(&qp->s_lock, flags);

Expand Down Expand Up @@ -1643,11 +1601,7 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
opcode == OP(SEND_LAST) ||
opcode == OP(SEND_LAST_WITH_IMMEDIATE))
break;
nack_inv:
ipath_rc_error(qp, IB_WC_REM_INV_REQ_ERR);
qp->r_nak_state = IB_NAK_INVALID_REQUEST;
qp->r_ack_psn = qp->r_psn;
goto send_ack;
goto nack_inv;

case OP(RDMA_WRITE_FIRST):
case OP(RDMA_WRITE_MIDDLE):
Expand All @@ -1673,18 +1627,13 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
break;
}

wc.imm_data = 0;
wc.wc_flags = 0;
memset(&wc, 0, sizeof wc);

/* OK, process the packet. */
switch (opcode) {
case OP(SEND_FIRST):
if (!ipath_get_rwqe(qp, 0)) {
rnr_nak:
qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
qp->r_ack_psn = qp->r_psn;
goto send_ack;
}
if (!ipath_get_rwqe(qp, 0))
goto rnr_nak;
qp->r_rcv_len = 0;
/* FALLTHROUGH */
case OP(SEND_MIDDLE):
Expand Down Expand Up @@ -1751,14 +1700,10 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
else
wc.opcode = IB_WC_RECV;
wc.vendor_err = 0;
wc.qp = &qp->ibqp;
wc.src_qp = qp->remote_qpn;
wc.pkey_index = 0;
wc.slid = qp->remote_ah_attr.dlid;
wc.sl = qp->remote_ah_attr.sl;
wc.dlid_path_bits = 0;
wc.port_num = 0;
/* Signal completion event if the solicited bit is set. */
ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
(ohdr->bth[0] &
Expand Down Expand Up @@ -1951,11 +1896,21 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
goto send_ack;
goto done;

rnr_nak:
qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
qp->r_ack_psn = qp->r_psn;
goto send_ack;

nack_inv:
ipath_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
qp->r_nak_state = IB_NAK_INVALID_REQUEST;
qp->r_ack_psn = qp->r_psn;
goto send_ack;

nack_acc:
ipath_rc_error(qp, IB_WC_REM_ACCESS_ERR);
ipath_rc_error(qp, IB_WC_LOC_PROT_ERR);
qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
qp->r_ack_psn = qp->r_psn;

send_ack:
send_rc_ack(qp);

Expand Down
Loading

0 comments on commit 53dc1ca

Please sign in to comment.