diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h
index 23a58ac0d47cc..bd913701761de 100644
--- a/drivers/infiniband/hw/hfi1/iowait.h
+++ b/drivers/infiniband/hw/hfi1/iowait.h
@@ -185,6 +185,18 @@ static inline bool iowait_schedule(struct iowait *wait,
         return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_IB_SE].iowork);
 }
 
+/**
+ * iowait_tid_schedule - schedule the tid SE
+ * @wait: the iowait structure
+ * @wq: the work queue
+ * @cpu: the cpu
+ */
+static inline bool iowait_tid_schedule(struct iowait *wait,
+                                       struct workqueue_struct *wq, int cpu)
+{
+        return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_TID_SE].iowork);
+}
+
 /**
  * iowait_sdma_drain() - wait for DMAs to drain
  *
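Aside: iowait_tid_schedule() leans on the return convention of queue_work_on(), which returns false when the work item was already pending and true when this call queued it; the !! narrows that to a bool meaning "this call actually scheduled the TID send engine". A minimal, self-contained sketch of that convention, independent of the iowait machinery (all demo_* names are hypothetical):

#include <linux/module.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;
static struct work_struct demo_work;

static void demo_fn(struct work_struct *work)
{
        pr_info("demo work ran on cpu %d\n", raw_smp_processor_id());
}

static int __init demo_init(void)
{
        demo_wq = alloc_workqueue("demo_wq", 0, 0);
        if (!demo_wq)
                return -ENOMEM;
        INIT_WORK(&demo_work, demo_fn);
        /*
         * queue_work_on() returns true only when the work was not
         * already pending, so !!queue_work_on(...) tells the caller
         * whether this particular call queued the item.
         */
        if (!queue_work_on(0, demo_wq, &demo_work))
                pr_info("demo work was already pending\n");
        return 0;
}

static void __exit demo_exit(void)
{
        destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

This boolean result is what lets hfi1_qp_schedule() below clear IOWAIT_PENDING_TID only when the schedule call reports success.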
diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
index 31b4b60f4364a..96632c77f36ff 100644
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -431,6 +431,11 @@ static void hfi1_qp_schedule(struct rvt_qp *qp)
                 if (ret)
                         iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
         }
+        if (iowait_flag_set(&priv->s_iowait, IOWAIT_PENDING_TID)) {
+                ret = hfi1_schedule_tid_send(qp);
+                if (ret)
+                        iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+        }
 }
 
 void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
@@ -450,8 +455,27 @@ void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
 
 void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait)
 {
-        if (iowait_set_work_flag(wait) == IOWAIT_IB_SE)
+        struct hfi1_qp_priv *priv = qp->priv;
+
+        if (iowait_set_work_flag(wait) == IOWAIT_IB_SE) {
                 qp->s_flags &= ~RVT_S_BUSY;
+                /*
+                 * If we are sending a first-leg packet from the second leg,
+                 * we need to clear the busy flag from priv->s_flags to
+                 * avoid a race condition when the qp wakes up before
+                 * the call to hfi1_verbs_send() returns to the second
+                 * leg. In that case, the second leg will terminate without
+                 * being re-scheduled, resulting in failure to send TID RDMA
+                 * WRITE DATA and TID RDMA ACK packets.
+                 */
+                if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+                        priv->s_flags &= ~(HFI1_S_TID_BUSY_SET |
+                                           RVT_S_BUSY);
+                        iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+                }
+        } else {
+                priv->s_flags &= ~RVT_S_BUSY;
+        }
 }
 
 static int iowait_sleep(
@@ -694,7 +718,7 @@ void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp)
                     &priv->s_iowait,
                     1,
                     _hfi1_do_send,
-                    NULL,
+                    _hfi1_do_tid_send,
                     iowait_sleep,
                     iowait_wakeup,
                     iowait_sdma_drained);
@@ -851,7 +875,8 @@ void notify_error_qp(struct rvt_qp *qp)
         if (lock) {
                 write_seqlock(lock);
                 if (!list_empty(&priv->s_iowait.list) &&
-                    !(qp->s_flags & RVT_S_BUSY)) {
+                    !(qp->s_flags & RVT_S_BUSY) &&
+                    !(priv->s_flags & RVT_S_BUSY)) {
                         qp->s_flags &= ~RVT_S_ANY_WAIT_IO;
                         list_del_init(&priv->s_iowait.list);
                         priv->s_iowait.lock = NULL;
@@ -860,7 +885,8 @@ void notify_error_qp(struct rvt_qp *qp)
                 write_sequnlock(lock);
         }
 
-        if (!(qp->s_flags & RVT_S_BUSY)) {
+        if (!(qp->s_flags & RVT_S_BUSY) && !(priv->s_flags & RVT_S_BUSY)) {
+                qp->s_hdrwords = 0;
                 if (qp->s_rdma_mr) {
                         rvt_put_mr(qp->s_rdma_mr);
                         qp->s_rdma_mr = NULL;
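Aside: the hfi1_qp_schedule()/hfi1_qp_unbusy() hunks above implement a deferred-scheduling handshake: when the TID leg cannot run, IOWAIT_PENDING_TID is recorded on the iowait structure, and a later wakeup retries the schedule and clears the bit only on success, so a lost race never drops the request. The same pattern reduced to plain atomic bitops, as a sketch (demo_* names are illustrative, not hfi1 API):

#include <linux/bitops.h>
#include <linux/types.h>

#define DEMO_PENDING_TID 0      /* bit number, analogous to IOWAIT_PENDING_TID */

struct demo_wait {
        unsigned long flags;
};

/* The leg cannot run right now: remember that it wants to. */
static void demo_mark_pending(struct demo_wait *w)
{
        set_bit(DEMO_PENDING_TID, &w->flags);
}

/* On wakeup: retry, and clear the bit only if scheduling succeeded. */
static void demo_wakeup(struct demo_wait *w, bool (*try_schedule)(void))
{
        if (test_bit(DEMO_PENDING_TID, &w->flags) && try_schedule())
                clear_bit(DEMO_PENDING_TID, &w->flags);
}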
diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h
index d531b760ea93f..b670321365d31 100644
--- a/drivers/infiniband/hw/hfi1/qp.h
+++ b/drivers/infiniband/hw/hfi1/qp.h
@@ -82,6 +82,7 @@ extern const struct rvt_operation_params hfi1_post_parms[];
 
 #define HFI1_S_ANY_WAIT_IO (RVT_S_ANY_WAIT_IO | HFI1_S_WAIT_PIO_DRAIN)
 #define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND)
+#define HFI1_S_ANY_TID_WAIT_SEND (RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_DMA)
 
 /*
  * Send if not busy or waiting for I/O and either
diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c
index f96c0f544cb00..124a3ec1e15c9 100644
--- a/drivers/infiniband/hw/hfi1/ruc.c
+++ b/drivers/infiniband/hw/hfi1/ruc.c
@@ -453,11 +453,13 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
 #define SEND_RESCHED_TIMEOUT (5 * HZ)  /* 5s in jiffies */
 
 /**
- * schedule_send_yield - test for a yield required for QP send engine
+ * hfi1_schedule_send_yield - test for a yield required for QP
+ * send engine
  * @timeout: Final time for timeout slice for jiffies
  * @qp: a pointer to QP
  * @ps: a pointer to a structure with commonly lookup values for
  *      the the send engine progress
+ * @tid: true if it is the tid leg
  *
  * This routine checks if the time slice for the QP has expired
  * for RC QPs, if so an additional work entry is queued. At this
@@ -465,8 +467,8 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
  * returns true if a yield is required, otherwise, false
  * is returned.
  */
-static bool schedule_send_yield(struct rvt_qp *qp,
-                                struct hfi1_pkt_state *ps)
+bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+                              bool tid)
 {
         ps->pkts_sent = true;
 
@@ -474,8 +476,24 @@ static bool schedule_send_yield(struct rvt_qp *qp,
         if (!ps->in_thread ||
             workqueue_congested(ps->cpu, ps->ppd->hfi1_wq)) {
                 spin_lock_irqsave(&qp->s_lock, ps->flags);
-                qp->s_flags &= ~RVT_S_BUSY;
-                hfi1_schedule_send(qp);
+                if (!tid) {
+                        qp->s_flags &= ~RVT_S_BUSY;
+                        hfi1_schedule_send(qp);
+                } else {
+                        struct hfi1_qp_priv *priv = qp->priv;
+
+                        if (priv->s_flags &
+                            HFI1_S_TID_BUSY_SET) {
+                                qp->s_flags &= ~RVT_S_BUSY;
+                                priv->s_flags &=
+                                        ~(HFI1_S_TID_BUSY_SET |
+                                          RVT_S_BUSY);
+                        } else {
+                                priv->s_flags &= ~RVT_S_BUSY;
+                        }
+                        hfi1_schedule_tid_send(qp);
+                }
+
                 spin_unlock_irqrestore(&qp->s_lock, ps->flags);
                 this_cpu_inc(*ps->ppd->dd->send_schedule);
                 trace_hfi1_rc_expired_time_slice(qp, true);
@@ -576,6 +594,8 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
         do {
                 /* Check for a constructed packet to be sent. */
                 if (ps.s_txreq) {
+                        if (priv->s_flags & HFI1_S_TID_BUSY_SET)
+                                qp->s_flags |= RVT_S_BUSY;
                         spin_unlock_irqrestore(&qp->s_lock, ps.flags);
                         /*
                          * If the packet cannot be sent now, return and
@@ -585,7 +605,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
                                 return;
 
                         /* allow other tasks to run */
-                        if (schedule_send_yield(qp, &ps))
+                        if (hfi1_schedule_send_yield(qp, &ps, false))
                                 return;
 
                         spin_lock_irqsave(&qp->s_lock, ps.flags);
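Aside: hfi1_schedule_send_yield() above is a cooperative time-slice check; once a QP's slice expires, the engine requeues itself and unwinds so other QPs sharing the workqueue can make progress. The shape of that check, stripped of the hfi1 flag handling, as a sketch (demo_* names are illustrative):

#include <linux/jiffies.h>
#include <linux/workqueue.h>

struct demo_engine {
        struct work_struct work;        /* runs the send loop */
        unsigned long deadline;         /* jiffies value ending this slice */
};

static bool demo_yield_needed(struct demo_engine *e)
{
        /* Slice not exhausted: keep running inline. */
        if (time_before(jiffies, e->deadline))
                return false;
        /*
         * Slice exhausted: requeue ourselves and return true so the
         * caller unwinds, letting other queued work run on this CPU.
         */
        schedule_work(&e->work);
        return true;
}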
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index 15c12243c1662..80111dd1d8767 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -127,6 +127,7 @@ static void hfi1_tid_retry_timeout(struct timer_list *t);
 static int make_tid_rdma_ack(struct rvt_qp *qp,
                              struct ib_other_headers *ohdr,
                              struct hfi1_pkt_state *ps);
+static void hfi1_do_tid_send(struct rvt_qp *qp);
 
 static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
 {
@@ -3048,6 +3049,7 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
                         qpriv->s_flags |= RVT_S_ACK_PENDING;
                         if (qpriv->r_tid_ack == HFI1_QP_WQE_INVALID)
                                 qpriv->r_tid_ack = qpriv->r_tid_tail;
+                        hfi1_schedule_tid_send(qp);
                 }
                 goto unlock;
         }
@@ -3517,6 +3519,7 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx)
                         ret = -EAGAIN;
                         to_seg = MAX_FLOWS >> 1;
                         qpriv->s_flags |= RVT_S_ACK_PENDING;
+                        hfi1_schedule_tid_send(qp);
                         break;
                 }
 
@@ -4128,6 +4131,7 @@ void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet)
         }
 
         qp->s_flags &= ~HFI1_S_WAIT_TID_RESP;
+        hfi1_schedule_tid_send(qp);
         goto ack_done;
 
 ack_op_err:
@@ -4287,6 +4291,7 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
 
 done:
         priv->s_flags |= RVT_S_ACK_PENDING;
+        hfi1_schedule_tid_send(qp);
 exit:
         priv->r_next_psn_kdeth = flow->flow_state.r_next_psn;
         spin_unlock_irqrestore(&qp->s_lock, flags);
@@ -4299,6 +4304,7 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
                         priv->s_flags |= RVT_S_ACK_PENDING;
                         if (priv->r_tid_ack == HFI1_QP_WQE_INVALID)
                                 priv->r_tid_ack = priv->r_tid_tail;
+                        hfi1_schedule_tid_send(qp);
                 }
                 goto done;
         }
@@ -4567,6 +4573,7 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet)
                         req->cur_seg = req->ack_seg;
                         qpriv->s_tid_tail = qp->s_acked;
                         qpriv->s_state = TID_OP(WRITE_REQ);
+                        hfi1_schedule_tid_send(qp);
                 }
 done:
                 qpriv->s_retry = qp->s_retry_cnt;
@@ -4584,6 +4591,7 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet)
                         qpriv->s_tid_tail = qp->s_acked;
                         qpriv->s_state = TID_OP(WRITE_REQ);
                         qpriv->s_retry = qp->s_retry_cnt;
+                        hfi1_schedule_tid_send(qp);
                         break;
 
                 default:
@@ -4673,6 +4681,7 @@ static void hfi1_tid_retry_timeout(struct timer_list *t)
                         qp->s_flags |= HFI1_S_WAIT_HALT;
                         priv->s_state = TID_OP(RESYNC);
                         priv->s_retry--;
+                        hfi1_schedule_tid_send(qp);
                 }
         }
         spin_unlock(&qp->s_lock);
@@ -4804,6 +4813,7 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet)
         /* RESYNC request always gets a TID RDMA ACK. */
         qpriv->s_nak_state = 0;
         qpriv->s_flags |= RVT_S_ACK_PENDING;
+        hfi1_schedule_tid_send(qp);
 bail:
         spin_unlock_irqrestore(&qp->s_lock, flags);
 }
@@ -5155,3 +5165,134 @@ static int make_tid_rdma_ack(struct rvt_qp *qp,
         qpriv->s_flags &= ~RVT_S_ACK_PENDING;
         return 0;
 }
+
+static int hfi1_send_tid_ok(struct rvt_qp *qp)
+{
+        struct hfi1_qp_priv *priv = qp->priv;
+
+        return !(priv->s_flags & RVT_S_BUSY ||
+                 qp->s_flags & HFI1_S_ANY_WAIT_IO) &&
+                (verbs_txreq_queued(iowait_get_tid_work(&priv->s_iowait)) ||
+                 (priv->s_flags & RVT_S_RESP_PENDING) ||
+                 !(qp->s_flags & HFI1_S_ANY_TID_WAIT_SEND));
+}
+
+void _hfi1_do_tid_send(struct work_struct *work)
+{
+        struct iowait_work *w = container_of(work, struct iowait_work, iowork);
+        struct rvt_qp *qp = iowait_to_qp(w->iow);
+
+        hfi1_do_tid_send(qp);
+}
+
+static void hfi1_do_tid_send(struct rvt_qp *qp)
+{
+        struct hfi1_pkt_state ps;
+        struct hfi1_qp_priv *priv = qp->priv;
+
+        ps.dev = to_idev(qp->ibqp.device);
+        ps.ibp = to_iport(qp->ibqp.device, qp->port_num);
+        ps.ppd = ppd_from_ibp(ps.ibp);
+        ps.wait = iowait_get_tid_work(&priv->s_iowait);
+        ps.in_thread = false;
+        ps.timeout_int = qp->timeout_jiffies / 8;
+
+        spin_lock_irqsave(&qp->s_lock, ps.flags);
+
+        /* Return if we are already busy processing a work request. */
+        if (!hfi1_send_tid_ok(qp)) {
+                if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
+                        iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+                spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+                return;
+        }
+
+        priv->s_flags |= RVT_S_BUSY;
+
+        ps.timeout = jiffies + ps.timeout_int;
+        ps.cpu = priv->s_sde ? priv->s_sde->cpu :
+                cpumask_first(cpumask_of_node(ps.ppd->dd->node));
+        ps.pkts_sent = false;
+
+        /* insure a pre-built packet is handled */
+        ps.s_txreq = get_waiting_verbs_txreq(ps.wait);
+        do {
+                /* Check for a constructed packet to be sent. */
+                if (ps.s_txreq) {
+                        if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+                                qp->s_flags |= RVT_S_BUSY;
+                                ps.wait = iowait_get_ib_work(&priv->s_iowait);
+                        }
+                        spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+
+                        /*
+                         * If the packet cannot be sent now, return and
+                         * the send tasklet will be woken up later.
+                         */
+                        if (hfi1_verbs_send(qp, &ps))
+                                return;
+
+                        /* allow other tasks to run */
+                        if (hfi1_schedule_send_yield(qp, &ps, true))
+                                return;
+
+                        spin_lock_irqsave(&qp->s_lock, ps.flags);
+                        if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+                                qp->s_flags &= ~RVT_S_BUSY;
+                                priv->s_flags &= ~HFI1_S_TID_BUSY_SET;
+                                ps.wait = iowait_get_tid_work(&priv->s_iowait);
+                                if (iowait_flag_set(&priv->s_iowait,
+                                                    IOWAIT_PENDING_IB))
+                                        hfi1_schedule_send(qp);
+                        }
+                }
+        } while (hfi1_make_tid_rdma_pkt(qp, &ps));
+        iowait_starve_clear(ps.pkts_sent, &priv->s_iowait);
+        spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+}
+
+static bool _hfi1_schedule_tid_send(struct rvt_qp *qp)
+{
+        struct hfi1_qp_priv *priv = qp->priv;
+        struct hfi1_ibport *ibp =
+                to_iport(qp->ibqp.device, qp->port_num);
+        struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+        struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+
+        return iowait_tid_schedule(&priv->s_iowait, ppd->hfi1_wq,
+                                   priv->s_sde ?
+                                   priv->s_sde->cpu :
+                                   cpumask_first(cpumask_of_node(dd->node)));
+}
+
+/**
+ * hfi1_schedule_tid_send - schedule progress on TID RDMA state machine
+ * @qp: the QP
+ *
+ * This schedules qp progress on the TID RDMA state machine. Caller
+ * should hold the s_lock.
+ * Unlike hfi1_schedule_send(), this cannot use hfi1_send_ok() because
+ * the two state machines can step on each other with respect to the
+ * RVT_S_BUSY flag.
+ * Therefore, a modified test is used.
+ *
+ * Return: true if the second leg is scheduled;
+ *         false if the second leg is not scheduled.
+ */
+bool hfi1_schedule_tid_send(struct rvt_qp *qp)
+{
+        lockdep_assert_held(&qp->s_lock);
+        if (hfi1_send_tid_ok(qp)) {
+                /*
+                 * The following call returns true if the qp is not on the
+                 * queue and false if the qp is already on the queue before
+                 * this call. Either way, the qp will be on the queue when the
+                 * call returns.
+                 */
+                _hfi1_schedule_tid_send(qp);
+                return true;
+        }
+        if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
+                iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait,
+                                IOWAIT_PENDING_TID);
+        return false;
+}
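Aside: hfi1_do_tid_send() above follows the standard hfi1 send-engine skeleton: packets are built while holding qp->s_lock, but the lock is dropped around hfi1_verbs_send(), and that unlocked window is exactly why the HFI1_S_TID_BUSY_SET handoff between the two legs is needed. The locking skeleton in isolation, as a sketch (demo_* names are illustrative; the stubs stand in for hardware submission and packet construction):

#include <linux/spinlock.h>

struct demo_qp {
        spinlock_t lock;
        void *txreq;            /* pre-built packet, if any */
};

static int demo_submit(struct demo_qp *qp)
{
        qp->txreq = NULL;       /* hand the packet to hardware */
        return 0;               /* non-zero: engine will be requeued later */
}

static int demo_make_packet(struct demo_qp *qp)
{
        return 0;               /* non-zero: another packet was built */
}

static void demo_send_loop(struct demo_qp *qp)
{
        unsigned long flags;

        spin_lock_irqsave(&qp->lock, flags);
        do {
                if (qp->txreq) {
                        /* Hardware submission must not hold the lock. */
                        spin_unlock_irqrestore(&qp->lock, flags);
                        if (demo_submit(qp))
                                return; /* engine rescheduled on failure */
                        spin_lock_irqsave(&qp->lock, flags);
                }
        } while (demo_make_packet(qp)); /* builds the next txreq under the lock */
        spin_unlock_irqrestore(&qp->lock, flags);
}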
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h
index 0ce0ef6d60f24..7f8f17ba6c141 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.h
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.h
@@ -305,4 +305,8 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet);
 struct hfi1_pkt_state;
 int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
 
+void _hfi1_do_tid_send(struct work_struct *work);
+
+bool hfi1_schedule_tid_send(struct rvt_qp *qp);
+
 #endif /* HFI1_TID_RDMA_H */
diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h
index 3e45149d22c40..bee3d21a548ef 100644
--- a/drivers/infiniband/hw/hfi1/verbs.h
+++ b/drivers/infiniband/hw/hfi1/verbs.h
@@ -443,6 +443,9 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
                           u32 bth0, u32 bth1, u32 bth2, int middle,
                           struct hfi1_pkt_state *ps);
 
+bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+                              bool tid);
+
 void _hfi1_do_send(struct work_struct *work);
 
 void hfi1_do_send_from_rvt(struct rvt_qp *qp);
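Aside: every new hfi1_schedule_tid_send() call site in tid_rdma.c above follows the contract stated in its kernel-doc: update the TID state under qp->s_lock, then kick the second leg before dropping the lock. A hypothetical caller in the same style (not part of the patch; it assumes the driver's internal headers and mirrors the receive-path hunks above):

#include "hfi1.h"
#include "qp.h"
#include "tid_rdma.h"

static void demo_rx_tid_event(struct rvt_qp *qp)
{
        struct hfi1_qp_priv *priv = qp->priv;
        unsigned long flags;

        spin_lock_irqsave(&qp->s_lock, flags);
        priv->s_flags |= RVT_S_ACK_PENDING;     /* update TID state first */
        hfi1_schedule_tid_send(qp);             /* then kick the second leg */
        spin_unlock_irqrestore(&qp->s_lock, flags);
}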