Skip to content

Commit

Permalink
IPoIB/cm: Use common CQ for CM send completions
Browse files Browse the repository at this point in the history
Use the same CQ for CM send completions as for all other IPoIB
completions.  This means all completions are processed via the same
NAPI polling routine.  This should help reduce the number of
interrupts for bi-directional traffic (such as TCP) and fixes "driver
is hogging interrupts" errors reported for IPoIB send side, e.g.
<https://bugs.openfabrics.org/show_bug.cgi?id=508>

To do this, keep a per-interface counter of outstanding send WRs, and
stop the interface when this counter reaches the send queue size to
avoid CQ overruns.

Signed-off-by: Michael S. Tsirkin <mst@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
  • Loading branch information
Michael S. Tsirkin authored and Roland Dreier committed Oct 20, 2007
1 parent cbfb50e commit 1b52496
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 95 deletions.
15 changes: 9 additions & 6 deletions drivers/infiniband/ulp/ipoib/ipoib.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,8 @@ enum {
IPOIB_MCAST_RUN = 6,
IPOIB_STOP_REAPER = 7,
IPOIB_MCAST_STARTED = 8,
IPOIB_FLAG_NETIF_STOPPED = 9,
IPOIB_FLAG_ADMIN_CM = 10,
IPOIB_FLAG_UMCAST = 11,
IPOIB_FLAG_ADMIN_CM = 9,
IPOIB_FLAG_UMCAST = 10,

IPOIB_MAX_BACKOFF_SECONDS = 16,

Expand All @@ -98,9 +97,9 @@ enum {

#define IPOIB_OP_RECV (1ul << 31)
#ifdef CONFIG_INFINIBAND_IPOIB_CM
#define IPOIB_CM_OP_SRQ (1ul << 30)
#define IPOIB_OP_CM (1ul << 30)
#else
#define IPOIB_CM_OP_SRQ (0)
#define IPOIB_OP_CM (0)
#endif

/* structs */
Expand Down Expand Up @@ -197,7 +196,6 @@ struct ipoib_cm_rx {

struct ipoib_cm_tx {
struct ib_cm_id *id;
struct ib_cq *cq;
struct ib_qp *qp;
struct list_head list;
struct net_device *dev;
Expand Down Expand Up @@ -294,6 +292,7 @@ struct ipoib_dev_priv {
unsigned tx_tail;
struct ib_sge tx_sge;
struct ib_send_wr tx_wr;
unsigned tx_outstanding;

struct ib_wc ibwc[IPOIB_NUM_WC];

Expand Down Expand Up @@ -502,6 +501,7 @@ void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx);
void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
unsigned int mtu);
void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc);
void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc);
#else

struct ipoib_cm_tx;
Expand Down Expand Up @@ -590,6 +590,9 @@ static inline void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *w
{
}

static inline void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
{
}
#endif

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
Expand Down
112 changes: 45 additions & 67 deletions drivers/infiniband/ulp/ipoib/ipoib_cm.c
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ static int ipoib_cm_post_receive(struct net_device *dev, int id)
struct ib_recv_wr *bad_wr;
int i, ret;

priv->cm.rx_wr.wr_id = id | IPOIB_CM_OP_SRQ;
priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;

for (i = 0; i < IPOIB_CM_RX_SG; ++i)
priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];
Expand Down Expand Up @@ -401,7 +401,7 @@ static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
unsigned int wr_id = wc->wr_id & ~IPOIB_CM_OP_SRQ;
unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV);
struct sk_buff *skb, *newskb;
struct ipoib_cm_rx *p;
unsigned long flags;
Expand All @@ -412,7 +412,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
wr_id, wc->status);

if (unlikely(wr_id >= ipoib_recvq_size)) {
if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~IPOIB_CM_OP_SRQ)) {
if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) {
spin_lock_irqsave(&priv->lock, flags);
list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
ipoib_cm_start_rx_drain(priv);
Expand Down Expand Up @@ -498,7 +498,7 @@ static inline int post_send(struct ipoib_dev_priv *priv,
priv->tx_sge.addr = addr;
priv->tx_sge.length = len;

priv->tx_wr.wr_id = wr_id;
priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM;

return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
}
Expand Down Expand Up @@ -549,20 +549,19 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
dev->trans_start = jiffies;
++tx->tx_head;

if (tx->tx_head - tx->tx_tail == ipoib_sendq_size) {
if (++priv->tx_outstanding == ipoib_sendq_size) {
ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
tx->qp->qp_num);
netif_stop_queue(dev);
set_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags);
}
}
}

static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx,
struct ib_wc *wc)
void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
unsigned int wr_id = wc->wr_id;
struct ipoib_cm_tx *tx = wc->qp->qp_context;
unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
struct ipoib_tx_buf *tx_req;
unsigned long flags;

Expand All @@ -587,11 +586,10 @@ static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx

spin_lock_irqsave(&priv->tx_lock, flags);
++tx->tx_tail;
if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags)) &&
tx->tx_head - tx->tx_tail <= ipoib_sendq_size >> 1) {
clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags);
if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
netif_queue_stopped(dev) &&
test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
netif_wake_queue(dev);
}

if (wc->status != IB_WC_SUCCESS &&
wc->status != IB_WC_WR_FLUSH_ERR) {
Expand All @@ -614,11 +612,6 @@ static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx
tx->neigh = NULL;
}

/* queue would be re-started anyway when TX is destroyed,
* but it makes sense to do it ASAP here. */
if (test_and_clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags))
netif_wake_queue(dev);

if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
list_move(&tx->list, &priv->cm.reap_list);
queue_work(ipoib_workqueue, &priv->cm.reap_task);
Expand All @@ -632,19 +625,6 @@ static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx
spin_unlock_irqrestore(&priv->tx_lock, flags);
}

static void ipoib_cm_tx_completion(struct ib_cq *cq, void *tx_ptr)
{
struct ipoib_cm_tx *tx = tx_ptr;
int n, i;

ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
do {
n = ib_poll_cq(cq, IPOIB_NUM_WC, tx->ibwc);
for (i = 0; i < n; ++i)
ipoib_cm_handle_tx_wc(tx->dev, tx, tx->ibwc + i);
} while (n == IPOIB_NUM_WC);
}

int ipoib_cm_dev_open(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
Expand Down Expand Up @@ -807,17 +787,18 @@ static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even
return 0;
}

static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ib_cq *cq)
static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_cm_tx *tx)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ib_qp_init_attr attr = {
.send_cq = cq,
.send_cq = priv->cq,
.recv_cq = priv->cq,
.srq = priv->cm.srq,
.cap.max_send_wr = ipoib_sendq_size,
.cap.max_send_sge = 1,
.sq_sig_type = IB_SIGNAL_ALL_WR,
.qp_type = IB_QPT_RC,
.qp_context = tx
};

return ib_create_qp(priv->pd, &attr);
Expand Down Expand Up @@ -899,21 +880,7 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
goto err_tx;
}

p->cq = ib_create_cq(priv->ca, ipoib_cm_tx_completion, NULL, p,
ipoib_sendq_size + 1, 0);
if (IS_ERR(p->cq)) {
ret = PTR_ERR(p->cq);
ipoib_warn(priv, "failed to allocate tx cq: %d\n", ret);
goto err_cq;
}

ret = ib_req_notify_cq(p->cq, IB_CQ_NEXT_COMP);
if (ret) {
ipoib_warn(priv, "failed to request completion notification: %d\n", ret);
goto err_req_notify;
}

p->qp = ipoib_cm_create_tx_qp(p->dev, p->cq);
p->qp = ipoib_cm_create_tx_qp(p->dev, p);
if (IS_ERR(p->qp)) {
ret = PTR_ERR(p->qp);
ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret);
Expand Down Expand Up @@ -950,12 +917,8 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
err_id:
p->id = NULL;
ib_destroy_qp(p->qp);
err_req_notify:
err_qp:
p->qp = NULL;
ib_destroy_cq(p->cq);
err_cq:
p->cq = NULL;
err_tx:
return ret;
}
Expand All @@ -964,34 +927,49 @@ static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
{
struct ipoib_dev_priv *priv = netdev_priv(p->dev);
struct ipoib_tx_buf *tx_req;
unsigned long flags;
unsigned long begin;

ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail);

if (p->id)
ib_destroy_cm_id(p->id);

if (p->qp)
ib_destroy_qp(p->qp);

if (p->cq)
ib_destroy_cq(p->cq);

if (test_bit(IPOIB_FLAG_NETIF_STOPPED, &p->flags))
netif_wake_queue(p->dev);

if (p->tx_ring) {
/* Wait for all sends to complete */
begin = jiffies;
while ((int) p->tx_tail - (int) p->tx_head < 0) {
tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
DMA_TO_DEVICE);
dev_kfree_skb_any(tx_req->skb);
++p->tx_tail;
if (time_after(jiffies, begin + 5 * HZ)) {
ipoib_warn(priv, "timing out; %d sends not completed\n",
p->tx_head - p->tx_tail);
goto timeout;
}

msleep(1);
}
}

kfree(p->tx_ring);
timeout:

while ((int) p->tx_tail - (int) p->tx_head < 0) {
tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
DMA_TO_DEVICE);
dev_kfree_skb_any(tx_req->skb);
++p->tx_tail;
spin_lock_irqsave(&priv->tx_lock, flags);
if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
netif_queue_stopped(p->dev) &&
test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
netif_wake_queue(p->dev);
spin_unlock_irqrestore(&priv->tx_lock, flags);
}

if (p->qp)
ib_destroy_qp(p->qp);

kfree(p->tx_ring);
kfree(p);
}

Expand Down
46 changes: 27 additions & 19 deletions drivers/infiniband/ulp/ipoib/ipoib_ib.c
Original file line number Diff line number Diff line change
Expand Up @@ -267,11 +267,10 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)

spin_lock_irqsave(&priv->tx_lock, flags);
++priv->tx_tail;
if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags)) &&
priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) {
clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
netif_queue_stopped(dev) &&
test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
netif_wake_queue(dev);
}
spin_unlock_irqrestore(&priv->tx_lock, flags);

if (wc->status != IB_WC_SUCCESS &&
Expand Down Expand Up @@ -301,14 +300,18 @@ int ipoib_poll(struct napi_struct *napi, int budget)
for (i = 0; i < n; i++) {
struct ib_wc *wc = priv->ibwc + i;

if (wc->wr_id & IPOIB_CM_OP_SRQ) {
++done;
ipoib_cm_handle_rx_wc(dev, wc);
} else if (wc->wr_id & IPOIB_OP_RECV) {
if (wc->wr_id & IPOIB_OP_RECV) {
++done;
ipoib_ib_handle_rx_wc(dev, wc);
} else
ipoib_ib_handle_tx_wc(dev, wc);
if (wc->wr_id & IPOIB_OP_CM)
ipoib_cm_handle_rx_wc(dev, wc);
else
ipoib_ib_handle_rx_wc(dev, wc);
} else {
if (wc->wr_id & IPOIB_OP_CM)
ipoib_cm_handle_tx_wc(dev, wc);
else
ipoib_ib_handle_tx_wc(dev, wc);
}
}

if (n != t)
Expand Down Expand Up @@ -401,10 +404,9 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
address->last_send = priv->tx_head;
++priv->tx_head;

if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) {
if (++priv->tx_outstanding == ipoib_sendq_size) {
ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
netif_stop_queue(dev);
set_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
}
}
}
Expand Down Expand Up @@ -563,12 +565,17 @@ void ipoib_drain_cq(struct net_device *dev)
if (priv->ibwc[i].status == IB_WC_SUCCESS)
priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR;

if (priv->ibwc[i].wr_id & IPOIB_CM_OP_SRQ)
ipoib_cm_handle_rx_wc(dev, priv->ibwc + i);
else if (priv->ibwc[i].wr_id & IPOIB_OP_RECV)
ipoib_ib_handle_rx_wc(dev, priv->ibwc + i);
else
ipoib_ib_handle_tx_wc(dev, priv->ibwc + i);
if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) {
if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
ipoib_cm_handle_rx_wc(dev, priv->ibwc + i);
else
ipoib_ib_handle_rx_wc(dev, priv->ibwc + i);
} else {
if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
ipoib_cm_handle_tx_wc(dev, priv->ibwc + i);
else
ipoib_ib_handle_tx_wc(dev, priv->ibwc + i);
}
}
} while (n == IPOIB_NUM_WC);
}
Expand Down Expand Up @@ -614,6 +621,7 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush)
DMA_TO_DEVICE);
dev_kfree_skb_any(tx_req->skb);
++priv->tx_tail;
--priv->tx_outstanding;
}

for (i = 0; i < ipoib_recvq_size; ++i) {
Expand Down
4 changes: 1 addition & 3 deletions drivers/infiniband/ulp/ipoib/ipoib_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,6 @@ static int ipoib_stop(struct net_device *dev)

netif_stop_queue(dev);

clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);

/*
* Now flush workqueue to make sure a scheduled task doesn't
* bring our internal state back up.
Expand Down Expand Up @@ -895,7 +893,7 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
goto out_rx_ring_cleanup;
}

/* priv->tx_head & tx_tail are already 0 */
/* priv->tx_head, tx_tail & tx_outstanding are already 0 */

if (ipoib_ib_dev_init(dev, ca, port))
goto out_tx_ring_cleanup;
Expand Down

0 comments on commit 1b52496

Please sign in to comment.