IPoIB: Handle 4K IB MTU for UD (datagram) mode
This patch enables IPoIB to use 4K UD messages (when the underlying
device and fabric support a 4K MTU) by using two scatter buffers when
PAGE_SIZE is less than or equal to the HCA IB MTU size.  The first
buffer holds the GRH plus the IPoIB header, and the second buffer holds
the IPoIB payload, which is 4K-4 bytes.
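
To make the buffer arithmetic concrete, here is a small stand-alone
sketch (an editorial illustration, not part of the patch) that mirrors
the IPOIB_UD_* constants introduced in ipoib.h below.  The IB_GRH_BYTES
and PAGE_SIZE values are assumed here for illustration; in the kernel
they come from the IB headers and the architecture's page size.

/*
 * Editorial sketch only: shows the two-buffer receive split for a
 * 4K IB MTU on a system with 4K pages.
 */
#include <stdio.h>

#define IB_GRH_BYTES       40	/* global route header, assumed value */
#define IPOIB_ENCAP_LEN    4	/* IPoIB encapsulation header */
#define IPOIB_UD_HEAD_SIZE (IB_GRH_BYTES + IPOIB_ENCAP_LEN)
#define PAGE_SIZE          4096	/* assumed page size for illustration */

#define IPOIB_UD_MTU(ib_mtu)      ((ib_mtu) - IPOIB_ENCAP_LEN)
#define IPOIB_UD_BUF_SIZE(ib_mtu) ((ib_mtu) + IB_GRH_BYTES)

static int ipoib_ud_need_sg(unsigned int ib_mtu)
{
	/* Two S/G entries are needed once GRH + payload outgrows a page */
	return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE;
}

int main(void)
{
	unsigned int ib_mtu = 4096;	/* 4K IB MTU reported by the HCA */

	if (ipoib_ud_need_sg(ib_mtu)) {
		/* Buffer 0: 44-byte GRH + IPoIB header; buffer 1: one page */
		printf("sge[0] = %d bytes (GRH + IPoIB header)\n",
		       IPOIB_UD_HEAD_SIZE);
		printf("sge[1] = %d bytes (payload page)\n", PAGE_SIZE);
	} else {
		printf("single buffer = %u bytes\n", IPOIB_UD_BUF_SIZE(ib_mtu));
	}
	printf("netdev MTU = %u bytes (4K - 4)\n", IPOIB_UD_MTU(ib_mtu));
	return 0;
}

For a 2048-byte IB MTU, IPOIB_UD_BUF_SIZE() is 2088 bytes, which fits in
a single 4K page, so the driver keeps the old single-buffer receive path;
only a 4K IB MTU triggers the two-entry scatter list.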

Signed-off-by: Shirley Ma <xma@us.ibm.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
Shirley Ma authored and Roland Dreier committed Apr 23, 2008
commit bc7b3a3 (parent bc5698f)

Showing 6 changed files with 134 additions and 49 deletions.
20 changes: 16 additions & 4 deletions drivers/infiniband/ulp/ipoib/ipoib.h
@@ -56,11 +56,11 @@
 /* constants */
 
 enum {
-	IPOIB_PACKET_SIZE  = 2048,
-	IPOIB_BUF_SIZE     = IPOIB_PACKET_SIZE + IB_GRH_BYTES,
-
 	IPOIB_ENCAP_LEN    = 4,
 
+	IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + IPOIB_ENCAP_LEN,
+	IPOIB_UD_RX_SG     = 2, /* max buffer needed for 4K mtu */
+
 	IPOIB_CM_MTU       = 0x10000 - 0x10, /* padding to align header to 16 */
 	IPOIB_CM_BUF_SIZE  = IPOIB_CM_MTU + IPOIB_ENCAP_LEN,
 	IPOIB_CM_HEAD_SIZE = IPOIB_CM_BUF_SIZE % PAGE_SIZE,
@@ -139,7 +139,7 @@ struct ipoib_mcast {
 
 struct ipoib_rx_buf {
 	struct sk_buff *skb;
-	u64		mapping;
+	u64		mapping[IPOIB_UD_RX_SG];
 };
 
 struct ipoib_tx_buf {
@@ -294,6 +294,7 @@ struct ipoib_dev_priv {
 
 	unsigned int admin_mtu;
 	unsigned int mcast_mtu;
+	unsigned int max_ib_mtu;
 
 	struct ipoib_rx_buf *rx_ring;
 
@@ -305,6 +306,9 @@ struct ipoib_dev_priv {
 	struct ib_send_wr tx_wr;
 	unsigned	  tx_outstanding;
 
+	struct ib_recv_wr rx_wr;
+	struct ib_sge	  rx_sge[IPOIB_UD_RX_SG];
+
 	struct ib_wc ibwc[IPOIB_NUM_WC];
 
 	struct list_head dead_ahs;
@@ -366,6 +370,14 @@ struct ipoib_neigh {
 	struct list_head    list;
 };
 
+#define IPOIB_UD_MTU(ib_mtu)		(ib_mtu - IPOIB_ENCAP_LEN)
+#define IPOIB_UD_BUF_SIZE(ib_mtu)	(ib_mtu + IB_GRH_BYTES)
+
+static inline int ipoib_ud_need_sg(unsigned int ib_mtu)
+{
+	return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE;
+}
+
 /*
  * We stash a pointer to our private neighbour information after our
  * hardware address in neigh->ha. The ALIGN() expression here makes
125 changes: 88 additions & 37 deletions drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -89,44 +89,81 @@ void ipoib_free_ah(struct kref *kref)
 	spin_unlock_irqrestore(&priv->lock, flags);
 }
 
+static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv,
+				  u64 mapping[IPOIB_UD_RX_SG])
+{
+	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+		ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_UD_HEAD_SIZE,
+				    DMA_FROM_DEVICE);
+		ib_dma_unmap_page(priv->ca, mapping[1], PAGE_SIZE,
+				  DMA_FROM_DEVICE);
+	} else
+		ib_dma_unmap_single(priv->ca, mapping[0],
+				    IPOIB_UD_BUF_SIZE(priv->max_ib_mtu),
+				    DMA_FROM_DEVICE);
+}
+
+static void ipoib_ud_skb_put_frags(struct ipoib_dev_priv *priv,
+				   struct sk_buff *skb,
+				   unsigned int length)
+{
+	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[0];
+		unsigned int size;
+		/*
+		 * Only two buffers are needed for a 4K max payload;
+		 * the first buffer is IPOIB_UD_HEAD_SIZE bytes.
+		 */
+		skb->tail += IPOIB_UD_HEAD_SIZE;
+		skb->len  += length;
+
+		size = length - IPOIB_UD_HEAD_SIZE;
+
+		frag->size     = size;
+		skb->data_len += size;
+		skb->truesize += size;
+	} else
+		skb_put(skb, length);
+
+}
+
 static int ipoib_ib_post_receive(struct net_device *dev, int id)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	struct ib_sge list;
-	struct ib_recv_wr param;
 	struct ib_recv_wr *bad_wr;
 	int ret;
 
-	list.addr     = priv->rx_ring[id].mapping;
-	list.length   = IPOIB_BUF_SIZE;
-	list.lkey     = priv->mr->lkey;
+	priv->rx_wr.wr_id    = id | IPOIB_OP_RECV;
+	priv->rx_sge[0].addr = priv->rx_ring[id].mapping[0];
+	priv->rx_sge[1].addr = priv->rx_ring[id].mapping[1];
 
-	param.next    = NULL;
-	param.wr_id   = id | IPOIB_OP_RECV;
-	param.sg_list = &list;
-	param.num_sge = 1;
-
-	ret = ib_post_recv(priv->qp, &param, &bad_wr);
+	ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr);
 	if (unlikely(ret)) {
 		ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
-		ib_dma_unmap_single(priv->ca, priv->rx_ring[id].mapping,
-				    IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+		ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[id].mapping);
 		dev_kfree_skb_any(priv->rx_ring[id].skb);
 		priv->rx_ring[id].skb = NULL;
 	}
 
 	return ret;
 }
 
-static int ipoib_alloc_rx_skb(struct net_device *dev, int id)
+static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct sk_buff *skb;
-	u64 addr;
+	int buf_size;
+	u64 *mapping;
 
-	skb = dev_alloc_skb(IPOIB_BUF_SIZE + 4);
-	if (!skb)
-		return -ENOMEM;
+	if (ipoib_ud_need_sg(priv->max_ib_mtu))
+		buf_size = IPOIB_UD_HEAD_SIZE;
+	else
+		buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
+
+	skb = dev_alloc_skb(buf_size + 4);
+	if (unlikely(!skb))
+		return NULL;
 
 	/*
 	 * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte
@@ -135,17 +172,32 @@ static int ipoib_alloc_rx_skb(struct net_device *dev, int id)
 	 */
 	skb_reserve(skb, 4);
 
-	addr = ib_dma_map_single(priv->ca, skb->data, IPOIB_BUF_SIZE,
-				 DMA_FROM_DEVICE);
-	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
-		dev_kfree_skb_any(skb);
-		return -EIO;
-	}
+	mapping = priv->rx_ring[id].mapping;
+	mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size,
+				       DMA_FROM_DEVICE);
+	if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0])))
+		goto error;
 
-	priv->rx_ring[id].skb = skb;
-	priv->rx_ring[id].mapping = addr;
+	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+		struct page *page = alloc_page(GFP_ATOMIC);
+		if (!page)
+			goto partial_error;
+		skb_fill_page_desc(skb, 0, page, 0, PAGE_SIZE);
+		mapping[1] =
+			ib_dma_map_page(priv->ca, skb_shinfo(skb)->frags[0].page,
+					0, PAGE_SIZE, DMA_FROM_DEVICE);
+		if (unlikely(ib_dma_mapping_error(priv->ca, mapping[1])))
+			goto partial_error;
+	}
 
-	return 0;
+	priv->rx_ring[id].skb = skb;
+	return skb;
+
+partial_error:
+	ib_dma_unmap_single(priv->ca, mapping[0], buf_size, DMA_FROM_DEVICE);
+error:
+	dev_kfree_skb_any(skb);
+	return NULL;
 }
 
 static int ipoib_ib_post_receives(struct net_device *dev)
@@ -154,7 +206,7 @@ static int ipoib_ib_post_receives(struct net_device *dev)
 	int i;
 
 	for (i = 0; i < ipoib_recvq_size; ++i) {
-		if (ipoib_alloc_rx_skb(dev, i)) {
+		if (!ipoib_alloc_rx_skb(dev, i)) {
 			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
 			return -ENOMEM;
 		}
@@ -172,7 +224,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
 	struct sk_buff *skb;
-	u64 addr;
+	u64 mapping[IPOIB_UD_RX_SG];
 
 	ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
 		       wr_id, wc->status);
@@ -184,15 +236,13 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 	}
 
 	skb  = priv->rx_ring[wr_id].skb;
-	addr = priv->rx_ring[wr_id].mapping;
 
 	if (unlikely(wc->status != IB_WC_SUCCESS)) {
 		if (wc->status != IB_WC_WR_FLUSH_ERR)
 			ipoib_warn(priv, "failed recv event "
 				   "(status=%d, wrid=%d vend_err %x)\n",
 				   wc->status, wr_id, wc->vendor_err);
-		ib_dma_unmap_single(priv->ca, addr,
-				    IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+		ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping);
 		dev_kfree_skb_any(skb);
 		priv->rx_ring[wr_id].skb = NULL;
 		return;
@@ -205,21 +255,24 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 	if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num)
 		goto repost;
 
+	memcpy(mapping, priv->rx_ring[wr_id].mapping,
+	       IPOIB_UD_RX_SG * sizeof *mapping);
+
 	/*
 	 * If we can't allocate a new RX buffer, dump
 	 * this packet and reuse the old buffer.
 	 */
-	if (unlikely(ipoib_alloc_rx_skb(dev, wr_id))) {
+	if (unlikely(!ipoib_alloc_rx_skb(dev, wr_id))) {
 		++dev->stats.rx_dropped;
 		goto repost;
 	}
 
 	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
 		       wc->byte_len, wc->slid);
 
-	ib_dma_unmap_single(priv->ca, addr, IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+	ipoib_ud_dma_unmap_rx(priv, mapping);
+	ipoib_ud_skb_put_frags(priv, skb, wc->byte_len);
 
-	skb_put(skb, wc->byte_len);
 	skb_pull(skb, IB_GRH_BYTES);
 
 	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
@@ -733,10 +786,8 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush)
 			rx_req = &priv->rx_ring[i];
 			if (!rx_req->skb)
 				continue;
-			ib_dma_unmap_single(priv->ca,
-					    rx_req->mapping,
-					    IPOIB_BUF_SIZE,
-					    DMA_FROM_DEVICE);
+			ipoib_ud_dma_unmap_rx(priv,
+					      priv->rx_ring[i].mapping);
 			dev_kfree_skb_any(rx_req->skb);
 			rx_req->skb = NULL;
 		}
19 changes: 14 additions & 5 deletions drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -195,7 +195,7 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
 		return 0;
 	}
 
-	if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN)
+	if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
 		return -EINVAL;
 
 	priv->admin_mtu = new_mtu;
@@ -971,10 +971,6 @@ static void ipoib_setup(struct net_device *dev)
 			 NETIF_F_LLTX |
 			 NETIF_F_HIGHDMA);
 
-	/* MTU will be reset when mcast join happens */
-	dev->mtu = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN;
-	priv->mcast_mtu = priv->admin_mtu = dev->mtu;
-
 	memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
 
 	netif_carrier_off(dev);
@@ -1107,6 +1103,7 @@ static struct net_device *ipoib_add_port(const char *format,
 {
 	struct ipoib_dev_priv *priv;
 	struct ib_device_attr *device_attr;
+	struct ib_port_attr attr;
 	int result = -ENOMEM;
 
 	priv = ipoib_intf_alloc(format);
@@ -1115,6 +1112,18 @@
 
 	SET_NETDEV_DEV(priv->dev, hca->dma_device);
 
+	if (!ib_query_port(hca, port, &attr))
+		priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
+	else {
+		printk(KERN_WARNING "%s: ib_query_port %d failed\n",
+		       hca->name, port);
+		goto device_init_failed;
+	}
+
+	/* MTU will be reset when mcast join happens */
+	priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
+	priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu;
+
 	result = ib_query_pkey(hca, port, 0, &priv->pkey);
 	if (result) {
 		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
3 changes: 1 addition & 2 deletions drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -567,8 +567,7 @@ void ipoib_mcast_join_task(struct work_struct *work)
 		return;
 	}
 
-	priv->mcast_mtu = ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu) -
-		IPOIB_ENCAP_LEN;
+	priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
 
 	if (!ipoib_cm_admin_enabled(dev))
 		dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
15 changes: 14 additions & 1 deletion drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -150,7 +150,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 		.max_send_wr  = ipoib_sendq_size,
 		.max_recv_wr  = ipoib_recvq_size,
 		.max_send_sge = 1,
-		.max_recv_sge = 1
+		.max_recv_sge = IPOIB_UD_RX_SG
 	},
 	.sq_sig_type = IB_SIGNAL_ALL_WR,
 	.qp_type     = IB_QPT_UD
@@ -215,6 +215,19 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 	priv->tx_wr.sg_list    = priv->tx_sge;
 	priv->tx_wr.send_flags = IB_SEND_SIGNALED;
 
+	priv->rx_sge[0].lkey = priv->mr->lkey;
+	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+		priv->rx_sge[0].length = IPOIB_UD_HEAD_SIZE;
+		priv->rx_sge[1].length = PAGE_SIZE;
+		priv->rx_sge[1].lkey = priv->mr->lkey;
+		priv->rx_wr.num_sge = IPOIB_UD_RX_SG;
+	} else {
+		priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
+		priv->rx_wr.num_sge = 1;
+	}
+	priv->rx_wr.next = NULL;
+	priv->rx_wr.sg_list = priv->rx_sge;
+
 	return 0;
 
 out_free_cq:
1 change: 1 addition & 0 deletions drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -89,6 +89,7 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)
 		goto err;
 	}
 
+	priv->max_ib_mtu = ppriv->max_ib_mtu;
 	set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
 
 	priv->pkey = pkey;
