From 7235aa79f683db0d908dcb0c2b7062dfdd765196 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 21 Nov 2012 15:47:40 +0000 Subject: [PATCH 01/33] UAPI: (Scripted) Disintegrate include/rdma Signed-off-by: David Howells Acked-by: Arnd Bergmann Acked-by: Thomas Gleixner Acked-by: Michael Kerrisk Acked-by: Paul E. McKenney Acked-by: Dave Jones Signed-off-by: Roland Dreier --- include/rdma/Kbuild | 6 ---- include/rdma/rdma_netlink.h | 36 +----------------------- include/uapi/rdma/Kbuild | 6 ++++ include/{ => uapi}/rdma/ib_user_cm.h | 0 include/{ => uapi}/rdma/ib_user_mad.h | 0 include/{ => uapi}/rdma/ib_user_sa.h | 0 include/{ => uapi}/rdma/ib_user_verbs.h | 0 include/uapi/rdma/rdma_netlink.h | 37 +++++++++++++++++++++++++ include/{ => uapi}/rdma/rdma_user_cm.h | 0 9 files changed, 44 insertions(+), 41 deletions(-) rename include/{ => uapi}/rdma/ib_user_cm.h (100%) rename include/{ => uapi}/rdma/ib_user_mad.h (100%) rename include/{ => uapi}/rdma/ib_user_sa.h (100%) rename include/{ => uapi}/rdma/ib_user_verbs.h (100%) create mode 100644 include/uapi/rdma/rdma_netlink.h rename include/{ => uapi}/rdma/rdma_user_cm.h (100%) diff --git a/include/rdma/Kbuild b/include/rdma/Kbuild index ea56f76c0c220..e69de29bb2d1d 100644 --- a/include/rdma/Kbuild +++ b/include/rdma/Kbuild @@ -1,6 +0,0 @@ -header-y += ib_user_cm.h -header-y += ib_user_mad.h -header-y += ib_user_sa.h -header-y += ib_user_verbs.h -header-y += rdma_netlink.h -header-y += rdma_user_cm.h diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index bd3d8b24b4202..e38de79eeb48f 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -1,41 +1,9 @@ #ifndef _RDMA_NETLINK_H #define _RDMA_NETLINK_H -#include - -enum { - RDMA_NL_RDMA_CM = 1 -}; - -#define RDMA_NL_GET_CLIENT(type) ((type & (((1 << 6) - 1) << 10)) >> 10) -#define RDMA_NL_GET_OP(type) (type & ((1 << 10) - 1)) -#define RDMA_NL_GET_TYPE(client, op) ((client << 10) + op) - -enum { - RDMA_NL_RDMA_CM_ID_STATS = 0, - RDMA_NL_RDMA_CM_NUM_OPS -}; - -enum { - RDMA_NL_RDMA_CM_ATTR_SRC_ADDR = 1, - RDMA_NL_RDMA_CM_ATTR_DST_ADDR, - RDMA_NL_RDMA_CM_NUM_ATTR, -}; - -struct rdma_cm_id_stats { - __u32 qp_num; - __u32 bound_dev_if; - __u32 port_space; - __s32 pid; - __u8 cm_state; - __u8 node_type; - __u8 port_num; - __u8 qp_type; -}; - -#ifdef __KERNEL__ #include +#include struct ibnl_client_cbs { int (*dump)(struct sk_buff *skb, struct netlink_callback *nlcb); @@ -88,6 +56,4 @@ void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq, int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh, int len, void *data, int type); -#endif /* __KERNEL__ */ - #endif /* _RDMA_NETLINK_H */ diff --git a/include/uapi/rdma/Kbuild b/include/uapi/rdma/Kbuild index aafaa5aa54d46..687ae332200f9 100644 --- a/include/uapi/rdma/Kbuild +++ b/include/uapi/rdma/Kbuild @@ -1 +1,7 @@ # UAPI Header export list +header-y += ib_user_cm.h +header-y += ib_user_mad.h +header-y += ib_user_sa.h +header-y += ib_user_verbs.h +header-y += rdma_netlink.h +header-y += rdma_user_cm.h diff --git a/include/rdma/ib_user_cm.h b/include/uapi/rdma/ib_user_cm.h similarity index 100% rename from include/rdma/ib_user_cm.h rename to include/uapi/rdma/ib_user_cm.h diff --git a/include/rdma/ib_user_mad.h b/include/uapi/rdma/ib_user_mad.h similarity index 100% rename from include/rdma/ib_user_mad.h rename to include/uapi/rdma/ib_user_mad.h diff --git a/include/rdma/ib_user_sa.h b/include/uapi/rdma/ib_user_sa.h similarity index 100% rename from include/rdma/ib_user_sa.h rename to include/uapi/rdma/ib_user_sa.h diff --git a/include/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h similarity index 100% rename from include/rdma/ib_user_verbs.h rename to include/uapi/rdma/ib_user_verbs.h diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h new file mode 100644 index 0000000000000..8297285b62886 --- /dev/null +++ b/include/uapi/rdma/rdma_netlink.h @@ -0,0 +1,37 @@ +#ifndef _UAPI_RDMA_NETLINK_H +#define _UAPI_RDMA_NETLINK_H + +#include + +enum { + RDMA_NL_RDMA_CM = 1 +}; + +#define RDMA_NL_GET_CLIENT(type) ((type & (((1 << 6) - 1) << 10)) >> 10) +#define RDMA_NL_GET_OP(type) (type & ((1 << 10) - 1)) +#define RDMA_NL_GET_TYPE(client, op) ((client << 10) + op) + +enum { + RDMA_NL_RDMA_CM_ID_STATS = 0, + RDMA_NL_RDMA_CM_NUM_OPS +}; + +enum { + RDMA_NL_RDMA_CM_ATTR_SRC_ADDR = 1, + RDMA_NL_RDMA_CM_ATTR_DST_ADDR, + RDMA_NL_RDMA_CM_NUM_ATTR, +}; + +struct rdma_cm_id_stats { + __u32 qp_num; + __u32 bound_dev_if; + __u32 port_space; + __s32 pid; + __u8 cm_state; + __u8 node_type; + __u8 port_num; + __u8 qp_type; +}; + + +#endif /* _UAPI_RDMA_NETLINK_H */ diff --git a/include/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h similarity index 100% rename from include/rdma/rdma_user_cm.h rename to include/uapi/rdma/rdma_user_cm.h From 3127e4ea54fc023e35adb1d9af29d49c6d582d12 Mon Sep 17 00:00:00 2001 From: Tatyana Nikolova Date: Fri, 2 Nov 2012 23:17:34 +0000 Subject: [PATCH 02/33] RDMA/nes: Fix incorrect address of IP header Fix for incorrect ip header address when forwarding fpdus to hardware. Signed-off-by: Tatyana Nikolova Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_mgt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/nes/nes_mgt.c b/drivers/infiniband/hw/nes/nes_mgt.c index 3ba7be3694520..8cf74fd0c44fa 100644 --- a/drivers/infiniband/hw/nes/nes_mgt.c +++ b/drivers/infiniband/hw/nes/nes_mgt.c @@ -447,7 +447,7 @@ static int forward_fpdus(struct nes_vnic *nesvnic, struct nes_qp *nesqp) set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX, lower_32_bits(u64tmp)); set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_HIGH_IDX, - upper_32_bits(u64tmp >> 32)); + upper_32_bits(u64tmp)); set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX, lower_32_bits(fpdu_info->frags[0].physaddr)); From bff3976bef4f917554292d48ce43636f3d040182 Mon Sep 17 00:00:00 2001 From: Tatyana Nikolova Date: Fri, 2 Nov 2012 23:17:45 +0000 Subject: [PATCH 03/33] RDMA/nes: Fix for unlinking skbs from empty list Signed-off-by: Tatyana Nikolova Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_mgt.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_mgt.c b/drivers/infiniband/hw/nes/nes_mgt.c index 8cf74fd0c44fa..1f5d69e7793a4 100644 --- a/drivers/infiniband/hw/nes/nes_mgt.c +++ b/drivers/infiniband/hw/nes/nes_mgt.c @@ -210,6 +210,9 @@ static struct sk_buff *nes_get_next_skb(struct nes_device *nesdev, struct nes_qp } while (1) { + if (skb_queue_empty(&nesqp->pau_list)) + goto out; + seq = nes_get_seq(skb, ack, wnd, fin_rcvd, rst_rcvd); if (seq == nextseq) { if (skb->len || processacks) @@ -218,14 +221,13 @@ static struct sk_buff *nes_get_next_skb(struct nes_device *nesdev, struct nes_qp goto out; } - if (skb->next == (struct sk_buff *)&nesqp->pau_list) - goto out; - old_skb = skb; skb = skb->next; skb_unlink(old_skb, &nesqp->pau_list); nes_mgt_free_skb(nesdev, old_skb, PCI_DMA_TODEVICE); nes_rem_ref_cm_node(nesqp->cm_node); + if (skb == (struct sk_buff *)&nesqp->pau_list) + goto out; } return skb; @@ -384,7 +386,8 @@ static int get_fpdu_info(struct nes_device *nesdev, struct nes_qp *nesqp, if (frags[i].skb->len == 0) { /* Pull skb off the list - it will be freed in the callback */ spin_lock_irqsave(&nesqp->pau_lock, flags); - skb_unlink(frags[i].skb, &nesqp->pau_list); + if (!skb_queue_empty(&nesqp->pau_list)) + skb_unlink(frags[i].skb, &nesqp->pau_list); spin_unlock_irqrestore(&nesqp->pau_lock, flags); } else { /* Last skb still has data so update the seq */ From fc8d7547b1e19e1bb8f3206837ee0c7c6538e2e5 Mon Sep 17 00:00:00 2001 From: Tatyana Nikolova Date: Fri, 2 Nov 2012 23:17:54 +0000 Subject: [PATCH 04/33] RDMA/nes: Fix for sending fpdus in order to hardware Locking fix to prevent race conditions. Fpdus (per qp) need to be forwarded to hardware in the order of their sequence numbers. Signed-off-by: Tatyana Nikolova Signed-off-by: Donald Wood Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_mgt.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_mgt.c b/drivers/infiniband/hw/nes/nes_mgt.c index 1f5d69e7793a4..4d6d77fd18b6b 100644 --- a/drivers/infiniband/hw/nes/nes_mgt.c +++ b/drivers/infiniband/hw/nes/nes_mgt.c @@ -247,7 +247,6 @@ static int get_fpdu_info(struct nes_device *nesdev, struct nes_qp *nesqp, struct nes_rskb_cb *cb; struct pau_fpdu_info *fpdu_info = NULL; struct pau_fpdu_frag frags[MAX_FPDU_FRAGS]; - unsigned long flags; u32 fpdu_len = 0; u32 tmp_len; int frag_cnt = 0; @@ -262,12 +261,10 @@ static int get_fpdu_info(struct nes_device *nesdev, struct nes_qp *nesqp, *pau_fpdu_info = NULL; - spin_lock_irqsave(&nesqp->pau_lock, flags); skb = nes_get_next_skb(nesdev, nesqp, NULL, nesqp->pau_rcv_nxt, &ack, &wnd, &fin_rcvd, &rst_rcvd); - if (!skb) { - spin_unlock_irqrestore(&nesqp->pau_lock, flags); + if (!skb) goto out; - } + cb = (struct nes_rskb_cb *)&skb->cb[0]; if (skb->len) { fpdu_len = be16_to_cpu(*(__be16 *) skb->data) + MPA_FRAMING; @@ -292,10 +289,9 @@ static int get_fpdu_info(struct nes_device *nesdev, struct nes_qp *nesqp, skb = nes_get_next_skb(nesdev, nesqp, skb, nesqp->pau_rcv_nxt + frag_tot, &ack, &wnd, &fin_rcvd, &rst_rcvd); - if (!skb) { - spin_unlock_irqrestore(&nesqp->pau_lock, flags); + if (!skb) goto out; - } else if (rst_rcvd) { + if (rst_rcvd) { /* rst received in the middle of fpdu */ for (; i >= 0; i--) { skb_unlink(frags[i].skb, &nesqp->pau_list); @@ -322,8 +318,6 @@ static int get_fpdu_info(struct nes_device *nesdev, struct nes_qp *nesqp, frag_cnt = 1; } - spin_unlock_irqrestore(&nesqp->pau_lock, flags); - /* Found one */ fpdu_info = kzalloc(sizeof(*fpdu_info), GFP_ATOMIC); if (fpdu_info == NULL) { @@ -385,10 +379,8 @@ static int get_fpdu_info(struct nes_device *nesdev, struct nes_qp *nesqp, if (frags[i].skb->len == 0) { /* Pull skb off the list - it will be freed in the callback */ - spin_lock_irqsave(&nesqp->pau_lock, flags); if (!skb_queue_empty(&nesqp->pau_list)) skb_unlink(frags[i].skb, &nesqp->pau_list); - spin_unlock_irqrestore(&nesqp->pau_lock, flags); } else { /* Last skb still has data so update the seq */ iph = (struct iphdr *)(cb->data_start + ETH_HLEN); @@ -417,14 +409,18 @@ static int forward_fpdus(struct nes_vnic *nesvnic, struct nes_qp *nesqp) struct pau_fpdu_info *fpdu_info; struct nes_hw_cqp_wqe *cqp_wqe; struct nes_cqp_request *cqp_request; + unsigned long flags; u64 u64tmp; u32 u32tmp; int rc; while (1) { + spin_lock_irqsave(&nesqp->pau_lock, flags); rc = get_fpdu_info(nesdev, nesqp, &fpdu_info); - if (fpdu_info == NULL) + if (rc || (fpdu_info == NULL)) { + spin_unlock_irqrestore(&nesqp->pau_lock, flags); return rc; + } cqp_request = fpdu_info->cqp_request; cqp_wqe = &cqp_request->cqp_wqe; @@ -478,6 +474,7 @@ static int forward_fpdus(struct nes_vnic *nesvnic, struct nes_qp *nesqp) atomic_set(&cqp_request->refcount, 1); nes_post_cqp_request(nesdev, cqp_request); + spin_unlock_irqrestore(&nesqp->pau_lock, flags); } return 0; From cecdcd5f24be8c532ad8dcbbd93c7b477cfd3413 Mon Sep 17 00:00:00 2001 From: Tatyana Nikolova Date: Tue, 13 Nov 2012 22:20:41 +0000 Subject: [PATCH 05/33] RDMA/nes: Fix for incorrect multicast address in the perfect filter table Signed-off-by: Tatyana Nikolova Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_nic.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c index 0564be757d827..9542e1644a5c1 100644 --- a/drivers/infiniband/hw/nes/nes_nic.c +++ b/drivers/infiniband/hw/nes/nes_nic.c @@ -944,12 +944,13 @@ static void nes_netdev_set_multicast_list(struct net_device *netdev) addr, perfect_filter_register_address+(mc_index * 8), mc_nic_index); - macaddr_high = ((u16) addr[0]) << 8; - macaddr_high += (u16) addr[1]; - macaddr_low = ((u32) addr[2]) << 24; - macaddr_low += ((u32) addr[3]) << 16; - macaddr_low += ((u32) addr[4]) << 8; - macaddr_low += (u32) addr[5]; + macaddr_high = ((u8) addr[0]) << 8; + macaddr_high += (u8) addr[1]; + macaddr_low = ((u8) addr[2]) << 24; + macaddr_low += ((u8) addr[3]) << 16; + macaddr_low += ((u8) addr[4]) << 8; + macaddr_low += (u8) addr[5]; + nes_write_indexed(nesdev, perfect_filter_register_address+(mc_index * 8), macaddr_low); From 079abea6a37fd3b4f7e1b7cf9e4d055463988753 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 3 Nov 2012 10:58:37 +0000 Subject: [PATCH 06/33] RDMA/nes: Use WARN() Use WARN() rather than printk() followed by WARN_ON(1), for conciseness. A simplified version of the semantic patch that makes this transformation is as follows: (http://coccinelle.lip6.fr/) // @@ expression list es; @@ -printk( +WARN(1, es); -WARN_ON(1); // Signed-off-by: Julia Lawall [ Remove extra KERN_ERR from WARN() format. - Roland ] Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 8 +++----- drivers/infiniband/hw/nes/nes_mgt.c | 6 ++---- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index cfaacaf6bf5f9..feb41e74206b2 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -629,11 +629,9 @@ static void build_rdma0_msg(struct nes_cm_node *cm_node, struct nes_qp **nesqp_a case SEND_RDMA_READ_ZERO: default: - if (cm_node->send_rdma0_op != SEND_RDMA_READ_ZERO) { - printk(KERN_ERR "%s[%u]: Unsupported RDMA0 len operation=%u\n", - __func__, __LINE__, cm_node->send_rdma0_op); - WARN_ON(1); - } + if (cm_node->send_rdma0_op != SEND_RDMA_READ_ZERO) + WARN(1, "Unsupported RDMA0 len operation=%u\n", + cm_node->send_rdma0_op); nes_debug(NES_DBG_CM, "Sending first rdma operation.\n"); wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = cpu_to_le32(NES_IWARP_SQ_OP_RDMAR); diff --git a/drivers/infiniband/hw/nes/nes_mgt.c b/drivers/infiniband/hw/nes/nes_mgt.c index 4d6d77fd18b6b..416645259b0f8 100644 --- a/drivers/infiniband/hw/nes/nes_mgt.c +++ b/drivers/infiniband/hw/nes/nes_mgt.c @@ -649,11 +649,9 @@ static void nes_chg_qh_handler(struct nes_device *nesdev, struct nes_cqp_request nesqp = qh_chg->nesqp; /* Should we handle the bad completion */ - if (cqp_request->major_code) { - printk(KERN_ERR PFX "Invalid cqp_request major_code=0x%x\n", + if (cqp_request->major_code) + WARN(1, PFX "Invalid cqp_request major_code=0x%x\n", cqp_request->major_code); - WARN_ON(1); - } switch (nesqp->pau_state) { case PAU_DEL_QH: From 5390f86796a1f444ca1a7ba7315951e26acd958d Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 25 Oct 2012 14:27:07 +0000 Subject: [PATCH 07/33] IB/ipath: Remove unreachable code Signed-off-by: Alan Cox Acked-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_init_chip.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/infiniband/hw/ipath/ipath_init_chip.c b/drivers/infiniband/hw/ipath/ipath_init_chip.c index 49b09c697c7c8..be2a60e142b00 100644 --- a/drivers/infiniband/hw/ipath/ipath_init_chip.c +++ b/drivers/infiniband/hw/ipath/ipath_init_chip.c @@ -718,16 +718,6 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) if (ret) goto done; - /* - * we ignore most issues after reporting them, but have to specially - * handle hardware-disabled chips. - */ - if (ret == 2) { - /* unique error, known to ipath_init_one */ - ret = -EPERM; - goto done; - } - /* * We could bump this to allow for full rcvegrcnt + rcvtidcnt, * but then it no longer nicely fits power of two, and since From c9795bd708e37ed4154e838a1f4576192eeeacca Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 25 Oct 2012 14:36:51 +0000 Subject: [PATCH 08/33] RDMA/amsol1100: Fix missing break Signed-off-by: Alan Cox Signed-off-by: Roland Dreier --- drivers/infiniband/hw/amso1100/c2_ae.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/amso1100/c2_ae.c b/drivers/infiniband/hw/amso1100/c2_ae.c index 32d34e88d5cf7..706cf97cbe8f4 100644 --- a/drivers/infiniband/hw/amso1100/c2_ae.c +++ b/drivers/infiniband/hw/amso1100/c2_ae.c @@ -311,6 +311,7 @@ void c2_ae_event(struct c2_dev *c2dev, u32 mq_index) if (cq->ibcq.event_handler) cq->ibcq.event_handler(&ib_event, cq->ibcq.cq_context); + break; } default: From 08ff32352d6ff7083533dc1c25618d42f92ec28e Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Sun, 21 Oct 2012 14:59:24 +0000 Subject: [PATCH 09/33] mlx4: 64-byte CQE/EQE support ConnectX-3 devices can use either 64- or 32-byte completion queue entries (CQEs) and event queue entries (EQEs). Using 64-byte EQEs/CQEs performs better because each entry is aligned to a complete cacheline. This patch queries the HCA's capabilities, and if it supports 64-byte CQEs and EQES the driver will configure the HW to work in 64-byte mode. The 32-byte vs 64-byte mode is global per HCA and not per CQ or EQ. Since this mode is global, userspace (libmlx4) must be updated to work with the configured CQE size, and guests using SR-IOV virtual functions need to know both EQE and CQE size. In case one of the 64-byte CQE/EQE capabilities is activated, the patch makes sure that older guest drivers that use the QUERY_DEV_FUNC command (e.g as done in mlx4_core of Linux 3.3..3.6) will notice that they need an update to be able to work with the PPF. This is done by changing the returned pf_context_behaviour not to be zero any more. In case none of these capabilities is activated that value remains zero and older guest drivers can run OK. The SRIOV related flow is as follows 1. the PPF does the detection of the new capabilities using QUERY_DEV_CAP command. 2. the PPF activates the new capabilities using INIT_HCA. 3. the VF detects if the PPF activated the capabilities using QUERY_HCA, and if this is the case activates them for itself too. Note that the VF detects that it must be aware to the new PF behaviour using QUERY_FUNC_CAP. Steps 1 and 2 apply also for native mode. User space notification is done through a new field introduced in struct mlx4_ib_ucontext which holds device capabilities for which user space must take action. This changes the binary interface so the ABI towards libmlx4 exposed through uverbs is bumped from 3 to 4 but only when **needed** i.e. only when the driver does use 64-byte CQEs or future device capabilities which must be in sync by user space. This practice allows to work with unmodified libmlx4 on older devices (e.g A0, B0) which don't support 64-byte CQEs. In order to keep existing systems functional when they update to a newer kernel that contains these changes in VF and userspace ABI, a module parameter enable_64b_cqe_eqe must be set to enable 64-byte mode; the default is currently false. Signed-off-by: Eli Cohen Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/cq.c | 34 +++++++++++++---- drivers/infiniband/hw/mlx4/main.c | 27 ++++++++++--- drivers/infiniband/hw/mlx4/mlx4_ib.h | 1 + drivers/infiniband/hw/mlx4/user.h | 12 +++++- drivers/net/ethernet/mellanox/mlx4/cmd.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_cq.c | 2 +- .../net/ethernet/mellanox/mlx4/en_netdev.c | 1 + drivers/net/ethernet/mellanox/mlx4/en_rx.c | 5 ++- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 5 ++- drivers/net/ethernet/mellanox/mlx4/eq.c | 26 ++++++++----- drivers/net/ethernet/mellanox/mlx4/fw.c | 30 ++++++++++++++- drivers/net/ethernet/mellanox/mlx4/fw.h | 1 + drivers/net/ethernet/mellanox/mlx4/main.c | 38 ++++++++++++++++++- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 1 + include/linux/mlx4/device.h | 21 ++++++++++ 15 files changed, 175 insertions(+), 31 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index c9eb6a6815ce2..ae67df35dd4d6 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -66,7 +66,7 @@ static void mlx4_ib_cq_event(struct mlx4_cq *cq, enum mlx4_event type) static void *get_cqe_from_buf(struct mlx4_ib_cq_buf *buf, int n) { - return mlx4_buf_offset(&buf->buf, n * sizeof (struct mlx4_cqe)); + return mlx4_buf_offset(&buf->buf, n * buf->entry_size); } static void *get_cqe(struct mlx4_ib_cq *cq, int n) @@ -77,8 +77,9 @@ static void *get_cqe(struct mlx4_ib_cq *cq, int n) static void *get_sw_cqe(struct mlx4_ib_cq *cq, int n) { struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibcq.cqe); + struct mlx4_cqe *tcqe = ((cq->buf.entry_size == 64) ? (cqe + 1) : cqe); - return (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ + return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ !!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe; } @@ -99,12 +100,13 @@ static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf * { int err; - err = mlx4_buf_alloc(dev->dev, nent * sizeof(struct mlx4_cqe), + err = mlx4_buf_alloc(dev->dev, nent * dev->dev->caps.cqe_size, PAGE_SIZE * 2, &buf->buf); if (err) goto out; + buf->entry_size = dev->dev->caps.cqe_size; err = mlx4_mtt_init(dev->dev, buf->buf.npages, buf->buf.page_shift, &buf->mtt); if (err) @@ -120,8 +122,7 @@ static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf * mlx4_mtt_cleanup(dev->dev, &buf->mtt); err_buf: - mlx4_buf_free(dev->dev, nent * sizeof(struct mlx4_cqe), - &buf->buf); + mlx4_buf_free(dev->dev, nent * buf->entry_size, &buf->buf); out: return err; @@ -129,7 +130,7 @@ static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf * static void mlx4_ib_free_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int cqe) { - mlx4_buf_free(dev->dev, (cqe + 1) * sizeof(struct mlx4_cqe), &buf->buf); + mlx4_buf_free(dev->dev, (cqe + 1) * buf->entry_size, &buf->buf); } static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *context, @@ -137,8 +138,9 @@ static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *cont u64 buf_addr, int cqe) { int err; + int cqe_size = dev->dev->caps.cqe_size; - *umem = ib_umem_get(context, buf_addr, cqe * sizeof (struct mlx4_cqe), + *umem = ib_umem_get(context, buf_addr, cqe * cqe_size, IB_ACCESS_LOCAL_WRITE, 1); if (IS_ERR(*umem)) return PTR_ERR(*umem); @@ -331,16 +333,23 @@ static void mlx4_ib_cq_resize_copy_cqes(struct mlx4_ib_cq *cq) { struct mlx4_cqe *cqe, *new_cqe; int i; + int cqe_size = cq->buf.entry_size; + int cqe_inc = cqe_size == 64 ? 1 : 0; i = cq->mcq.cons_index; cqe = get_cqe(cq, i & cq->ibcq.cqe); + cqe += cqe_inc; + while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) { new_cqe = get_cqe_from_buf(&cq->resize_buf->buf, (i + 1) & cq->resize_buf->cqe); - memcpy(new_cqe, get_cqe(cq, i & cq->ibcq.cqe), sizeof(struct mlx4_cqe)); + memcpy(new_cqe, get_cqe(cq, i & cq->ibcq.cqe), cqe_size); + new_cqe += cqe_inc; + new_cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) | (((i + 1) & (cq->resize_buf->cqe + 1)) ? MLX4_CQE_OWNER_MASK : 0); cqe = get_cqe(cq, ++i & cq->ibcq.cqe); + cqe += cqe_inc; } ++cq->mcq.cons_index; } @@ -438,6 +447,7 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) out: mutex_unlock(&cq->resize_mutex); + return err; } @@ -586,6 +596,9 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, if (!cqe) return -EAGAIN; + if (cq->buf.entry_size == 64) + cqe++; + ++cq->mcq.cons_index; /* @@ -807,6 +820,7 @@ void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq) int nfreed = 0; struct mlx4_cqe *cqe, *dest; u8 owner_bit; + int cqe_inc = cq->buf.entry_size == 64 ? 1 : 0; /* * First we need to find the current producer index, so we @@ -825,12 +839,16 @@ void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq) */ while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) { cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); + cqe += cqe_inc; + if ((be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) { if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) mlx4_ib_free_srq_wqe(srq, be16_to_cpu(cqe->wqe_index)); ++nfreed; } else if (nfreed) { dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe); + dest += cqe_inc; + owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK; memcpy(dest, cqe, sizeof *cqe); dest->owner_sr_opcode = owner_bit | diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 718ec6b2bad24..e7d81c0d1ac59 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -563,15 +563,24 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, { struct mlx4_ib_dev *dev = to_mdev(ibdev); struct mlx4_ib_ucontext *context; + struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3; struct mlx4_ib_alloc_ucontext_resp resp; int err; if (!dev->ib_active) return ERR_PTR(-EAGAIN); - resp.qp_tab_size = dev->dev->caps.num_qps; - resp.bf_reg_size = dev->dev->caps.bf_reg_size; - resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; + if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) { + resp_v3.qp_tab_size = dev->dev->caps.num_qps; + resp_v3.bf_reg_size = dev->dev->caps.bf_reg_size; + resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; + } else { + resp.dev_caps = dev->dev->caps.userspace_caps; + resp.qp_tab_size = dev->dev->caps.num_qps; + resp.bf_reg_size = dev->dev->caps.bf_reg_size; + resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; + resp.cqe_size = dev->dev->caps.cqe_size; + } context = kmalloc(sizeof *context, GFP_KERNEL); if (!context) @@ -586,7 +595,11 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); - err = ib_copy_to_udata(udata, &resp, sizeof resp); + if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) + err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3)); + else + err = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (err) { mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar); kfree(context); @@ -1342,7 +1355,11 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; ibdev->ib_dev.dma_device = &dev->pdev->dev; - ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; + if (dev->caps.userspace_caps) + ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; + else + ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION; + ibdev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index e04cbc9a54a53..dcd845bc30f03 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -90,6 +90,7 @@ struct mlx4_ib_xrcd { struct mlx4_ib_cq_buf { struct mlx4_buf buf; struct mlx4_mtt mtt; + int entry_size; }; struct mlx4_ib_cq_resize { diff --git a/drivers/infiniband/hw/mlx4/user.h b/drivers/infiniband/hw/mlx4/user.h index 13beedeeef9f7..07e6769ef43bc 100644 --- a/drivers/infiniband/hw/mlx4/user.h +++ b/drivers/infiniband/hw/mlx4/user.h @@ -40,7 +40,9 @@ * Increment this value if any changes that break userspace ABI * compatibility are made. */ -#define MLX4_IB_UVERBS_ABI_VERSION 3 + +#define MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION 3 +#define MLX4_IB_UVERBS_ABI_VERSION 4 /* * Make sure that all structs defined in this file remain laid out so @@ -50,10 +52,18 @@ * instead. */ +struct mlx4_ib_alloc_ucontext_resp_v3 { + __u32 qp_tab_size; + __u16 bf_reg_size; + __u16 bf_regs_per_page; +}; + struct mlx4_ib_alloc_ucontext_resp { + __u32 dev_caps; __u32 qp_tab_size; __u16 bf_reg_size; __u16 bf_regs_per_page; + __u32 cqe_size; }; struct mlx4_ib_alloc_pd_resp { diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 3d1899ff1076e..e791e705f7b11 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -1755,7 +1755,7 @@ int mlx4_multi_func_init(struct mlx4_dev *dev) spin_lock_init(&s_state->lock); } - memset(&priv->mfunc.master.cmd_eqe, 0, sizeof(struct mlx4_eqe)); + memset(&priv->mfunc.master.cmd_eqe, 0, dev->caps.eqe_size); priv->mfunc.master.cmd_eqe.type = MLX4_EVENT_TYPE_CMD; INIT_WORK(&priv->mfunc.master.comm_work, mlx4_master_comm_channel); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c index aa9c2f6cf3c0e..b8d0854a7ad1e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c @@ -51,7 +51,7 @@ int mlx4_en_create_cq(struct mlx4_en_priv *priv, int err; cq->size = entries; - cq->buf_size = cq->size * sizeof(struct mlx4_cqe); + cq->buf_size = cq->size * mdev->dev->caps.cqe_size; cq->ring = ring; cq->is_tx = mode; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index edd9cb8d3e1d8..93a3256695825 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -1600,6 +1600,7 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, goto out; } priv->rx_ring_num = prof->rx_ring_num; + priv->cqe_factor = (mdev->dev->caps.cqe_size == 64) ? 1 : 0; priv->mac_index = -1; priv->msg_enable = MLX4_EN_MSG_LEVEL; spin_lock_init(&priv->stats_lock); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 5aba5ecdf1e28..6fa106f6c0ecf 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -566,6 +566,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud struct ethhdr *ethh; dma_addr_t dma; u64 s_mac; + int factor = priv->cqe_factor; if (!priv->port_up) return 0; @@ -574,7 +575,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud * descriptor offset can be deduced from the CQE index instead of * reading 'cqe->index' */ index = cq->mcq.cons_index & ring->size_mask; - cqe = &cq->buf[index]; + cqe = &cq->buf[(index << factor) + factor]; /* Process all completed CQEs */ while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK, @@ -709,7 +710,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud ++cq->mcq.cons_index; index = (cq->mcq.cons_index) & ring->size_mask; - cqe = &cq->buf[index]; + cqe = &cq->buf[(index << factor) + factor]; if (++polled == budget) { /* We are here because we reached the NAPI budget - * flush only pending LRO sessions */ diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index b35094c590ba9..25c157abdd921 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -315,12 +315,13 @@ static void mlx4_en_process_tx_cq(struct net_device *dev, struct mlx4_en_cq *cq) struct mlx4_cqe *buf = cq->buf; u32 packets = 0; u32 bytes = 0; + int factor = priv->cqe_factor; if (!priv->port_up) return; index = cons_index & size_mask; - cqe = &buf[index]; + cqe = &buf[(index << factor) + factor]; ring_index = ring->cons & size_mask; /* Process all completed CQEs */ @@ -349,7 +350,7 @@ static void mlx4_en_process_tx_cq(struct net_device *dev, struct mlx4_en_cq *cq) ++cons_index; index = cons_index & size_mask; - cqe = &buf[index]; + cqe = &buf[(index << factor) + factor]; } diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c index b84a88bc44dc2..c509a86db6101 100644 --- a/drivers/net/ethernet/mellanox/mlx4/eq.c +++ b/drivers/net/ethernet/mellanox/mlx4/eq.c @@ -101,15 +101,21 @@ static void eq_set_ci(struct mlx4_eq *eq, int req_not) mb(); } -static struct mlx4_eqe *get_eqe(struct mlx4_eq *eq, u32 entry) +static struct mlx4_eqe *get_eqe(struct mlx4_eq *eq, u32 entry, u8 eqe_factor) { - unsigned long off = (entry & (eq->nent - 1)) * MLX4_EQ_ENTRY_SIZE; - return eq->page_list[off / PAGE_SIZE].buf + off % PAGE_SIZE; + /* (entry & (eq->nent - 1)) gives us a cyclic array */ + unsigned long offset = (entry & (eq->nent - 1)) * (MLX4_EQ_ENTRY_SIZE << eqe_factor); + /* CX3 is capable of extending the EQE from 32 to 64 bytes. + * When this feature is enabled, the first (in the lower addresses) + * 32 bytes in the 64 byte EQE are reserved and the next 32 bytes + * contain the legacy EQE information. + */ + return eq->page_list[offset / PAGE_SIZE].buf + (offset + (eqe_factor ? MLX4_EQ_ENTRY_SIZE : 0)) % PAGE_SIZE; } -static struct mlx4_eqe *next_eqe_sw(struct mlx4_eq *eq) +static struct mlx4_eqe *next_eqe_sw(struct mlx4_eq *eq, u8 eqe_factor) { - struct mlx4_eqe *eqe = get_eqe(eq, eq->cons_index); + struct mlx4_eqe *eqe = get_eqe(eq, eq->cons_index, eqe_factor); return !!(eqe->owner & 0x80) ^ !!(eq->cons_index & eq->nent) ? NULL : eqe; } @@ -177,7 +183,7 @@ static void slave_event(struct mlx4_dev *dev, u8 slave, struct mlx4_eqe *eqe) return; } - memcpy(s_eqe, eqe, sizeof(struct mlx4_eqe) - 1); + memcpy(s_eqe, eqe, dev->caps.eqe_size - 1); s_eqe->slave_id = slave; /* ensure all information is written before setting the ownersip bit */ wmb(); @@ -441,7 +447,7 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) int i; enum slave_port_gen_event gen_event; - while ((eqe = next_eqe_sw(eq))) { + while ((eqe = next_eqe_sw(eq, dev->caps.eqe_factor))) { /* * Make sure we read EQ entry contents after we've * checked the ownership bit. @@ -864,7 +870,8 @@ static int mlx4_create_eq(struct mlx4_dev *dev, int nent, eq->dev = dev; eq->nent = roundup_pow_of_two(max(nent, 2)); - npages = PAGE_ALIGN(eq->nent * MLX4_EQ_ENTRY_SIZE) / PAGE_SIZE; + /* CX3 is capable of extending the CQE/EQE from 32 to 64 bytes */ + npages = PAGE_ALIGN(eq->nent * (MLX4_EQ_ENTRY_SIZE << dev->caps.eqe_factor)) / PAGE_SIZE; eq->page_list = kmalloc(npages * sizeof *eq->page_list, GFP_KERNEL); @@ -966,8 +973,9 @@ static void mlx4_free_eq(struct mlx4_dev *dev, struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_cmd_mailbox *mailbox; int err; - int npages = PAGE_ALIGN(MLX4_EQ_ENTRY_SIZE * eq->nent) / PAGE_SIZE; int i; + /* CX3 is capable of extending the CQE/EQE from 32 to 64 bytes */ + int npages = PAGE_ALIGN((MLX4_EQ_ENTRY_SIZE << dev->caps.eqe_factor) * eq->nent) / PAGE_SIZE; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 4f30b99324cf1..9a9de51ecc91f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -110,6 +110,8 @@ static void dump_dev_cap_flags(struct mlx4_dev *dev, u64 flags) [42] = "Multicast VEP steering support", [48] = "Counters support", [59] = "Port management change event support", + [61] = "64 byte EQE support", + [62] = "64 byte CQE support", }; int i; @@ -235,7 +237,7 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, field = dev->caps.num_ports; MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_NUM_PORTS_OFFSET); - size = 0; /* no PF behaviour is set for now */ + size = dev->caps.function_caps; /* set PF behaviours */ MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_PF_BHVR_OFFSET); field = 0; /* protected FMR support not available as yet */ @@ -1237,6 +1239,24 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) if (dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS) *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 4); + /* CX3 is capable of extending CQEs/EQEs from 32 to 64 bytes */ + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_64B_EQE) { + *(inbox + INIT_HCA_EQE_CQE_OFFSETS / 4) |= cpu_to_be32(1 << 29); + dev->caps.eqe_size = 64; + dev->caps.eqe_factor = 1; + } else { + dev->caps.eqe_size = 32; + dev->caps.eqe_factor = 0; + } + + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_64B_CQE) { + *(inbox + INIT_HCA_EQE_CQE_OFFSETS / 4) |= cpu_to_be32(1 << 30); + dev->caps.cqe_size = 64; + dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_64B_CQE; + } else { + dev->caps.cqe_size = 32; + } + /* QPC/EEC/CQC/EQC/RDMARC attributes */ MLX4_PUT(inbox, param->qpc_base, INIT_HCA_QPC_BASE_OFFSET); @@ -1319,6 +1339,7 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox; __be32 *outbox; int err; + u8 byte_field; #define QUERY_HCA_GLOBAL_CAPS_OFFSET 0x04 @@ -1370,6 +1391,13 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET); } + /* CX3 is capable of extending CQEs/EQEs from 32 to 64 bytes */ + MLX4_GET(byte_field, outbox, INIT_HCA_EQE_CQE_OFFSETS); + if (byte_field & 0x20) /* 64-bytes eqe enabled */ + param->dev_cap_enabled |= MLX4_DEV_CAP_64B_EQE_ENABLED; + if (byte_field & 0x40) /* 64-bytes cqe enabled */ + param->dev_cap_enabled |= MLX4_DEV_CAP_64B_CQE_ENABLED; + /* TPT attributes */ MLX4_GET(param->dmpt_base, outbox, INIT_HCA_DMPT_BASE_OFFSET); diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h index 85abe9c11a226..2c2e7ade2a34c 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.h +++ b/drivers/net/ethernet/mellanox/mlx4/fw.h @@ -172,6 +172,7 @@ struct mlx4_init_hca_param { u8 log_uar_sz; u8 uar_page_sz; /* log pg sz in 4k chunks */ u8 fs_hash_enable_bits; + u64 dev_cap_enabled; }; struct mlx4_init_ib_param { diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 2aa80afd98d20..4337f685175de 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -95,8 +95,14 @@ MODULE_PARM_DESC(log_num_mgm_entry_size, "log mgm size, that defines the num" " Not in use with device managed" " flow steering"); +static bool enable_64b_cqe_eqe; +module_param(enable_64b_cqe_eqe, bool, 0444); +MODULE_PARM_DESC(enable_64b_cqe_eqe, + "Enable 64 byte CQEs/EQEs when the the FW supports this"); + #define HCA_GLOBAL_CAP_MASK 0 -#define PF_CONTEXT_BEHAVIOUR_MASK 0 + +#define PF_CONTEXT_BEHAVIOUR_MASK MLX4_FUNC_CAP_64B_EQE_CQE static char mlx4_version[] __devinitdata = DRV_NAME ": Mellanox ConnectX core driver v" @@ -386,6 +392,21 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH]; dev->caps.sqp_demux = (mlx4_is_master(dev)) ? MLX4_MAX_NUM_SLAVES : 0; + + if (!enable_64b_cqe_eqe) { + if (dev_cap->flags & + (MLX4_DEV_CAP_FLAG_64B_CQE | MLX4_DEV_CAP_FLAG_64B_EQE)) { + mlx4_warn(dev, "64B EQEs/CQEs supported by the device but not enabled\n"); + dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_CQE; + dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_EQE; + } + } + + if ((dev_cap->flags & + (MLX4_DEV_CAP_FLAG_64B_CQE | MLX4_DEV_CAP_FLAG_64B_EQE)) && + mlx4_is_master(dev)) + dev->caps.function_caps |= MLX4_FUNC_CAP_64B_EQE_CQE; + return 0; } /*The function checks if there are live vf, return the num of them*/ @@ -599,6 +620,21 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) goto err_mem; } + if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_64B_EQE_ENABLED) { + dev->caps.eqe_size = 64; + dev->caps.eqe_factor = 1; + } else { + dev->caps.eqe_size = 32; + dev->caps.eqe_factor = 0; + } + + if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_64B_CQE_ENABLED) { + dev->caps.cqe_size = 64; + dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_64B_CQE; + } else { + dev->caps.cqe_size = 32; + } + return 0; err_mem: diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index 9d27e42264e2b..73b5c2ac5bd54 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -487,6 +487,7 @@ struct mlx4_en_priv { int mac_index; unsigned max_mtu; int base_qpn; + int cqe_factor; struct mlx4_en_rss_map rss_map; __be32 ctrl_flags; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 6d1acb04cd17c..21821da2abfd3 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -142,6 +142,8 @@ enum { MLX4_DEV_CAP_FLAG_COUNTERS = 1LL << 48, MLX4_DEV_CAP_FLAG_SENSE_SUPPORT = 1LL << 55, MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV = 1LL << 59, + MLX4_DEV_CAP_FLAG_64B_EQE = 1LL << 61, + MLX4_DEV_CAP_FLAG_64B_CQE = 1LL << 62 }; enum { @@ -151,6 +153,20 @@ enum { MLX4_DEV_CAP_FLAG2_FS_EN = 1LL << 3 }; +enum { + MLX4_DEV_CAP_64B_EQE_ENABLED = 1LL << 0, + MLX4_DEV_CAP_64B_CQE_ENABLED = 1LL << 1 +}; + +enum { + MLX4_USER_DEV_CAP_64B_CQE = 1L << 0 +}; + +enum { + MLX4_FUNC_CAP_64B_EQE_CQE = 1L << 0 +}; + + #define MLX4_ATTR_EXTENDED_PORT_INFO cpu_to_be16(0xff90) enum { @@ -419,6 +435,11 @@ struct mlx4_caps { u32 max_counters; u8 port_ib_mtu[MLX4_MAX_PORTS + 1]; u16 sqp_demux; + u32 eqe_size; + u32 cqe_size; + u8 eqe_factor; + u32 userspace_caps; /* userspace must be aware of these */ + u32 function_caps; /* VFs must be aware of these */ }; struct mlx4_buf_list { From 76f267b7ad978fec755f74ada4020edcb7493657 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 3 Nov 2012 10:58:27 +0000 Subject: [PATCH 10/33] RDMA/cxgb4: use WARN Use WARN rather than printk followed by WARN_ON(1), for conciseness. A simplified version of the semantic patch that makes this transformation is as follows: (http://coccinelle.lip6.fr/) // @@ expression list es; @@ -printk( +WARN(1, es); -WARN_ON(1); // Signed-off-by: Julia Lawall Acked-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb4/cm.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 6cfd4d8fd0bd8..5de86968379d2 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -151,9 +151,8 @@ static void stop_ep_timer(struct c4iw_ep *ep) { PDBG("%s ep %p\n", __func__, ep); if (!timer_pending(&ep->timer)) { - printk(KERN_ERR "%s timer stopped when its not running! " + WARN(1, "%s timer stopped when its not running! " "ep %p state %u\n", __func__, ep, ep->com.state); - WARN_ON(1); return; } del_timer_sync(&ep->timer); @@ -2551,9 +2550,8 @@ static void process_timeout(struct c4iw_ep *ep) __state_set(&ep->com, ABORTING); break; default: - printk(KERN_ERR "%s unexpected state ep %p tid %u state %u\n", + WARN(1, "%s unexpected state ep %p tid %u state %u\n", __func__, ep, ep->hwtid, ep->com.state); - WARN_ON(1); abort = 0; } mutex_unlock(&ep->com.mutex); From 5107c2a3d117de8219a53e622c0f1f1bc3f7d1ae Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 3 Nov 2012 10:58:29 +0000 Subject: [PATCH 11/33] RDMA/cxgb3: use WARN Use WARN rather than printk followed by WARN_ON(1), for conciseness. A simplified version of the semantic patch that makes this transformation is as follows: (http://coccinelle.lip6.fr/) // @@ expression list es; @@ -printk( +WARN(1, es); -WARN_ON(1); // Signed-off-by: Julia Lawall Acked-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb3/iwch_cm.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c index aaf88ef9409cd..3e094cd6a0e34 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -128,9 +128,8 @@ static void stop_ep_timer(struct iwch_ep *ep) { PDBG("%s ep %p\n", __func__, ep); if (!timer_pending(&ep->timer)) { - printk(KERN_ERR "%s timer stopped when its not running! ep %p state %u\n", + WARN(1, "%s timer stopped when its not running! ep %p state %u\n", __func__, ep, ep->com.state); - WARN_ON(1); return; } del_timer_sync(&ep->timer); @@ -1756,9 +1755,8 @@ static void ep_timeout(unsigned long arg) __state_set(&ep->com, ABORTING); break; default: - printk(KERN_ERR "%s unexpected state ep %p state %u\n", + WARN(1, "%s unexpected state ep %p state %u\n", __func__, ep, ep->com.state); - WARN_ON(1); abort = 0; } spin_unlock_irqrestore(&ep->com.lock, flags); From ceb7decb366591e9b67d70832e07f5d240572a3d Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Tue, 27 Nov 2012 16:24:29 +0000 Subject: [PATCH 12/33] IB/mlx4: Fix spinlock order to avoid lockdep warnings lockdep warns about taking a hard-irq-unsafe lock (sriov->id_map_lock) inside a hard-irq-safe lock (sriov->going_down_lock). Since id_map_lock is never taken in the interrupt context, we can simply reverse the order of taking the two spinlocks, thus avoiding the warning and the depencency. Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Cc: Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/cm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c index 80079e5a2e30f..dbc99d41605cf 100644 --- a/drivers/infiniband/hw/mlx4/cm.c +++ b/drivers/infiniband/hw/mlx4/cm.c @@ -268,15 +268,15 @@ static void schedule_delayed(struct ib_device *ibdev, struct id_map_entry *id) struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; unsigned long flags; - spin_lock_irqsave(&sriov->going_down_lock, flags); spin_lock(&sriov->id_map_lock); + spin_lock_irqsave(&sriov->going_down_lock, flags); /*make sure that there is no schedule inside the scheduled work.*/ if (!sriov->is_going_down) { id->scheduled_delete = 1; schedule_delayed_work(&id->timeout, CM_CLEANUP_CACHE_TIMEOUT); } - spin_unlock(&sriov->id_map_lock); spin_unlock_irqrestore(&sriov->going_down_lock, flags); + spin_unlock(&sriov->id_map_lock); } int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id, From 311f813a2daefcba03f706a692fe0c67888d7622 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Tue, 27 Nov 2012 16:24:30 +0000 Subject: [PATCH 13/33] mlx4_core: Fix potential deadlock in mlx4_eq_int() The slave_state_lock spinlock is used in both interrupt context and process context, hence irq locking must be used. Found by lockdep. Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Cc: Signed-off-by: Roland Dreier --- drivers/net/ethernet/mellanox/mlx4/cmd.c | 9 +++++---- drivers/net/ethernet/mellanox/mlx4/eq.c | 10 ++++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index e791e705f7b11..fdc5f23d8e9f2 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -1498,6 +1498,7 @@ static void mlx4_master_do_cmd(struct mlx4_dev *dev, int slave, u8 cmd, u32 reply; u8 is_going_down = 0; int i; + unsigned long flags; slave_state[slave].comm_toggle ^= 1; reply = (u32) slave_state[slave].comm_toggle << 31; @@ -1576,12 +1577,12 @@ static void mlx4_master_do_cmd(struct mlx4_dev *dev, int slave, u8 cmd, mlx4_warn(dev, "Bad comm cmd:%d from slave:%d\n", cmd, slave); goto reset_slave; } - spin_lock(&priv->mfunc.master.slave_state_lock); + spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags); if (!slave_state[slave].is_slave_going_down) slave_state[slave].last_cmd = cmd; else is_going_down = 1; - spin_unlock(&priv->mfunc.master.slave_state_lock); + spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags); if (is_going_down) { mlx4_warn(dev, "Slave is going down aborting command(%d)" " executing from slave:%d\n", @@ -1597,10 +1598,10 @@ static void mlx4_master_do_cmd(struct mlx4_dev *dev, int slave, u8 cmd, reset_slave: /* cleanup any slave resources */ mlx4_delete_all_resources_for_slave(dev, slave); - spin_lock(&priv->mfunc.master.slave_state_lock); + spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags); if (!slave_state[slave].is_slave_going_down) slave_state[slave].last_cmd = MLX4_COMM_CMD_RESET; - spin_unlock(&priv->mfunc.master.slave_state_lock); + spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags); /*with slave in the middle of flr, no need to clean resources again.*/ inform_slave_state: memset(&slave_state[slave].event_eq, 0, diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c index c509a86db6101..3cb7727408e5b 100644 --- a/drivers/net/ethernet/mellanox/mlx4/eq.c +++ b/drivers/net/ethernet/mellanox/mlx4/eq.c @@ -407,6 +407,7 @@ void mlx4_master_handle_slave_flr(struct work_struct *work) struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state; int i; int err; + unsigned long flags; mlx4_dbg(dev, "mlx4_handle_slave_flr\n"); @@ -418,10 +419,10 @@ void mlx4_master_handle_slave_flr(struct work_struct *work) mlx4_delete_all_resources_for_slave(dev, i); /*return the slave to running mode*/ - spin_lock(&priv->mfunc.master.slave_state_lock); + spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags); slave_state[i].last_cmd = MLX4_COMM_CMD_RESET; slave_state[i].is_slave_going_down = 0; - spin_unlock(&priv->mfunc.master.slave_state_lock); + spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags); /*notify the FW:*/ err = mlx4_cmd(dev, 0, i, 0, MLX4_CMD_INFORM_FLR_DONE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); @@ -446,6 +447,7 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) u8 update_slave_state; int i; enum slave_port_gen_event gen_event; + unsigned long flags; while ((eqe = next_eqe_sw(eq, dev->caps.eqe_factor))) { /* @@ -653,13 +655,13 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) } else update_slave_state = 1; - spin_lock(&priv->mfunc.master.slave_state_lock); + spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags); if (update_slave_state) { priv->mfunc.master.slave_state[flr_slave].active = false; priv->mfunc.master.slave_state[flr_slave].last_cmd = MLX4_COMM_CMD_FLR; priv->mfunc.master.slave_state[flr_slave].is_slave_going_down = 1; } - spin_unlock(&priv->mfunc.master.slave_state_lock); + spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags); queue_work(priv->mfunc.master.comm_wq, &priv->mfunc.master.slave_flr_event_work); break; From 63f05be2c075022d1b3227037f82ad75c4d5ab1d Mon Sep 17 00:00:00 2001 From: shefty Date: Wed, 28 Nov 2012 20:39:52 +0000 Subject: [PATCH 14/33] RDMA/cm: Change return value from find_gid_port() Problem reported by Dan Carpenter : The patch 3c86aa70bf67: "RDMA/cm: Add RDMA CM support for IBoE devices" from Oct 13, 2010, leads to the following warning: net/sunrpc/xprtrdma/svc_rdma_transport.c:722 svc_rdma_create() error: passing non neg 1 to ERR_PTR This bug would result in a NULL dereference. svc_rdma_create() is supposed to return ERR_PTRs or valid pointers, but instead it returns ERR_PTRs, valid pointers and 1. The call tree is: svc_rdma_create() => rdma_bind_addr() => cma_acquire_dev() => find_gid_port() rdma_bind_addr() should return a valid errno. Fix this by having find_gid_port() also return a valid errno. If we can't find the specified GID on a given port, return -EADDRNOTAVAIL, rather than -EAGAIN, to better indicate the error. We also drop using the special return value of '1' and instead pass through the error returned by the underlying verbs call. On such errors, rather than aborting the search, we simply continue to check the next device/port. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index a7568c34a1aa6..d789eea321687 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -345,17 +345,17 @@ static int find_gid_port(struct ib_device *device, union ib_gid *gid, u8 port_nu err = ib_query_port(device, port_num, &props); if (err) - return 1; + return err; for (i = 0; i < props.gid_tbl_len; ++i) { err = ib_query_gid(device, port_num, i, &tmp); if (err) - return 1; + return err; if (!memcmp(&tmp, gid, sizeof tmp)) return 0; } - return -EAGAIN; + return -EADDRNOTAVAIL; } static int cma_acquire_dev(struct rdma_id_private *id_priv) @@ -388,8 +388,7 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv) if (!ret) { id_priv->id.port_num = port; goto out; - } else if (ret == 1) - break; + } } } } From c9b03c1ae55decc721310b79d8f50d44fbb37dc7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sat, 3 Sep 2011 09:34:48 +0200 Subject: [PATCH 15/33] IB/srp: Increase block layer timeout Increase the block layer timeout for disks so that it is above the InfiniBand transport layer timeout. Signed-off-by: Bart Van Assche Acked-by: David Dillow Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 45 +++++++++++++++++++++++++++++ drivers/infiniband/ulp/srp/ib_srp.h | 2 ++ 2 files changed, 47 insertions(+) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 922d845f76b0a..5aa70e96ec902 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -1419,6 +1419,33 @@ static int srp_alloc_iu_bufs(struct srp_target_port *target) return -ENOMEM; } +static uint32_t srp_compute_rq_tmo(struct ib_qp_attr *qp_attr, int attr_mask) +{ + uint64_t T_tr_ns, max_compl_time_ms; + uint32_t rq_tmo_jiffies; + + /* + * According to section 11.2.4.2 in the IBTA spec (Modify Queue Pair, + * table 91), both the QP timeout and the retry count have to be set + * for RC QP's during the RTR to RTS transition. + */ + WARN_ON_ONCE((attr_mask & (IB_QP_TIMEOUT | IB_QP_RETRY_CNT)) != + (IB_QP_TIMEOUT | IB_QP_RETRY_CNT)); + + /* + * Set target->rq_tmo_jiffies to one second more than the largest time + * it can take before an error completion is generated. See also + * C9-140..142 in the IBTA spec for more information about how to + * convert the QP Local ACK Timeout value to nanoseconds. + */ + T_tr_ns = 4096 * (1ULL << qp_attr->timeout); + max_compl_time_ms = qp_attr->retry_cnt * 4 * T_tr_ns; + do_div(max_compl_time_ms, NSEC_PER_MSEC); + rq_tmo_jiffies = msecs_to_jiffies(max_compl_time_ms + 1000); + + return rq_tmo_jiffies; +} + static void srp_cm_rep_handler(struct ib_cm_id *cm_id, struct srp_login_rsp *lrsp, struct srp_target_port *target) @@ -1478,6 +1505,8 @@ static void srp_cm_rep_handler(struct ib_cm_id *cm_id, if (ret) goto error_free; + target->rq_tmo_jiffies = srp_compute_rq_tmo(qp_attr, attr_mask); + ret = ib_modify_qp(target->qp, qp_attr, attr_mask); if (ret) goto error_free; @@ -1729,6 +1758,21 @@ static int srp_reset_host(struct scsi_cmnd *scmnd) return ret; } +static int srp_slave_configure(struct scsi_device *sdev) +{ + struct Scsi_Host *shost = sdev->host; + struct srp_target_port *target = host_to_target(shost); + struct request_queue *q = sdev->request_queue; + unsigned long timeout; + + if (sdev->type == TYPE_DISK) { + timeout = max_t(unsigned, 30 * HZ, target->rq_tmo_jiffies); + blk_queue_rq_timeout(q, timeout); + } + + return 0; +} + static ssize_t show_id_ext(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1861,6 +1905,7 @@ static struct scsi_host_template srp_template = { .module = THIS_MODULE, .name = "InfiniBand SRP initiator", .proc_name = DRV_NAME, + .slave_configure = srp_slave_configure, .info = srp_target_info, .queuecommand = srp_queuecommand, .eh_abort_handler = srp_abort, diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 020caf0c3789e..e3a6304ba87bc 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -163,6 +163,8 @@ struct srp_target_port { struct ib_sa_query *path_query; int path_query_id; + u32 rq_tmo_jiffies; + struct ib_cm_id *cm_id; int max_ti_iu_len; From 09be70a238005cc33f2a52b0aeae52f117e81582 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sat, 17 Mar 2012 17:18:54 +0000 Subject: [PATCH 16/33] IB/srp: Eliminate state SRP_TARGET_CONNECTING Block the SCSI host while reconnecting instead of representing the reconnection activity as a distinct SRP target state. This allows us to eliminate the target state SRP_TARGET_CONNECTING. Signed-off-by: Bart Van Assche Acked-by: David Dillow Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 25 ++++++++++++++----------- drivers/infiniband/ulp/srp/ib_srp.h | 1 - 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 5aa70e96ec902..a2261995c5507 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -646,13 +646,16 @@ static void srp_reset_req(struct srp_target_port *target, struct srp_request *re static int srp_reconnect_target(struct srp_target_port *target) { + struct Scsi_Host *shost = target->scsi_host; struct ib_qp_attr qp_attr; struct ib_wc wc; int i, ret; - if (!srp_change_state(target, SRP_TARGET_LIVE, SRP_TARGET_CONNECTING)) + if (target->state != SRP_TARGET_LIVE) return -EAGAIN; + scsi_target_block(&shost->shost_gendev); + srp_disconnect_target(target); /* * Now get a new local CM ID so that we avoid confusing the @@ -660,16 +663,16 @@ static int srp_reconnect_target(struct srp_target_port *target) */ ret = srp_new_cm_id(target); if (ret) - goto err; + goto unblock; qp_attr.qp_state = IB_QPS_RESET; ret = ib_modify_qp(target->qp, &qp_attr, IB_QP_STATE); if (ret) - goto err; + goto unblock; ret = srp_init_qp(target, target->qp); if (ret) - goto err; + goto unblock; while (ib_poll_cq(target->recv_cq, 1, &wc) > 0) ; /* nothing */ @@ -688,11 +691,15 @@ static int srp_reconnect_target(struct srp_target_port *target) target->qp_in_error = 0; ret = srp_connect_target(target); + +unblock: + scsi_target_unblock(&shost->shost_gendev, ret == 0 ? SDEV_RUNNING : + SDEV_TRANSPORT_OFFLINE); + if (ret) goto err; - if (!srp_change_state(target, SRP_TARGET_CONNECTING, SRP_TARGET_LIVE)) - ret = -EAGAIN; + shost_printk(KERN_INFO, target->scsi_host, PFX "reconnect succeeded\n"); return ret; @@ -710,7 +717,7 @@ static int srp_reconnect_target(struct srp_target_port *target) * the flush_scheduled_work() in srp_remove_one(). */ spin_lock_irq(&target->lock); - if (target->state == SRP_TARGET_CONNECTING) { + if (target->state == SRP_TARGET_LIVE) { target->state = SRP_TARGET_DEAD; INIT_WORK(&target->work, srp_remove_work); queue_work(ib_wq, &target->work); @@ -1311,9 +1318,6 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) unsigned long flags; int len; - if (target->state == SRP_TARGET_CONNECTING) - goto err; - if (target->state == SRP_TARGET_DEAD || target->state == SRP_TARGET_REMOVED) { scmnd->result = DID_BAD_TARGET << 16; @@ -1377,7 +1381,6 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) err_unlock: spin_unlock_irqrestore(&target->lock, flags); -err: return SCSI_MLQUEUE_HOST_BUSY; } diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index e3a6304ba87bc..8b436cee46ad2 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -80,7 +80,6 @@ enum { enum srp_target_state { SRP_TARGET_LIVE, - SRP_TARGET_CONNECTING, SRP_TARGET_DEAD, SRP_TARGET_REMOVED }; From f3718231203a165dbfd5c5bd0d9a6db522bb73e3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 19 Apr 2012 14:42:54 +0000 Subject: [PATCH 17/33] IB/srp: Keep processing commands during host removal Some SCSI upper layer drivers, e.g. sd, issue SCSI commands from inside scsi_remove_host() (see the sd_shutdown() call in sd_remove()). Make sure that these commands have a chance to reach the SCSI device. Signed-off-by: Bart Van Assche Acked-by: David Dillow Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index a2261995c5507..9371d582d0dcb 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -1318,13 +1318,6 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) unsigned long flags; int len; - if (target->state == SRP_TARGET_DEAD || - target->state == SRP_TARGET_REMOVED) { - scmnd->result = DID_BAD_TARGET << 16; - scmnd->scsi_done(scmnd); - return 0; - } - spin_lock_irqsave(&target->lock, flags); iu = __srp_get_tx_iu(target, SRP_IU_CMD); if (!iu) From 224db157431299582a65fb9d4c9228363146f8a9 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 24 Oct 2012 15:05:37 +0200 Subject: [PATCH 18/33] IB/srp: Simplify SCSI error handling Since scsi_remove_host() has been modified so that SCSI error handling functions will no longer be invoked after scsi_remove_host() returns, the test at the start of srp_send_tsk_mgmt() is now superfluous. Signed-off-by: Bart Van Assche Acked-by: David Dillow Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 9371d582d0dcb..baafee085e337 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -1661,10 +1661,6 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target, struct srp_iu *iu; struct srp_tsk_mgmt *tsk_mgmt; - if (target->state == SRP_TARGET_DEAD || - target->state == SRP_TARGET_REMOVED) - return -1; - init_completion(&target->tsk_mgmt_done); spin_lock_irq(&target->lock); From 948d1e889e5bd6ceffcc31db2beba39331087df3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sat, 3 Sep 2011 09:25:42 +0200 Subject: [PATCH 19/33] IB/srp: Introduce srp_handle_qp_err() Introduce the function srp_handle_qp_err(), change the type of qp_in_error from int into bool and move the initialization of that variable from srp_reconnect_target() to srp_connect_target(). Signed-off-by: Bart Van Assche Acked-by: David Dillow Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 37 ++++++++++++++++------------- drivers/infiniband/ulp/srp/ib_srp.h | 2 +- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index baafee085e337..c28833070201a 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -515,6 +515,8 @@ static int srp_connect_target(struct srp_target_port *target) int retries = 3; int ret; + target->qp_in_error = false; + ret = srp_lookup_path(target); if (ret) return ret; @@ -689,7 +691,6 @@ static int srp_reconnect_target(struct srp_target_port *target) for (i = 0; i < SRP_SQ_SIZE; ++i) list_add(&target->tx_ring[i]->list, &target->free_tx); - target->qp_in_error = 0; ret = srp_connect_target(target); unblock: @@ -1269,6 +1270,15 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc) PFX "Recv failed with error code %d\n", res); } +static void srp_handle_qp_err(enum ib_wc_status wc_status, + enum ib_wc_opcode wc_opcode, + struct srp_target_port *target) +{ + shost_printk(KERN_ERR, target->scsi_host, PFX "failed %s status %d\n", + wc_opcode & IB_WC_RECV ? "receive" : "send", wc_status); + target->qp_in_error = true; +} + static void srp_recv_completion(struct ib_cq *cq, void *target_ptr) { struct srp_target_port *target = target_ptr; @@ -1276,15 +1286,12 @@ static void srp_recv_completion(struct ib_cq *cq, void *target_ptr) ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); while (ib_poll_cq(cq, 1, &wc) > 0) { - if (wc.status) { - shost_printk(KERN_ERR, target->scsi_host, - PFX "failed receive status %d\n", - wc.status); - target->qp_in_error = 1; + if (likely(wc.status == IB_WC_SUCCESS)) { + srp_handle_recv(target, &wc); + } else { + srp_handle_qp_err(wc.status, wc.opcode, target); break; } - - srp_handle_recv(target, &wc); } } @@ -1295,16 +1302,13 @@ static void srp_send_completion(struct ib_cq *cq, void *target_ptr) struct srp_iu *iu; while (ib_poll_cq(cq, 1, &wc) > 0) { - if (wc.status) { - shost_printk(KERN_ERR, target->scsi_host, - PFX "failed send status %d\n", - wc.status); - target->qp_in_error = 1; + if (likely(wc.status == IB_WC_SUCCESS)) { + iu = (struct srp_iu *) (uintptr_t) wc.wr_id; + list_add(&iu->list, &target->free_tx); + } else { + srp_handle_qp_err(wc.status, wc.opcode, target); break; } - - iu = (struct srp_iu *) (uintptr_t) wc.wr_id; - list_add(&iu->list, &target->free_tx); } } @@ -2269,7 +2273,6 @@ static ssize_t srp_create_target(struct device *dev, if (ret) goto err_free_ib; - target->qp_in_error = 0; ret = srp_connect_target(target); if (ret) { shost_printk(KERN_ERR, target->scsi_host, diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 8b436cee46ad2..02dc3acb718c5 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -179,7 +179,7 @@ struct srp_target_port { struct list_head list; struct completion done; int status; - int qp_in_error; + bool qp_in_error; struct completion tsk_mgmt_done; u8 tsk_mgmt_status; From 4f0af69799b0e7b9805c4d174169f5ed7bf315c1 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 26 Nov 2012 11:16:40 +0100 Subject: [PATCH 20/33] IB/srp: Process all error completions If the RDMA RC connection is closed, tell the SCSI mid-layer to terminate all pending commands instead of only the first. Signed-off-by: Bart Van Assche Acked-by: David Dillow Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index c28833070201a..f0843ef28e142 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -1274,8 +1274,12 @@ static void srp_handle_qp_err(enum ib_wc_status wc_status, enum ib_wc_opcode wc_opcode, struct srp_target_port *target) { - shost_printk(KERN_ERR, target->scsi_host, PFX "failed %s status %d\n", - wc_opcode & IB_WC_RECV ? "receive" : "send", wc_status); + if (!target->qp_in_error) { + shost_printk(KERN_ERR, target->scsi_host, + PFX "failed %s status %d\n", + wc_opcode & IB_WC_RECV ? "receive" : "send", + wc_status); + } target->qp_in_error = true; } @@ -1290,7 +1294,6 @@ static void srp_recv_completion(struct ib_cq *cq, void *target_ptr) srp_handle_recv(target, &wc); } else { srp_handle_qp_err(wc.status, wc.opcode, target); - break; } } } @@ -1307,7 +1310,6 @@ static void srp_send_completion(struct ib_cq *cq, void *target_ptr) list_add(&iu->list, &target->free_tx); } else { srp_handle_qp_err(wc.status, wc.opcode, target); - break; } } } From 294c875a65269361defd7aeb97804ba99eb57cbf Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sun, 25 Dec 2011 12:18:12 +0000 Subject: [PATCH 21/33] IB/srp: Suppress superfluous error messages Keep track of the connection state. Only report QP errors while connected. Only invoke ib_send_cm_dreq() when connected so that invoking srp_disconnect_target() after having received a DREQ does not cause an error message to be printed. Signed-off-by: Bart Van Assche Acked-by: David Dillow Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 38 +++++++++++++++++++++++------ drivers/infiniband/ulp/srp/ib_srp.h | 1 + 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index f0843ef28e142..3c64bf400c20f 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -428,17 +428,34 @@ static int srp_send_req(struct srp_target_port *target) return status; } +static bool srp_change_conn_state(struct srp_target_port *target, + bool connected) +{ + bool changed = false; + + spin_lock_irq(&target->lock); + if (target->connected != connected) { + target->connected = connected; + changed = true; + } + spin_unlock_irq(&target->lock); + + return changed; +} + static void srp_disconnect_target(struct srp_target_port *target) { - /* XXX should send SRP_I_LOGOUT request */ + if (srp_change_conn_state(target, false)) { + /* XXX should send SRP_I_LOGOUT request */ - init_completion(&target->done); - if (ib_send_cm_dreq(target->cm_id, NULL, 0)) { - shost_printk(KERN_DEBUG, target->scsi_host, - PFX "Sending CM DREQ failed\n"); - return; + init_completion(&target->done); + if (ib_send_cm_dreq(target->cm_id, NULL, 0)) { + shost_printk(KERN_DEBUG, target->scsi_host, + PFX "Sending CM DREQ failed\n"); + } else { + wait_for_completion(&target->done); + } } - wait_for_completion(&target->done); } static bool srp_change_state(struct srp_target_port *target, @@ -515,6 +532,8 @@ static int srp_connect_target(struct srp_target_port *target) int retries = 3; int ret; + WARN_ON_ONCE(target->connected); + target->qp_in_error = false; ret = srp_lookup_path(target); @@ -536,6 +555,7 @@ static int srp_connect_target(struct srp_target_port *target) */ switch (target->status) { case 0: + srp_change_conn_state(target, true); return 0; case SRP_PORT_REDIRECT: @@ -1274,7 +1294,7 @@ static void srp_handle_qp_err(enum ib_wc_status wc_status, enum ib_wc_opcode wc_opcode, struct srp_target_port *target) { - if (!target->qp_in_error) { + if (target->connected && !target->qp_in_error) { shost_printk(KERN_ERR, target->scsi_host, PFX "failed %s status %d\n", wc_opcode & IB_WC_RECV ? "receive" : "send", @@ -1630,6 +1650,7 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) case IB_CM_DREQ_RECEIVED: shost_printk(KERN_WARNING, target->scsi_host, PFX "DREQ received - connection closed\n"); + srp_change_conn_state(target, false); if (ib_send_cm_drep(cm_id, NULL, 0)) shost_printk(KERN_ERR, target->scsi_host, PFX "Sending CM DREP failed\n"); @@ -1942,6 +1963,7 @@ static int srp_add_target(struct srp_host *host, struct srp_target_port *target) spin_unlock(&host->target_lock); target->state = SRP_TARGET_LIVE; + target->connected = false; scsi_scan_target(&target->scsi_host->shost_gendev, 0, target->scsi_id, SCAN_WILD_CARD, 0); diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 02dc3acb718c5..ef95fa4ca3ef4 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -163,6 +163,7 @@ struct srp_target_port { int path_query_id; u32 rq_tmo_jiffies; + bool connected; struct ib_cm_id *cm_id; From ee12d6a80cfcd08b862ed3c8e109442e466b0302 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sun, 25 Dec 2011 19:41:07 +0000 Subject: [PATCH 22/33] IB/srp: Introduce the helper function srp_remove_target() Signed-off-by: Bart Van Assche Acked-by: David Dillow Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 3c64bf400c20f..beb68786001e8 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -506,6 +506,17 @@ static void srp_del_scsi_host_attr(struct Scsi_Host *shost) device_remove_file(&shost->shost_dev, *attr); } +static void srp_remove_target(struct srp_target_port *target) +{ + srp_del_scsi_host_attr(target->scsi_host); + srp_remove_host(target->scsi_host); + scsi_remove_host(target->scsi_host); + ib_destroy_cm_id(target->cm_id); + srp_free_target_ib(target); + srp_free_req_data(target); + scsi_host_put(target->scsi_host); +} + static void srp_remove_work(struct work_struct *work) { struct srp_target_port *target = @@ -518,13 +529,7 @@ static void srp_remove_work(struct work_struct *work) list_del(&target->list); spin_unlock(&target->srp_host->target_lock); - srp_del_scsi_host_attr(target->scsi_host); - srp_remove_host(target->scsi_host); - scsi_remove_host(target->scsi_host); - ib_destroy_cm_id(target->cm_id); - srp_free_target_ib(target); - srp_free_req_data(target); - scsi_host_put(target->scsi_host); + srp_remove_target(target); } static int srp_connect_target(struct srp_target_port *target) From ef6c49d87c3418c442a22e55e3ce2f91b163d69e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 26 Dec 2011 16:49:18 +0000 Subject: [PATCH 23/33] IB/srp: Eliminate state SRP_TARGET_DEAD Only queue removal work after having changed the target state into SRP_TARGET_REMOVED and not if that state was already equal to SRP_TARGET_REMOVED. That allows us to remove the state SRP_TARGET_DEAD. Add a call to srp_disconnect_target() in srp_remove_target() -- due to previous changes it is now safe to invoke that function even if the IB connection has already been disconnected. This change allows us to replace the target removal code in srp_remove_one() by an (indirect) call to srp_remove_target(). Rename srp_target_port.work into srp_target_port.remove_work to reflect its usage. Signed-off-by: Bart Van Assche Acked-by: David Dillow Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 83 +++++++++++------------------ drivers/infiniband/ulp/srp/ib_srp.h | 5 +- 2 files changed, 32 insertions(+), 56 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index beb68786001e8..95590a38e88a2 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -428,6 +428,23 @@ static int srp_send_req(struct srp_target_port *target) return status; } +static bool srp_queue_remove_work(struct srp_target_port *target) +{ + bool changed = false; + + spin_lock_irq(&target->lock); + if (target->state != SRP_TARGET_REMOVED) { + target->state = SRP_TARGET_REMOVED; + changed = true; + } + spin_unlock_irq(&target->lock); + + if (changed) + queue_work(system_long_wq, &target->remove_work); + + return changed; +} + static bool srp_change_conn_state(struct srp_target_port *target, bool connected) { @@ -458,21 +475,6 @@ static void srp_disconnect_target(struct srp_target_port *target) } } -static bool srp_change_state(struct srp_target_port *target, - enum srp_target_state old, - enum srp_target_state new) -{ - bool changed = false; - - spin_lock_irq(&target->lock); - if (target->state == old) { - target->state = new; - changed = true; - } - spin_unlock_irq(&target->lock); - return changed; -} - static void srp_free_req_data(struct srp_target_port *target) { struct ib_device *ibdev = target->srp_host->srp_dev->dev; @@ -508,9 +510,12 @@ static void srp_del_scsi_host_attr(struct Scsi_Host *shost) static void srp_remove_target(struct srp_target_port *target) { + WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED); + srp_del_scsi_host_attr(target->scsi_host); srp_remove_host(target->scsi_host); scsi_remove_host(target->scsi_host); + srp_disconnect_target(target); ib_destroy_cm_id(target->cm_id); srp_free_target_ib(target); srp_free_req_data(target); @@ -520,10 +525,9 @@ static void srp_remove_target(struct srp_target_port *target) static void srp_remove_work(struct work_struct *work) { struct srp_target_port *target = - container_of(work, struct srp_target_port, work); + container_of(work, struct srp_target_port, remove_work); - if (!srp_change_state(target, SRP_TARGET_DEAD, SRP_TARGET_REMOVED)) - return; + WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED); spin_lock(&target->srp_host->target_lock); list_del(&target->list); @@ -738,17 +742,8 @@ static int srp_reconnect_target(struct srp_target_port *target) * However, we have to defer the real removal because we * are in the context of the SCSI error handler now, which * will deadlock if we call scsi_remove_host(). - * - * Schedule our work inside the lock to avoid a race with - * the flush_scheduled_work() in srp_remove_one(). */ - spin_lock_irq(&target->lock); - if (target->state == SRP_TARGET_LIVE) { - target->state = SRP_TARGET_DEAD; - INIT_WORK(&target->work, srp_remove_work); - queue_work(ib_wq, &target->work); - } - spin_unlock_irq(&target->lock); + srp_queue_remove_work(target); return ret; } @@ -2258,6 +2253,7 @@ static ssize_t srp_create_target(struct device *dev, sizeof (struct srp_indirect_buf) + target->cmd_sg_cnt * sizeof (struct srp_direct_buf); + INIT_WORK(&target->remove_work, srp_remove_work); spin_lock_init(&target->lock); INIT_LIST_HEAD(&target->free_tx); INIT_LIST_HEAD(&target->free_reqs); @@ -2491,8 +2487,7 @@ static void srp_remove_one(struct ib_device *device) { struct srp_device *srp_dev; struct srp_host *host, *tmp_host; - LIST_HEAD(target_list); - struct srp_target_port *target, *tmp_target; + struct srp_target_port *target; srp_dev = ib_get_client_data(device, &srp_client); @@ -2505,35 +2500,17 @@ static void srp_remove_one(struct ib_device *device) wait_for_completion(&host->released); /* - * Mark all target ports as removed, so we stop queueing - * commands and don't try to reconnect. + * Remove all target ports. */ spin_lock(&host->target_lock); - list_for_each_entry(target, &host->target_list, list) { - spin_lock_irq(&target->lock); - target->state = SRP_TARGET_REMOVED; - spin_unlock_irq(&target->lock); - } + list_for_each_entry(target, &host->target_list, list) + srp_queue_remove_work(target); spin_unlock(&host->target_lock); /* - * Wait for any reconnection tasks that may have - * started before we marked our target ports as - * removed, and any target port removal tasks. + * Wait for target port removal tasks. */ - flush_workqueue(ib_wq); - - list_for_each_entry_safe(target, tmp_target, - &host->target_list, list) { - srp_del_scsi_host_attr(target->scsi_host); - srp_remove_host(target->scsi_host); - scsi_remove_host(target->scsi_host); - srp_disconnect_target(target); - ib_destroy_cm_id(target->cm_id); - srp_free_target_ib(target); - srp_free_req_data(target); - scsi_host_put(target->scsi_host); - } + flush_workqueue(system_long_wq); kfree(host); } diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index ef95fa4ca3ef4..de2d0b3c0bfe4 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -80,8 +80,7 @@ enum { enum srp_target_state { SRP_TARGET_LIVE, - SRP_TARGET_DEAD, - SRP_TARGET_REMOVED + SRP_TARGET_REMOVED, }; enum srp_iu_type { @@ -175,7 +174,7 @@ struct srp_target_port { struct srp_iu *rx_ring[SRP_RQ_SIZE]; struct srp_request req_ring[SRP_CMD_SQ_SIZE]; - struct work_struct work; + struct work_struct remove_work; struct list_head list; struct completion done; From 73aa89ed9e2bebf0c3fff4504e6dff1421b5c819 Mon Sep 17 00:00:00 2001 From: Ishai Rabinovitz Date: Mon, 26 Nov 2012 11:44:53 +0100 Subject: [PATCH 24/33] IB/srp: destroy and recreate QP and CQs when reconnecting HW QP FATAL errors persist over a reset operation, but we can recover from that by recreating the QP and associated CQs for each connection. Creating a new QP/CQ also completely forecloses any possibility of getting stale completions or packets on the new connection. Signed-off-by: Ishai Rabinovitz Signed-off-by: Michael S. Tsirkin [ updated to current code from OFED, cleaned up commit message ] Signed-off-by: David Dillow Signed-off-by: Bart Van Assche Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 66 +++++++++++++++-------------- 1 file changed, 35 insertions(+), 31 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 95590a38e88a2..85771eba9c80e 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -222,27 +222,29 @@ static int srp_new_cm_id(struct srp_target_port *target) static int srp_create_target_ib(struct srp_target_port *target) { struct ib_qp_init_attr *init_attr; + struct ib_cq *recv_cq, *send_cq; + struct ib_qp *qp; int ret; init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL); if (!init_attr) return -ENOMEM; - target->recv_cq = ib_create_cq(target->srp_host->srp_dev->dev, - srp_recv_completion, NULL, target, SRP_RQ_SIZE, 0); - if (IS_ERR(target->recv_cq)) { - ret = PTR_ERR(target->recv_cq); + recv_cq = ib_create_cq(target->srp_host->srp_dev->dev, + srp_recv_completion, NULL, target, SRP_RQ_SIZE, 0); + if (IS_ERR(recv_cq)) { + ret = PTR_ERR(recv_cq); goto err; } - target->send_cq = ib_create_cq(target->srp_host->srp_dev->dev, - srp_send_completion, NULL, target, SRP_SQ_SIZE, 0); - if (IS_ERR(target->send_cq)) { - ret = PTR_ERR(target->send_cq); + send_cq = ib_create_cq(target->srp_host->srp_dev->dev, + srp_send_completion, NULL, target, SRP_SQ_SIZE, 0); + if (IS_ERR(send_cq)) { + ret = PTR_ERR(send_cq); goto err_recv_cq; } - ib_req_notify_cq(target->recv_cq, IB_CQ_NEXT_COMP); + ib_req_notify_cq(recv_cq, IB_CQ_NEXT_COMP); init_attr->event_handler = srp_qp_event; init_attr->cap.max_send_wr = SRP_SQ_SIZE; @@ -251,30 +253,41 @@ static int srp_create_target_ib(struct srp_target_port *target) init_attr->cap.max_send_sge = 1; init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; init_attr->qp_type = IB_QPT_RC; - init_attr->send_cq = target->send_cq; - init_attr->recv_cq = target->recv_cq; + init_attr->send_cq = send_cq; + init_attr->recv_cq = recv_cq; - target->qp = ib_create_qp(target->srp_host->srp_dev->pd, init_attr); - if (IS_ERR(target->qp)) { - ret = PTR_ERR(target->qp); + qp = ib_create_qp(target->srp_host->srp_dev->pd, init_attr); + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); goto err_send_cq; } - ret = srp_init_qp(target, target->qp); + ret = srp_init_qp(target, qp); if (ret) goto err_qp; + if (target->qp) + ib_destroy_qp(target->qp); + if (target->recv_cq) + ib_destroy_cq(target->recv_cq); + if (target->send_cq) + ib_destroy_cq(target->send_cq); + + target->qp = qp; + target->recv_cq = recv_cq; + target->send_cq = send_cq; + kfree(init_attr); return 0; err_qp: - ib_destroy_qp(target->qp); + ib_destroy_qp(qp); err_send_cq: - ib_destroy_cq(target->send_cq); + ib_destroy_cq(send_cq); err_recv_cq: - ib_destroy_cq(target->recv_cq); + ib_destroy_cq(recv_cq); err: kfree(init_attr); @@ -289,6 +302,9 @@ static void srp_free_target_ib(struct srp_target_port *target) ib_destroy_cq(target->send_cq); ib_destroy_cq(target->recv_cq); + target->qp = NULL; + target->send_cq = target->recv_cq = NULL; + for (i = 0; i < SRP_RQ_SIZE; ++i) srp_free_iu(target->srp_host, target->rx_ring[i]); for (i = 0; i < SRP_SQ_SIZE; ++i) @@ -678,8 +694,6 @@ static void srp_reset_req(struct srp_target_port *target, struct srp_request *re static int srp_reconnect_target(struct srp_target_port *target) { struct Scsi_Host *shost = target->scsi_host; - struct ib_qp_attr qp_attr; - struct ib_wc wc; int i, ret; if (target->state != SRP_TARGET_LIVE) @@ -696,20 +710,10 @@ static int srp_reconnect_target(struct srp_target_port *target) if (ret) goto unblock; - qp_attr.qp_state = IB_QPS_RESET; - ret = ib_modify_qp(target->qp, &qp_attr, IB_QP_STATE); - if (ret) - goto unblock; - - ret = srp_init_qp(target, target->qp); + ret = srp_create_target_ib(target); if (ret) goto unblock; - while (ib_poll_cq(target->recv_cq, 1, &wc) > 0) - ; /* nothing */ - while (ib_poll_cq(target->send_cq, 1, &wc) > 0) - ; /* nothing */ - for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { struct srp_request *req = &target->req_ring[i]; if (req->scmnd) From 55d93898a14528a5d4199ba572bea1f0bec5870e Mon Sep 17 00:00:00 2001 From: Vu Pham Date: Mon, 26 Nov 2012 11:57:47 +0100 Subject: [PATCH 25/33] IB/srp: send disconnect request without waiting for CM timewait exit Now that SRP recreates the CM ID, QP, and CQ for each connection, there is no need to wait for the timewait state to complete. Signed-off-by: Vu Pham Signed-off-by: David Dillow Signed-off-by: Bart Van Assche Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 85771eba9c80e..84d9298ed9893 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -481,12 +481,9 @@ static void srp_disconnect_target(struct srp_target_port *target) if (srp_change_conn_state(target, false)) { /* XXX should send SRP_I_LOGOUT request */ - init_completion(&target->done); if (ib_send_cm_dreq(target->cm_id, NULL, 0)) { shost_printk(KERN_DEBUG, target->scsi_host, PFX "Sending CM DREQ failed\n"); - } else { - wait_for_completion(&target->done); } } } @@ -1664,7 +1661,6 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) shost_printk(KERN_ERR, target->scsi_host, PFX "connection closed\n"); - comp = 1; target->status = 0; break; From a4605a93696ee0768e55e4bce1ff7f0ee39bcf79 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sun, 9 Oct 2011 14:09:07 +0200 Subject: [PATCH 26/33] IB/srp: Document sysfs attributes Document the sysfs attributes of the SRP initiator (ib_srp) according to the rules specified in Documentation/ABI/README. Signed-off-by: Bart Van Assche Acked-by: David Dillow Signed-off-by: Roland Dreier --- Documentation/ABI/stable/sysfs-driver-ib_srp | 156 +++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 Documentation/ABI/stable/sysfs-driver-ib_srp diff --git a/Documentation/ABI/stable/sysfs-driver-ib_srp b/Documentation/ABI/stable/sysfs-driver-ib_srp new file mode 100644 index 0000000000000..481aae95c7d19 --- /dev/null +++ b/Documentation/ABI/stable/sysfs-driver-ib_srp @@ -0,0 +1,156 @@ +What: /sys/class/infiniband_srp/srp--/add_target +Date: January 2, 2006 +KernelVersion: 2.6.15 +Contact: linux-rdma@vger.kernel.org +Description: Interface for making ib_srp connect to a new target. + One can request ib_srp to connect to a new target by writing + a comma-separated list of login parameters to this sysfs + attribute. The supported parameters are: + * id_ext, a 16-digit hexadecimal number specifying the eight + byte identifier extension in the 16-byte SRP target port + identifier. The target port identifier is sent by ib_srp + to the target in the SRP_LOGIN_REQ request. + * ioc_guid, a 16-digit hexadecimal number specifying the eight + byte I/O controller GUID portion of the 16-byte target port + identifier. + * dgid, a 32-digit hexadecimal number specifying the + destination GID. + * pkey, a four-digit hexadecimal number specifying the + InfiniBand partition key. + * service_id, a 16-digit hexadecimal number specifying the + InfiniBand service ID used to establish communication with + the SRP target. How to find out the value of the service ID + is specified in the documentation of the SRP target. + * max_sect, a decimal number specifying the maximum number of + 512-byte sectors to be transferred via a single SCSI command. + * max_cmd_per_lun, a decimal number specifying the maximum + number of outstanding commands for a single LUN. + * io_class, a hexadecimal number specifying the SRP I/O class. + Must be either 0xff00 (rev 10) or 0x0100 (rev 16a). The I/O + class defines the format of the SRP initiator and target + port identifiers. + * initiator_ext, a 16-digit hexadecimal number specifying the + identifier extension portion of the SRP initiator port + identifier. This data is sent by the initiator to the target + in the SRP_LOGIN_REQ request. + * cmd_sg_entries, a number in the range 1..255 that specifies + the maximum number of data buffer descriptors stored in the + SRP_CMD information unit itself. With allow_ext_sg=0 the + parameter cmd_sg_entries defines the maximum S/G list length + for a single SRP_CMD, and commands whose S/G list length + exceeds this limit after S/G list collapsing will fail. + * allow_ext_sg, whether ib_srp is allowed to include a partial + memory descriptor list in an SRP_CMD instead of the entire + list. If a partial memory descriptor list has been included + in an SRP_CMD the remaining memory descriptors are + communicated from initiator to target via an additional RDMA + transfer. Setting allow_ext_sg to 1 increases the maximum + amount of data that can be transferred between initiator and + target via a single SCSI command. Since not all SRP target + implementations support partial memory descriptor lists the + default value for this option is 0. + * sg_tablesize, a number in the range 1..2048 specifying the + maximum S/G list length the SCSI layer is allowed to pass to + ib_srp. Specifying a value that exceeds cmd_sg_entries is + only safe with partial memory descriptor list support enabled + (allow_ext_sg=1). + +What: /sys/class/infiniband_srp/srp--/ibdev +Date: January 2, 2006 +KernelVersion: 2.6.15 +Contact: linux-rdma@vger.kernel.org +Description: HCA name (). + +What: /sys/class/infiniband_srp/srp--/port +Date: January 2, 2006 +KernelVersion: 2.6.15 +Contact: linux-rdma@vger.kernel.org +Description: HCA port number (). + +What: /sys/class/scsi_host/host/allow_ext_sg +Date: May 19, 2011 +KernelVersion: 2.6.39 +Contact: linux-rdma@vger.kernel.org +Description: Whether ib_srp is allowed to include a partial memory + descriptor list in an SRP_CMD when communicating with an SRP + target. + +What: /sys/class/scsi_host/host/cmd_sg_entries +Date: May 19, 2011 +KernelVersion: 2.6.39 +Contact: linux-rdma@vger.kernel.org +Description: Maximum number of data buffer descriptors that may be sent to + the target in a single SRP_CMD request. + +What: /sys/class/scsi_host/host/dgid +Date: June 17, 2006 +KernelVersion: 2.6.17 +Contact: linux-rdma@vger.kernel.org +Description: InfiniBand destination GID used for communication with the SRP + target. Differs from orig_dgid if port redirection has happened. + +What: /sys/class/scsi_host/host/id_ext +Date: June 17, 2006 +KernelVersion: 2.6.17 +Contact: linux-rdma@vger.kernel.org +Description: Eight-byte identifier extension portion of the 16-byte target + port identifier. + +What: /sys/class/scsi_host/host/ioc_guid +Date: June 17, 2006 +KernelVersion: 2.6.17 +Contact: linux-rdma@vger.kernel.org +Description: Eight-byte I/O controller GUID portion of the 16-byte target + port identifier. + +What: /sys/class/scsi_host/host/local_ib_device +Date: November 29, 2006 +KernelVersion: 2.6.19 +Contact: linux-rdma@vger.kernel.org +Description: Name of the InfiniBand HCA used for communicating with the + SRP target. + +What: /sys/class/scsi_host/host/local_ib_port +Date: November 29, 2006 +KernelVersion: 2.6.19 +Contact: linux-rdma@vger.kernel.org +Description: Number of the HCA port used for communicating with the + SRP target. + +What: /sys/class/scsi_host/host/orig_dgid +Date: June 17, 2006 +KernelVersion: 2.6.17 +Contact: linux-rdma@vger.kernel.org +Description: InfiniBand destination GID specified in the parameters + written to the add_target sysfs attribute. + +What: /sys/class/scsi_host/host/pkey +Date: June 17, 2006 +KernelVersion: 2.6.17 +Contact: linux-rdma@vger.kernel.org +Description: A 16-bit number representing the InfiniBand partition key used + for communication with the SRP target. + +What: /sys/class/scsi_host/host/req_lim +Date: October 20, 2010 +KernelVersion: 2.6.36 +Contact: linux-rdma@vger.kernel.org +Description: Number of requests ib_srp can send to the target before it has + to wait for more credits. For more information see also the + SRP credit algorithm in the SRP specification. + +What: /sys/class/scsi_host/host/service_id +Date: June 17, 2006 +KernelVersion: 2.6.17 +Contact: linux-rdma@vger.kernel.org +Description: InfiniBand service ID used for establishing communication with + the SRP target. + +What: /sys/class/scsi_host/host/zero_req_lim +Date: September 20, 2006 +KernelVersion: 2.6.18 +Contact: linux-rdma@vger.kernel.org +Description: Number of times the initiator had to wait before sending a + request to the target because it ran out of credits. For more + information see also the SRP credit algorithm in the SRP + specification. From ac9be30e91cffe07f762bd91ca40de065bf257d7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 21 Oct 2011 19:31:07 +0200 Subject: [PATCH 27/33] srp_transport: Fix attribute registration Register transport attributes after the attribute array has been set up instead of before. The current code can trigger a race condition because the code reading the attribute array can run on another thread than the code that initialized that array. Make sure that any code reading the attribute array will see all values written into that array. Signed-off-by: Bart Van Assche Cc: FUJITA Tomonori Cc: Robert Jennings Acked-by: David Dillow Signed-off-by: Roland Dreier --- drivers/scsi/scsi_transport_srp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_transport_srp.c b/drivers/scsi/scsi_transport_srp.c index 21a045e0559ff..07c4394624f1f 100644 --- a/drivers/scsi/scsi_transport_srp.c +++ b/drivers/scsi/scsi_transport_srp.c @@ -324,13 +324,14 @@ srp_attach_transport(struct srp_function_template *ft) i->rport_attr_cont.ac.attrs = &i->rport_attrs[0]; i->rport_attr_cont.ac.class = &srp_rport_class.class; i->rport_attr_cont.ac.match = srp_rport_match; - transport_container_register(&i->rport_attr_cont); count = 0; SETUP_RPORT_ATTRIBUTE_RD(port_id); SETUP_RPORT_ATTRIBUTE_RD(roles); i->rport_attrs[count] = NULL; + transport_container_register(&i->rport_attr_cont); + i->f = ft; return &i->t; From 9134a855178f141b9be48aa19b49ed92d82665ce Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sat, 10 Sep 2011 16:31:57 +0200 Subject: [PATCH 28/33] srp_transport: Simplify attribute initialization code Eliminate the private_rport_attrs[] array and the SETUP_*() macros used to set up that array since the information in that array duplicates the information in the static device attributes. Also, verify whether SRP_RPORT_ATTRS is large enough since it is easy to forget to update that macro when adding new attributes. Signed-off-by: Bart Van Assche Cc: FUJITA Tomonori Cc: Robert Jennings Acked-by: David Dillow Signed-off-by: Roland Dreier --- drivers/scsi/scsi_transport_srp.c | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/drivers/scsi/scsi_transport_srp.c b/drivers/scsi/scsi_transport_srp.c index 07c4394624f1f..0d85f797141f9 100644 --- a/drivers/scsi/scsi_transport_srp.c +++ b/drivers/scsi/scsi_transport_srp.c @@ -47,7 +47,6 @@ struct srp_internal { struct device_attribute *host_attrs[SRP_HOST_ATTRS + 1]; struct device_attribute *rport_attrs[SRP_RPORT_ATTRS + 1]; - struct device_attribute private_rport_attrs[SRP_RPORT_ATTRS]; struct transport_container rport_attr_cont; }; @@ -72,24 +71,6 @@ static DECLARE_TRANSPORT_CLASS(srp_host_class, "srp_host", srp_host_setup, static DECLARE_TRANSPORT_CLASS(srp_rport_class, "srp_remote_ports", NULL, NULL, NULL); -#define SETUP_TEMPLATE(attrb, field, perm, test, ro_test, ro_perm) \ - i->private_##attrb[count] = dev_attr_##field; \ - i->private_##attrb[count].attr.mode = perm; \ - if (ro_test) { \ - i->private_##attrb[count].attr.mode = ro_perm; \ - i->private_##attrb[count].store = NULL; \ - } \ - i->attrb[count] = &i->private_##attrb[count]; \ - if (test) \ - count++ - -#define SETUP_RPORT_ATTRIBUTE_RD(field) \ - SETUP_TEMPLATE(rport_attrs, field, S_IRUGO, 1, 0, 0) - -#define SETUP_RPORT_ATTRIBUTE_RW(field) \ - SETUP_TEMPLATE(rport_attrs, field, S_IRUGO | S_IWUSR, \ - 1, 1, S_IRUGO) - #define SRP_PID(p) \ (p)->port_id[0], (p)->port_id[1], (p)->port_id[2], (p)->port_id[3], \ (p)->port_id[4], (p)->port_id[5], (p)->port_id[6], (p)->port_id[7], \ @@ -326,9 +307,10 @@ srp_attach_transport(struct srp_function_template *ft) i->rport_attr_cont.ac.match = srp_rport_match; count = 0; - SETUP_RPORT_ATTRIBUTE_RD(port_id); - SETUP_RPORT_ATTRIBUTE_RD(roles); - i->rport_attrs[count] = NULL; + i->rport_attrs[count++] = &dev_attr_port_id; + i->rport_attrs[count++] = &dev_attr_roles; + i->rport_attrs[count++] = NULL; + BUG_ON(count > ARRAY_SIZE(i->rport_attrs)); transport_container_register(&i->rport_attr_cont); From 7e1265bfe75f0ef1a9f5cfde202df68b7e35a53f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sun, 9 Oct 2011 14:35:48 +0200 Subject: [PATCH 29/33] srp_transport: Document sysfs attributes Signed-off-by: Bart Van Assche Cc: FUJITA Tomonori Cc: Robert Jennings Acked-by: David Dillow Signed-off-by: Roland Dreier --- Documentation/ABI/stable/sysfs-transport-srp | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 Documentation/ABI/stable/sysfs-transport-srp diff --git a/Documentation/ABI/stable/sysfs-transport-srp b/Documentation/ABI/stable/sysfs-transport-srp new file mode 100644 index 0000000000000..7b0d4a5350f16 --- /dev/null +++ b/Documentation/ABI/stable/sysfs-transport-srp @@ -0,0 +1,12 @@ +What: /sys/class/srp_remote_ports/port-:/port_id +Date: June 27, 2007 +KernelVersion: 2.6.24 +Contact: linux-scsi@vger.kernel.org +Description: 16-byte local SRP port identifier in hexadecimal format. An + example: 4c:49:4e:55:58:20:56:49:4f:00:00:00:00:00:00:00. + +What: /sys/class/srp_remote_ports/port-:/roles +Date: June 27, 2007 +KernelVersion: 2.6.24 +Contact: linux-scsi@vger.kernel.org +Description: Role of the remote port. Either "SRP Initiator" or "SRP Target". From dc1bdbd9b8a077018d82230bc378f1bcfd8adba8 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 16 Sep 2011 20:41:13 +0200 Subject: [PATCH 30/33] IB/srp: Allow SRP disconnect through sysfs Make it possible to disconnect the IB RC connection used by the SRP protocol to communicate with a target. Have the SRP transport layer create a sysfs "delete" attribute for initiator drivers that support this functionality. Signed-off-by: Bart Van Assche Acked-by: David Dillow Cc: FUJITA Tomonori Cc: Robert Jennings Signed-off-by: Roland Dreier --- Documentation/ABI/stable/sysfs-transport-srp | 7 +++++++ drivers/infiniband/ulp/srp/ib_srp.c | 10 +++++++++ drivers/scsi/scsi_transport_srp.c | 22 +++++++++++++++++++- include/scsi/scsi_transport_srp.h | 8 +++++++ 4 files changed, 46 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/stable/sysfs-transport-srp b/Documentation/ABI/stable/sysfs-transport-srp index 7b0d4a5350f16..b36fb0dc13c8e 100644 --- a/Documentation/ABI/stable/sysfs-transport-srp +++ b/Documentation/ABI/stable/sysfs-transport-srp @@ -1,3 +1,10 @@ +What: /sys/class/srp_remote_ports/port-:/delete +Date: June 1, 2012 +KernelVersion: 3.7 +Contact: linux-scsi@vger.kernel.org, linux-rdma@vger.kernel.org +Description: Instructs an SRP initiator to disconnect from a target and to + remove all LUNs imported from that target. + What: /sys/class/srp_remote_ports/port-:/port_id Date: June 27, 2007 KernelVersion: 2.6.24 diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 84d9298ed9893..d5088ce78290a 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -549,6 +549,13 @@ static void srp_remove_work(struct work_struct *work) srp_remove_target(target); } +static void srp_rport_delete(struct srp_rport *rport) +{ + struct srp_target_port *target = rport->lld_data; + + srp_queue_remove_work(target); +} + static int srp_connect_target(struct srp_target_port *target) { int retries = 3; @@ -1958,6 +1965,8 @@ static int srp_add_target(struct srp_host *host, struct srp_target_port *target) return PTR_ERR(rport); } + rport->lld_data = target; + spin_lock(&host->target_lock); list_add_tail(&target->list, &host->target_list); spin_unlock(&host->target_lock); @@ -2524,6 +2533,7 @@ static void srp_remove_one(struct ib_device *device) } static struct srp_function_template ib_srp_transport_functions = { + .rport_delete = srp_rport_delete, }; static int __init srp_init_module(void) diff --git a/drivers/scsi/scsi_transport_srp.c b/drivers/scsi/scsi_transport_srp.c index 0d85f797141f9..f379c7f3034cd 100644 --- a/drivers/scsi/scsi_transport_srp.c +++ b/drivers/scsi/scsi_transport_srp.c @@ -38,7 +38,7 @@ struct srp_host_attrs { #define to_srp_host_attrs(host) ((struct srp_host_attrs *)(host)->shost_data) #define SRP_HOST_ATTRS 0 -#define SRP_RPORT_ATTRS 2 +#define SRP_RPORT_ATTRS 3 struct srp_internal { struct scsi_transport_template t; @@ -116,6 +116,24 @@ show_srp_rport_roles(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR(roles, S_IRUGO, show_srp_rport_roles, NULL); +static ssize_t store_srp_rport_delete(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + struct Scsi_Host *shost = dev_to_shost(dev); + struct srp_internal *i = to_srp_internal(shost->transportt); + + if (i->f->rport_delete) { + i->f->rport_delete(rport); + return count; + } else { + return -ENOSYS; + } +} + +static DEVICE_ATTR(delete, S_IWUSR, NULL, store_srp_rport_delete); + static void srp_rport_release(struct device *dev) { struct srp_rport *rport = dev_to_rport(dev); @@ -309,6 +327,8 @@ srp_attach_transport(struct srp_function_template *ft) count = 0; i->rport_attrs[count++] = &dev_attr_port_id; i->rport_attrs[count++] = &dev_attr_roles; + if (ft->rport_delete) + i->rport_attrs[count++] = &dev_attr_delete; i->rport_attrs[count++] = NULL; BUG_ON(count > ARRAY_SIZE(i->rport_attrs)); diff --git a/include/scsi/scsi_transport_srp.h b/include/scsi/scsi_transport_srp.h index 9c60ca1c08c5a..ff0f04ac91aad 100644 --- a/include/scsi/scsi_transport_srp.h +++ b/include/scsi/scsi_transport_srp.h @@ -14,13 +14,21 @@ struct srp_rport_identifiers { }; struct srp_rport { + /* for initiator and target drivers */ + struct device dev; u8 port_id[16]; u8 roles; + + /* for initiator drivers */ + + void *lld_data; /* LLD private data */ }; struct srp_function_template { + /* for initiator drivers */ + void (*rport_delete)(struct srp_rport *rport); /* for target drivers */ int (* tsk_mgmt_response)(struct Scsi_Host *, u64, u64, int); int (* it_nexus_response)(struct Scsi_Host *, u64, int); From 00ad255d17c2d12a035370836cb93630711d48ca Mon Sep 17 00:00:00 2001 From: Tatyana Nikolova Date: Thu, 6 Dec 2012 19:56:31 +0000 Subject: [PATCH 31/33] RDMA/nes: Fix for BUG_ON due to adding already-pending timer To avoid nes tcp_timer crash for SMP architectures, add_timer is replaced with mod_timer. Signed-off-by: Tatyana Nikolova Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index feb41e74206b2..22ea67eea5dce 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -669,7 +669,6 @@ int schedule_nes_timer(struct nes_cm_node *cm_node, struct sk_buff *skb, struct nes_cm_core *cm_core = cm_node->cm_core; struct nes_timer_entry *new_send; int ret = 0; - u32 was_timer_set; new_send = kzalloc(sizeof(*new_send), GFP_ATOMIC); if (!new_send) @@ -721,12 +720,8 @@ int schedule_nes_timer(struct nes_cm_node *cm_node, struct sk_buff *skb, } } - was_timer_set = timer_pending(&cm_core->tcp_timer); - - if (!was_timer_set) { - cm_core->tcp_timer.expires = new_send->timetosend; - add_timer(&cm_core->tcp_timer); - } + if (!timer_pending(&cm_core->tcp_timer)) + mod_timer(&cm_core->tcp_timer, new_send->timetosend); return ret; } @@ -944,10 +939,8 @@ static void nes_cm_timer_tick(unsigned long pass) } if (settimer) { - if (!timer_pending(&cm_core->tcp_timer)) { - cm_core->tcp_timer.expires = nexttimeout; - add_timer(&cm_core->tcp_timer); - } + if (!timer_pending(&cm_core->tcp_timer)) + mod_timer(&cm_core->tcp_timer, nexttimeout); } } @@ -1312,8 +1305,6 @@ static int mini_cm_del_listen(struct nes_cm_core *cm_core, static inline int mini_cm_accelerated(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node) { - u32 was_timer_set; - cm_node->accelerated = 1; if (cm_node->accept_pend) { @@ -1323,11 +1314,8 @@ static inline int mini_cm_accelerated(struct nes_cm_core *cm_core, BUG_ON(atomic_read(&cm_node->listener->pend_accepts_cnt) < 0); } - was_timer_set = timer_pending(&cm_core->tcp_timer); - if (!was_timer_set) { - cm_core->tcp_timer.expires = jiffies + NES_SHORT_TIME; - add_timer(&cm_core->tcp_timer); - } + if (!timer_pending(&cm_core->tcp_timer)) + mod_timer(&cm_core->tcp_timer, (jiffies + NES_SHORT_TIME)); return 0; } From 7bfcfa51c35cdd2d37e0d70fc11790642dd11fb3 Mon Sep 17 00:00:00 2001 From: Tatyana Nikolova Date: Thu, 6 Dec 2012 19:58:27 +0000 Subject: [PATCH 32/33] RDMA/nes: Fix for terminate timer crash The terminate timer needs to be initialized just once. Signed-off-by: Tatyana Nikolova Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes.h | 1 + drivers/infiniband/hw/nes/nes_hw.c | 9 ++------- drivers/infiniband/hw/nes/nes_verbs.c | 4 +++- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h index 5cac29e6bc1c7..33cc58941a3ea 100644 --- a/drivers/infiniband/hw/nes/nes.h +++ b/drivers/infiniband/hw/nes/nes.h @@ -532,6 +532,7 @@ void nes_iwarp_ce_handler(struct nes_device *, struct nes_hw_cq *); int nes_destroy_cqp(struct nes_device *); int nes_nic_cm_xmit(struct sk_buff *, struct net_device *); void nes_recheck_link_status(struct work_struct *work); +void nes_terminate_timeout(unsigned long context); /* nes_nic.c */ struct net_device *nes_netdev_init(struct nes_device *, void __iomem *); diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index fe7965ee40968..67647e264611e 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -75,7 +75,6 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, static void process_critical_error(struct nes_device *nesdev); static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number); static unsigned int nes_reset_adapter_ne020(struct nes_device *nesdev, u8 *OneG_Mode); -static void nes_terminate_timeout(unsigned long context); static void nes_terminate_start_timer(struct nes_qp *nesqp); #ifdef CONFIG_INFINIBAND_NES_DEBUG @@ -3520,7 +3519,7 @@ static void nes_terminate_received(struct nes_device *nesdev, } /* Timeout routine in case terminate fails to complete */ -static void nes_terminate_timeout(unsigned long context) +void nes_terminate_timeout(unsigned long context) { struct nes_qp *nesqp = (struct nes_qp *)(unsigned long)context; @@ -3530,11 +3529,7 @@ static void nes_terminate_timeout(unsigned long context) /* Set a timer in case hw cannot complete the terminate sequence */ static void nes_terminate_start_timer(struct nes_qp *nesqp) { - init_timer(&nesqp->terminate_timer); - nesqp->terminate_timer.function = nes_terminate_timeout; - nesqp->terminate_timer.expires = jiffies + HZ; - nesqp->terminate_timer.data = (unsigned long)nesqp; - add_timer(&nesqp->terminate_timer); + mod_timer(&nesqp->terminate_timer, (jiffies + HZ)); } /** diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index cd0ecb215cca0..c47ec2599bc9b 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -1404,6 +1404,9 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, } nesqp->sig_all = (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR); + init_timer(&nesqp->terminate_timer); + nesqp->terminate_timer.function = nes_terminate_timeout; + nesqp->terminate_timer.data = (unsigned long)nesqp; /* update the QP table */ nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp; @@ -1413,7 +1416,6 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, return &nesqp->ibqp; } - /** * nes_clean_cq */ From 7d9c199a55200c9b9fcad08e150470d02fb385be Mon Sep 17 00:00:00 2001 From: Tatyana Nikolova Date: Thu, 6 Dec 2012 20:05:02 +0000 Subject: [PATCH 33/33] RDMA/nes: Fix for crash when registering zero length MR for CQ Signed-off-by: Tatyana Nikolova Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_verbs.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index c47ec2599bc9b..07e4fbad987ab 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -2561,6 +2561,11 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return ibmr; case IWNES_MEMREG_TYPE_QP: case IWNES_MEMREG_TYPE_CQ: + if (!region->length) { + nes_debug(NES_DBG_MR, "Unable to register zero length region for CQ\n"); + ib_umem_release(region); + return ERR_PTR(-EINVAL); + } nespbl = kzalloc(sizeof(*nespbl), GFP_KERNEL); if (!nespbl) { nes_debug(NES_DBG_MR, "Unable to allocate PBL\n");