From 290c4a902b79246ec55e477fc313f27f98393dee Mon Sep 17 00:00:00 2001
From: Bob Pearson <rpearsonhpe@gmail.com>
Date: Sun, 10 Apr 2022 22:06:48 -0500
Subject: [PATCH 1/9] RDMA/rxe: Fix "Replace mr by rkey in responder resources"

The referenced commit generates a reference counting error if the rkey has
the same index but the wrong key. In this case the reference taken by
rxe_pool_get_index() is not dropped.

Drop the reference if the keys don't match in rxe_recheck_mr().  Check
that the mw and mr are still valid.

Fixes: 8a1a0be894da ("RDMA/rxe: Replace mr by rkey in responder resources")
Link: https://lore.kernel.org/r/20220411030647.20011-1-rpearsonhpe@gmail.com
Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/infiniband/sw/rxe/rxe_resp.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
index 16fc7ea1298d8..1d95fab606da3 100644
--- a/drivers/infiniband/sw/rxe/rxe_resp.c
+++ b/drivers/infiniband/sw/rxe/rxe_resp.c
@@ -680,6 +680,11 @@ static struct resp_res *rxe_prepare_read_res(struct rxe_qp *qp,
  * It is assumed that the access permissions if originally good
  * are OK and the mappings to be unchanged.
  *
+ * TODO: If someone reregisters an MR to change its size or
+ * access permissions during the processing of an RDMA read
+ * we should kill the responder resource and complete the
+ * operation with an error.
+ *
  * Return: mr on success else NULL
  */
 static struct rxe_mr *rxe_recheck_mr(struct rxe_qp *qp, u32 rkey)
@@ -690,23 +695,27 @@ static struct rxe_mr *rxe_recheck_mr(struct rxe_qp *qp, u32 rkey)
 
 	if (rkey_is_mw(rkey)) {
 		mw = rxe_pool_get_index(&rxe->mw_pool, rkey >> 8);
-		if (!mw || mw->rkey != rkey)
+		if (!mw)
 			return NULL;
 
-		if (mw->state != RXE_MW_STATE_VALID) {
+		mr = mw->mr;
+		if (mw->rkey != rkey || mw->state != RXE_MW_STATE_VALID ||
+		    !mr || mr->state != RXE_MR_STATE_VALID) {
 			rxe_put(mw);
 			return NULL;
 		}
 
-		mr = mw->mr;
+		rxe_get(mr);
 		rxe_put(mw);
-	} else {
-		mr = rxe_pool_get_index(&rxe->mr_pool, rkey >> 8);
-		if (!mr || mr->rkey != rkey)
-			return NULL;
+
+		return mr;
 	}
 
-	if (mr->state != RXE_MR_STATE_VALID) {
+	mr = rxe_pool_get_index(&rxe->mr_pool, rkey >> 8);
+	if (!mr)
+		return NULL;
+
+	if (mr->rkey != rkey || mr->state != RXE_MR_STATE_VALID) {
 		rxe_put(mr);
 		return NULL;
 	}

From 679ab61bf5f5f519377d812afb4fb93634782c74 Mon Sep 17 00:00:00 2001
From: Duoming Zhou <duoming@zju.edu.cn>
Date: Mon, 18 Apr 2022 23:33:22 +0800
Subject: [PATCH 2/9] RDMA/irdma: Fix deadlock in irdma_cleanup_cm_core()

There is a deadlock in irdma_cleanup_cm_core(), which is shown below:

   (Thread 1)              |      (Thread 2)
                           | irdma_schedule_cm_timer()
irdma_cleanup_cm_core()    |  add_timer()
 spin_lock_irqsave() //(1) |  (wait a time)
 ...                       | irdma_cm_timer_tick()
 del_timer_sync()          |  spin_lock_irqsave() //(2)
 (wait timer to stop)      |  ...

We hold cm_core->ht_lock in position (1) of thread 1 and use
del_timer_sync() to wait timer to stop, but timer handler also need
cm_core->ht_lock in position (2) of thread 2.  As a result,
irdma_cleanup_cm_core() will block forever.

This patch removes the check of timer_pending() in
irdma_cleanup_cm_core(), because the del_timer_sync() function will just
return directly if there isn't a pending timer. As a result, the lock is
redundant, because there is no resource it could protect.

Link: https://lore.kernel.org/r/20220418153322.42524-1-duoming@zju.edu.cn
Signed-off-by: Duoming Zhou <duoming@zju.edu.cn>
Reviewed-by: Shiraz Saleem <shiraz.saleem@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/infiniband/hw/irdma/cm.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c
index dedb3b7edd8d6..a98d962e5efb6 100644
--- a/drivers/infiniband/hw/irdma/cm.c
+++ b/drivers/infiniband/hw/irdma/cm.c
@@ -3246,15 +3246,10 @@ int irdma_setup_cm_core(struct irdma_device *iwdev, u8 rdma_ver)
  */
 void irdma_cleanup_cm_core(struct irdma_cm_core *cm_core)
 {
-	unsigned long flags;
-
 	if (!cm_core)
 		return;
 
-	spin_lock_irqsave(&cm_core->ht_lock, flags);
-	if (timer_pending(&cm_core->tcp_timer))
-		del_timer_sync(&cm_core->tcp_timer);
-	spin_unlock_irqrestore(&cm_core->ht_lock, flags);
+	del_timer_sync(&cm_core->tcp_timer);
 
 	destroy_workqueue(cm_core->event_wq);
 	cm_core->dev->ws_reset(&cm_core->iwdev->vsi);

From 570a4bf7440e9fb2a4164244a6bf60a46362b627 Mon Sep 17 00:00:00 2001
From: Bob Pearson <rpearsonhpe@gmail.com>
Date: Mon, 18 Apr 2022 12:41:04 -0500
Subject: [PATCH 3/9] RDMA/rxe: Recheck the MR in when generating a READ reply

The rping benchmark fails on long runs. The root cause of this failure has
been traced to a failure to compute a nonzero value of mr in rare
situations.

Fix this failure by correctly handling the computation of mr in
read_reply() in rxe_resp.c in the replay flow.

Fixes: 8a1a0be894da ("RDMA/rxe: Replace mr by rkey in responder resources")
Link: https://lore.kernel.org/r/20220418174103.3040-1-rpearsonhpe@gmail.com
Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/infiniband/sw/rxe/rxe_resp.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
index 1d95fab606da3..9cd0eaff98dea 100644
--- a/drivers/infiniband/sw/rxe/rxe_resp.c
+++ b/drivers/infiniband/sw/rxe/rxe_resp.c
@@ -745,8 +745,14 @@ static enum resp_states read_reply(struct rxe_qp *qp,
 	}
 
 	if (res->state == rdatm_res_state_new) {
-		mr = qp->resp.mr;
-		qp->resp.mr = NULL;
+		if (!res->replay) {
+			mr = qp->resp.mr;
+			qp->resp.mr = NULL;
+		} else {
+			mr = rxe_recheck_mr(qp, res->read.rkey);
+			if (!mr)
+				return RESPST_ERR_RKEY_VIOLATION;
+		}
 
 		if (res->read.resid <= mtu)
 			opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY;

From 7b8943b821bafab492f43aafbd006b57c6b65845 Mon Sep 17 00:00:00 2001
From: Tatyana Nikolova <tatyana.e.nikolova@intel.com>
Date: Mon, 25 Apr 2022 13:17:01 -0500
Subject: [PATCH 4/9] RDMA/irdma: Flush iWARP QP if modified to ERR from RTR
 state

When connection establishment fails in iWARP mode, an app can drain the
QPs and hang because flush isn't issued when the QP is modified from RTR
state to error. Issue a flush in this case using function
irdma_cm_disconn().

Update irdma_cm_disconn() to do flush when cm_id is NULL, which is the
case when the QP is in RTR state and there is an error in the connection
establishment.

Fixes: b48c24c2d710 ("RDMA/irdma: Implement device supported verb APIs")
Link: https://lore.kernel.org/r/20220425181703.1634-2-shiraz.saleem@intel.com
Signed-off-by: Tatyana Nikolova <tatyana.e.nikolova@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/infiniband/hw/irdma/cm.c    | 16 +++++-----------
 drivers/infiniband/hw/irdma/verbs.c |  4 ++--
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c
index a98d962e5efb6..90b4113e7071a 100644
--- a/drivers/infiniband/hw/irdma/cm.c
+++ b/drivers/infiniband/hw/irdma/cm.c
@@ -3462,12 +3462,6 @@ static void irdma_cm_disconn_true(struct irdma_qp *iwqp)
 	}
 
 	cm_id = iwqp->cm_id;
-	/* make sure we havent already closed this connection */
-	if (!cm_id) {
-		spin_unlock_irqrestore(&iwqp->lock, flags);
-		return;
-	}
-
 	original_hw_tcp_state = iwqp->hw_tcp_state;
 	original_ibqp_state = iwqp->ibqp_state;
 	last_ae = iwqp->last_aeq;
@@ -3489,11 +3483,11 @@ static void irdma_cm_disconn_true(struct irdma_qp *iwqp)
 			disconn_status = -ECONNRESET;
 	}
 
-	if ((original_hw_tcp_state == IRDMA_TCP_STATE_CLOSED ||
-	     original_hw_tcp_state == IRDMA_TCP_STATE_TIME_WAIT ||
-	     last_ae == IRDMA_AE_RDMAP_ROE_BAD_LLP_CLOSE ||
-	     last_ae == IRDMA_AE_BAD_CLOSE ||
-	     last_ae == IRDMA_AE_LLP_CONNECTION_RESET || iwdev->rf->reset)) {
+	if (original_hw_tcp_state == IRDMA_TCP_STATE_CLOSED ||
+	    original_hw_tcp_state == IRDMA_TCP_STATE_TIME_WAIT ||
+	    last_ae == IRDMA_AE_RDMAP_ROE_BAD_LLP_CLOSE ||
+	    last_ae == IRDMA_AE_BAD_CLOSE ||
+	    last_ae == IRDMA_AE_LLP_CONNECTION_RESET || iwdev->rf->reset || !cm_id) {
 		issue_close = 1;
 		iwqp->cm_id = NULL;
 		qp->term_flags = 0;
diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c
index 46f475394af5f..52f3e88f85695 100644
--- a/drivers/infiniband/hw/irdma/verbs.c
+++ b/drivers/infiniband/hw/irdma/verbs.c
@@ -1618,13 +1618,13 @@ int irdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
 
 	if (issue_modify_qp && iwqp->ibqp_state > IB_QPS_RTS) {
 		if (dont_wait) {
-			if (iwqp->cm_id && iwqp->hw_tcp_state) {
+			if (iwqp->hw_tcp_state) {
 				spin_lock_irqsave(&iwqp->lock, flags);
 				iwqp->hw_tcp_state = IRDMA_TCP_STATE_CLOSED;
 				iwqp->last_aeq = IRDMA_AE_RESET_SENT;
 				spin_unlock_irqrestore(&iwqp->lock, flags);
-				irdma_cm_disconn(iwqp);
 			}
+			irdma_cm_disconn(iwqp);
 		} else {
 			int close_timer_started;
 

From 2df6d895907b2f5dfbc558cbff7801bba82cb3cc Mon Sep 17 00:00:00 2001
From: Shiraz Saleem <shiraz.saleem@intel.com>
Date: Mon, 25 Apr 2022 13:17:02 -0500
Subject: [PATCH 5/9] RDMA/irdma: Reduce iWARP QP destroy time

QP destroy is synchronous and waits for its refcnt to be decremented in
irdma_cm_node_free_cb (for iWARP) which fires after the RCU grace period
elapses.

Applications running a large number of connections are exposed to high
wait times on destroy QP for events like SIGABORT.

The long pole for this wait time is the firing of the call_rcu callback
during a CM node destroy which can be slow. It holds the QP reference
count and blocks the destroy QP from completing.

call_rcu only needs to make sure that list walkers have a reference to the
cm_node object before freeing it and thus need to wait for grace period
elapse. The rest of the connection teardown in irdma_cm_node_free_cb is
moved out of the grace period wait in irdma_destroy_connection. Also,
replace call_rcu with a simple kfree_rcu as it just needs to do a kfree on
the cm_node

Fixes: 146b9756f14c ("RDMA/irdma: Add connection manager")
Link: https://lore.kernel.org/r/20220425181703.1634-3-shiraz.saleem@intel.com
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/infiniband/hw/irdma/cm.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c
index 90b4113e7071a..638bf4a1ed946 100644
--- a/drivers/infiniband/hw/irdma/cm.c
+++ b/drivers/infiniband/hw/irdma/cm.c
@@ -2308,10 +2308,8 @@ irdma_make_cm_node(struct irdma_cm_core *cm_core, struct irdma_device *iwdev,
 	return NULL;
 }
 
-static void irdma_cm_node_free_cb(struct rcu_head *rcu_head)
+static void irdma_destroy_connection(struct irdma_cm_node *cm_node)
 {
-	struct irdma_cm_node *cm_node =
-			    container_of(rcu_head, struct irdma_cm_node, rcu_head);
 	struct irdma_cm_core *cm_core = cm_node->cm_core;
 	struct irdma_qp *iwqp;
 	struct irdma_cm_info nfo;
@@ -2359,7 +2357,6 @@ static void irdma_cm_node_free_cb(struct rcu_head *rcu_head)
 	}
 
 	cm_core->cm_free_ah(cm_node);
-	kfree(cm_node);
 }
 
 /**
@@ -2387,8 +2384,9 @@ void irdma_rem_ref_cm_node(struct irdma_cm_node *cm_node)
 
 	spin_unlock_irqrestore(&cm_core->ht_lock, flags);
 
-	/* wait for all list walkers to exit their grace period */
-	call_rcu(&cm_node->rcu_head, irdma_cm_node_free_cb);
+	irdma_destroy_connection(cm_node);
+
+	kfree_rcu(cm_node, rcu_head);
 }
 
 /**

From 1c9043ae0667a43bd87beeebbdd4bed674713629 Mon Sep 17 00:00:00 2001
From: Mustafa Ismail <mustafa.ismail@intel.com>
Date: Mon, 25 Apr 2022 13:17:03 -0500
Subject: [PATCH 6/9] RDMA/irdma: Fix possible crash due to NULL netdev in
 notifier

For some net events in irdma_net_event notifier, the netdev can be NULL
which will cause a crash in rdma_vlan_dev_real_dev.  Fix this by moving
all processing to the NETEVENT_NEIGH_UPDATE case where the netdev is
guaranteed to not be NULL.

Fixes: 6702bc147448 ("RDMA/irdma: Fix netdev notifications for vlan's")
Link: https://lore.kernel.org/r/20220425181703.1634-4-shiraz.saleem@intel.com
Signed-off-by: Mustafa Ismail <mustafa.ismail@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/infiniband/hw/irdma/utils.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c
index 346c2c5dabdf0..81760415d66c8 100644
--- a/drivers/infiniband/hw/irdma/utils.c
+++ b/drivers/infiniband/hw/irdma/utils.c
@@ -258,18 +258,16 @@ int irdma_net_event(struct notifier_block *notifier, unsigned long event,
 	u32 local_ipaddr[4] = {};
 	bool ipv4 = true;
 
-	real_dev = rdma_vlan_dev_real_dev(netdev);
-	if (!real_dev)
-		real_dev = netdev;
-
-	ibdev = ib_device_get_by_netdev(real_dev, RDMA_DRIVER_IRDMA);
-	if (!ibdev)
-		return NOTIFY_DONE;
-
-	iwdev = to_iwdev(ibdev);
-
 	switch (event) {
 	case NETEVENT_NEIGH_UPDATE:
+		real_dev = rdma_vlan_dev_real_dev(netdev);
+		if (!real_dev)
+			real_dev = netdev;
+		ibdev = ib_device_get_by_netdev(real_dev, RDMA_DRIVER_IRDMA);
+		if (!ibdev)
+			return NOTIFY_DONE;
+
+		iwdev = to_iwdev(ibdev);
 		p = (__be32 *)neigh->primary_key;
 		if (neigh->tbl->family == AF_INET6) {
 			ipv4 = false;
@@ -290,13 +288,12 @@ int irdma_net_event(struct notifier_block *notifier, unsigned long event,
 			irdma_manage_arp_cache(iwdev->rf, neigh->ha,
 					       local_ipaddr, ipv4,
 					       IRDMA_ARP_DELETE);
+		ib_device_put(ibdev);
 		break;
 	default:
 		break;
 	}
 
-	ib_device_put(ibdev);
-
 	return NOTIFY_DONE;
 }
 

From ef91271c65c12d36e4c2b61c61d4849fb6d11aa0 Mon Sep 17 00:00:00 2001
From: Cheng Xu <chengyou@linux.alibaba.com>
Date: Sun, 24 Apr 2022 16:01:03 +0800
Subject: [PATCH 7/9] RDMA/siw: Fix a condition race issue in MPA request
 processing

The calling of siw_cm_upcall and detaching new_cep with its listen_cep
should be atomistic semantics. Otherwise siw_reject may be called in a
temporary state, e,g, siw_cm_upcall is called but the new_cep->listen_cep
has not being cleared.

This fixes a WARN:

  WARNING: CPU: 7 PID: 201 at drivers/infiniband/sw/siw/siw_cm.c:255 siw_cep_put+0x125/0x130 [siw]
  CPU: 2 PID: 201 Comm: kworker/u16:22 Kdump: loaded Tainted: G            E     5.17.0-rc7 #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
  Workqueue: iw_cm_wq cm_work_handler [iw_cm]
  RIP: 0010:siw_cep_put+0x125/0x130 [siw]
  Call Trace:
   <TASK>
   siw_reject+0xac/0x180 [siw]
   iw_cm_reject+0x68/0xc0 [iw_cm]
   cm_work_handler+0x59d/0xe20 [iw_cm]
   process_one_work+0x1e2/0x3b0
   worker_thread+0x50/0x3a0
   ? rescuer_thread+0x390/0x390
   kthread+0xe5/0x110
   ? kthread_complete_and_exit+0x20/0x20
   ret_from_fork+0x1f/0x30
   </TASK>

Fixes: 6c52fdc244b5 ("rdma/siw: connection management")
Link: https://lore.kernel.org/r/d528d83466c44687f3872eadcb8c184528b2e2d4.1650526554.git.chengyou@linux.alibaba.com
Reported-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Bernard Metzler <bmt@zurich.ibm.com>
Signed-off-by: Cheng Xu <chengyou@linux.alibaba.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/infiniband/sw/siw/siw_cm.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c
index 7acdd3c3a599d..17f34d584cd9e 100644
--- a/drivers/infiniband/sw/siw/siw_cm.c
+++ b/drivers/infiniband/sw/siw/siw_cm.c
@@ -968,14 +968,15 @@ static void siw_accept_newconn(struct siw_cep *cep)
 
 		siw_cep_set_inuse(new_cep);
 		rv = siw_proc_mpareq(new_cep);
-		siw_cep_set_free(new_cep);
-
 		if (rv != -EAGAIN) {
 			siw_cep_put(cep);
 			new_cep->listen_cep = NULL;
-			if (rv)
+			if (rv) {
+				siw_cep_set_free(new_cep);
 				goto error;
+			}
 		}
+		siw_cep_set_free(new_cep);
 	}
 	return;
 

From a926a903b7dc39a8a949150258c09290998dd812 Mon Sep 17 00:00:00 2001
From: Bob Pearson <rpearsonhpe@gmail.com>
Date: Wed, 4 May 2022 15:28:17 -0500
Subject: [PATCH 8/9] RDMA/rxe: Do not call  dev_mc_add/del() under a spinlock

These routines were not intended to be called under a spinlock and will
throw debugging warnings:

   raw_local_irq_restore() called with IRQs enabled
   WARNING: CPU: 13 PID: 3107 at kernel/locking/irqflag-debug.c:10 warn_bogus_irq_restore+0x2f/0x50
   CPU: 13 PID: 3107 Comm: python3 Tainted: G            E     5.18.0-rc1+ #7
   Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
   RIP: 0010:warn_bogus_irq_restore+0x2f/0x50
   Call Trace:
    <TASK>
    _raw_spin_unlock_irqrestore+0x75/0x80
    rxe_attach_mcast+0x304/0x480 [rdma_rxe]
    ib_attach_mcast+0x88/0xa0 [ib_core]
    ib_uverbs_attach_mcast+0x186/0x1e0 [ib_uverbs]
    ib_uverbs_handler_UVERBS_METHOD_INVOKE_WRITE+0xcd/0x140 [ib_uverbs]
    ib_uverbs_cmd_verbs+0xdb0/0xea0 [ib_uverbs]
    ib_uverbs_ioctl+0xd2/0x160 [ib_uverbs]
    do_syscall_64+0x5c/0x80
    entry_SYSCALL_64_after_hwframe+0x44/0xae

Move them out of the spinlock, it is OK if there is some races setting up
the MC reception at the ethernet layer with rbtree lookups.

Fixes: 6090a0c4c7c6 ("RDMA/rxe: Cleanup rxe_mcast.c")
Link: https://lore.kernel.org/r/20220504202817.98247-1-rpearsonhpe@gmail.com
Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/infiniband/sw/rxe/rxe_mcast.c | 51 ++++++++++++---------------
 1 file changed, 23 insertions(+), 28 deletions(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c
index ae8f11cb704af..77e45cabd8ea0 100644
--- a/drivers/infiniband/sw/rxe/rxe_mcast.c
+++ b/drivers/infiniband/sw/rxe/rxe_mcast.c
@@ -38,13 +38,13 @@ static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid)
 }
 
 /**
- * rxe_mcast_delete - delete multicast address from rxe device
+ * rxe_mcast_del - delete multicast address from rxe device
  * @rxe: rxe device object
  * @mgid: multicast address as a gid
  *
  * Returns 0 on success else an error
  */
-static int rxe_mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid)
+static int rxe_mcast_del(struct rxe_dev *rxe, union ib_gid *mgid)
 {
 	unsigned char ll_addr[ETH_ALEN];
 
@@ -159,17 +159,10 @@ struct rxe_mcg *rxe_lookup_mcg(struct rxe_dev *rxe, union ib_gid *mgid)
  * @mcg: new mcg object
  *
  * Context: caller should hold rxe->mcg lock
- * Returns: 0 on success else an error
  */
-static int __rxe_init_mcg(struct rxe_dev *rxe, union ib_gid *mgid,
-			  struct rxe_mcg *mcg)
+static void __rxe_init_mcg(struct rxe_dev *rxe, union ib_gid *mgid,
+			   struct rxe_mcg *mcg)
 {
-	int err;
-
-	err = rxe_mcast_add(rxe, mgid);
-	if (unlikely(err))
-		return err;
-
 	kref_init(&mcg->ref_cnt);
 	memcpy(&mcg->mgid, mgid, sizeof(mcg->mgid));
 	INIT_LIST_HEAD(&mcg->qp_list);
@@ -184,8 +177,6 @@ static int __rxe_init_mcg(struct rxe_dev *rxe, union ib_gid *mgid,
 	 */
 	kref_get(&mcg->ref_cnt);
 	__rxe_insert_mcg(mcg);
-
-	return 0;
 }
 
 /**
@@ -209,6 +200,12 @@ static struct rxe_mcg *rxe_get_mcg(struct rxe_dev *rxe, union ib_gid *mgid)
 	if (mcg)
 		return mcg;
 
+	/* check to see if we have reached limit */
+	if (atomic_inc_return(&rxe->mcg_num) > rxe->attr.max_mcast_grp) {
+		err = -ENOMEM;
+		goto err_dec;
+	}
+
 	/* speculative alloc of new mcg */
 	mcg = kzalloc(sizeof(*mcg), GFP_KERNEL);
 	if (!mcg)
@@ -218,27 +215,23 @@ static struct rxe_mcg *rxe_get_mcg(struct rxe_dev *rxe, union ib_gid *mgid)
 	/* re-check to see if someone else just added it */
 	tmp = __rxe_lookup_mcg(rxe, mgid);
 	if (tmp) {
+		spin_unlock_irqrestore(&rxe->mcg_lock, flags);
+		atomic_dec(&rxe->mcg_num);
 		kfree(mcg);
-		mcg = tmp;
-		goto out;
-	}
-
-	if (atomic_inc_return(&rxe->mcg_num) > rxe->attr.max_mcast_grp) {
-		err = -ENOMEM;
-		goto err_dec;
+		return tmp;
 	}
 
-	err = __rxe_init_mcg(rxe, mgid, mcg);
-	if (err)
-		goto err_dec;
-out:
+	__rxe_init_mcg(rxe, mgid, mcg);
 	spin_unlock_irqrestore(&rxe->mcg_lock, flags);
-	return mcg;
 
+	/* add mcast address outside of lock */
+	err = rxe_mcast_add(rxe, mgid);
+	if (!err)
+		return mcg;
+
+	kfree(mcg);
 err_dec:
 	atomic_dec(&rxe->mcg_num);
-	spin_unlock_irqrestore(&rxe->mcg_lock, flags);
-	kfree(mcg);
 	return ERR_PTR(err);
 }
 
@@ -268,7 +261,6 @@ static void __rxe_destroy_mcg(struct rxe_mcg *mcg)
 	__rxe_remove_mcg(mcg);
 	kref_put(&mcg->ref_cnt, rxe_cleanup_mcg);
 
-	rxe_mcast_delete(mcg->rxe, &mcg->mgid);
 	atomic_dec(&rxe->mcg_num);
 }
 
@@ -282,6 +274,9 @@ static void rxe_destroy_mcg(struct rxe_mcg *mcg)
 {
 	unsigned long flags;
 
+	/* delete mcast address outside of lock */
+	rxe_mcast_del(mcg->rxe, &mcg->mgid);
+
 	spin_lock_irqsave(&mcg->rxe->mcg_lock, flags);
 	__rxe_destroy_mcg(mcg);
 	spin_unlock_irqrestore(&mcg->rxe->mcg_lock, flags);

From bfdc0edd11f9501b891a069b5bbd3b16731941e1 Mon Sep 17 00:00:00 2001
From: Bob Pearson <rpearsonhpe@gmail.com>
Date: Wed, 4 May 2022 15:28:17 -0500
Subject: [PATCH 9/9] RDMA/rxe: Change mcg_lock to a _bh lock

rxe_mcast.c currently uses _irqsave spinlocks for rxe->mcg_lock while
rxe_recv.c uses _bh spinlocks for the same lock.

As there is no case where the mcg_lock can be taken from an IRQ, change
these all to bh locks so we don't have confusing mismatched lock types on
the same spinlock.

Fixes: 6090a0c4c7c6 ("RDMA/rxe: Cleanup rxe_mcast.c")
Link: https://lore.kernel.org/r/20220504202817.98247-1-rpearsonhpe@gmail.com
Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/infiniband/sw/rxe/rxe_mcast.c | 36 +++++++++++----------------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c
index 77e45cabd8ea0..873a9b10307c0 100644
--- a/drivers/infiniband/sw/rxe/rxe_mcast.c
+++ b/drivers/infiniband/sw/rxe/rxe_mcast.c
@@ -143,11 +143,10 @@ static struct rxe_mcg *__rxe_lookup_mcg(struct rxe_dev *rxe,
 struct rxe_mcg *rxe_lookup_mcg(struct rxe_dev *rxe, union ib_gid *mgid)
 {
 	struct rxe_mcg *mcg;
-	unsigned long flags;
 
-	spin_lock_irqsave(&rxe->mcg_lock, flags);
+	spin_lock_bh(&rxe->mcg_lock);
 	mcg = __rxe_lookup_mcg(rxe, mgid);
-	spin_unlock_irqrestore(&rxe->mcg_lock, flags);
+	spin_unlock_bh(&rxe->mcg_lock);
 
 	return mcg;
 }
@@ -189,7 +188,6 @@ static void __rxe_init_mcg(struct rxe_dev *rxe, union ib_gid *mgid,
 static struct rxe_mcg *rxe_get_mcg(struct rxe_dev *rxe, union ib_gid *mgid)
 {
 	struct rxe_mcg *mcg, *tmp;
-	unsigned long flags;
 	int err;
 
 	if (rxe->attr.max_mcast_grp == 0)
@@ -211,18 +209,18 @@ static struct rxe_mcg *rxe_get_mcg(struct rxe_dev *rxe, union ib_gid *mgid)
 	if (!mcg)
 		return ERR_PTR(-ENOMEM);
 
-	spin_lock_irqsave(&rxe->mcg_lock, flags);
+	spin_lock_bh(&rxe->mcg_lock);
 	/* re-check to see if someone else just added it */
 	tmp = __rxe_lookup_mcg(rxe, mgid);
 	if (tmp) {
-		spin_unlock_irqrestore(&rxe->mcg_lock, flags);
+		spin_unlock_bh(&rxe->mcg_lock);
 		atomic_dec(&rxe->mcg_num);
 		kfree(mcg);
 		return tmp;
 	}
 
 	__rxe_init_mcg(rxe, mgid, mcg);
-	spin_unlock_irqrestore(&rxe->mcg_lock, flags);
+	spin_unlock_bh(&rxe->mcg_lock);
 
 	/* add mcast address outside of lock */
 	err = rxe_mcast_add(rxe, mgid);
@@ -272,14 +270,12 @@ static void __rxe_destroy_mcg(struct rxe_mcg *mcg)
  */
 static void rxe_destroy_mcg(struct rxe_mcg *mcg)
 {
-	unsigned long flags;
-
 	/* delete mcast address outside of lock */
 	rxe_mcast_del(mcg->rxe, &mcg->mgid);
 
-	spin_lock_irqsave(&mcg->rxe->mcg_lock, flags);
+	spin_lock_bh(&mcg->rxe->mcg_lock);
 	__rxe_destroy_mcg(mcg);
-	spin_unlock_irqrestore(&mcg->rxe->mcg_lock, flags);
+	spin_unlock_bh(&mcg->rxe->mcg_lock);
 }
 
 /**
@@ -334,25 +330,24 @@ static int rxe_attach_mcg(struct rxe_mcg *mcg, struct rxe_qp *qp)
 {
 	struct rxe_dev *rxe = mcg->rxe;
 	struct rxe_mca *mca, *tmp;
-	unsigned long flags;
 	int err;
 
 	/* check to see if the qp is already a member of the group */
-	spin_lock_irqsave(&rxe->mcg_lock, flags);
+	spin_lock_bh(&rxe->mcg_lock);
 	list_for_each_entry(mca, &mcg->qp_list, qp_list) {
 		if (mca->qp == qp) {
-			spin_unlock_irqrestore(&rxe->mcg_lock, flags);
+			spin_unlock_bh(&rxe->mcg_lock);
 			return 0;
 		}
 	}
-	spin_unlock_irqrestore(&rxe->mcg_lock, flags);
+	spin_unlock_bh(&rxe->mcg_lock);
 
 	/* speculative alloc new mca without using GFP_ATOMIC */
 	mca = kzalloc(sizeof(*mca), GFP_KERNEL);
 	if (!mca)
 		return -ENOMEM;
 
-	spin_lock_irqsave(&rxe->mcg_lock, flags);
+	spin_lock_bh(&rxe->mcg_lock);
 	/* re-check to see if someone else just attached qp */
 	list_for_each_entry(tmp, &mcg->qp_list, qp_list) {
 		if (tmp->qp == qp) {
@@ -366,7 +361,7 @@ static int rxe_attach_mcg(struct rxe_mcg *mcg, struct rxe_qp *qp)
 	if (err)
 		kfree(mca);
 out:
-	spin_unlock_irqrestore(&rxe->mcg_lock, flags);
+	spin_unlock_bh(&rxe->mcg_lock);
 	return err;
 }
 
@@ -400,9 +395,8 @@ static int rxe_detach_mcg(struct rxe_mcg *mcg, struct rxe_qp *qp)
 {
 	struct rxe_dev *rxe = mcg->rxe;
 	struct rxe_mca *mca, *tmp;
-	unsigned long flags;
 
-	spin_lock_irqsave(&rxe->mcg_lock, flags);
+	spin_lock_bh(&rxe->mcg_lock);
 	list_for_each_entry_safe(mca, tmp, &mcg->qp_list, qp_list) {
 		if (mca->qp == qp) {
 			__rxe_cleanup_mca(mca, mcg);
@@ -416,13 +410,13 @@ static int rxe_detach_mcg(struct rxe_mcg *mcg, struct rxe_qp *qp)
 			if (atomic_read(&mcg->qp_num) <= 0)
 				__rxe_destroy_mcg(mcg);
 
-			spin_unlock_irqrestore(&rxe->mcg_lock, flags);
+			spin_unlock_bh(&rxe->mcg_lock);
 			return 0;
 		}
 	}
 
 	/* we didn't find the qp on the list */
-	spin_unlock_irqrestore(&rxe->mcg_lock, flags);
+	spin_unlock_bh(&rxe->mcg_lock);
 	return -EINVAL;
 }