From 91ca18660e195df426522b29190940abb3010425 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:25:09 -0400 Subject: [PATCH 01/20] xprtrdma: xprt_release_rqst_cong is called outside of transport_lock Since commit ce7c252a8c74 ("SUNRPC: Add a separate spinlock to protect the RPC request receive list") the RPC/RDMA reply handler has been calling xprt_release_rqst_cong without holding xprt->transport_lock. I think the only way this call is ever made is if the credit grant increases and there are RPCs pending. Current server implementations do not change their credit grant during operation (except at connect time). Commit e7ce710a8802 ("xprtrdma: Avoid deadlock when credit window is reset") added the ->release_rqst call because UDP invokes xprt_adjust_cwnd(), which calls __xprt_put_cong() after adjusting xprt->cwnd. Both xprt_release() and ->xprt_release_xprt already wake another task in this case, so it is safe to remove this call from the reply handler. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index c8ae983c6cc01..293b3d3e3e65d 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1216,7 +1216,6 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; struct rpc_xprt *xprt = &r_xprt->rx_xprt; struct rpc_rqst *rqst = rep->rr_rqst; - unsigned long cwnd; int status; xprt->reestablish_timeout = 0; @@ -1239,11 +1238,6 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) out: spin_lock(&xprt->recv_lock); - cwnd = xprt->cwnd; - xprt->cwnd = r_xprt->rx_buf.rb_credits << RPC_CWNDSHIFT; - if (xprt->cwnd > cwnd) - xprt_release_rqst_cong(rqst->rq_task); - xprt_complete_rqst(rqst->rq_task, status); xprt_unpin_rqst(rqst); spin_unlock(&xprt->recv_lock); @@ -1350,14 +1344,18 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) if (!rqst) goto out_norqst; xprt_pin_rqst(rqst); + spin_unlock(&xprt->recv_lock); if (credits == 0) credits = 1; /* don't deadlock */ else if (credits > buf->rb_max_requests) credits = buf->rb_max_requests; - buf->rb_credits = credits; - - spin_unlock(&xprt->recv_lock); + if (buf->rb_credits != credits) { + spin_lock_bh(&xprt->transport_lock); + buf->rb_credits = credits; + xprt->cwnd = credits << RPC_CWNDSHIFT; + spin_unlock_bh(&xprt->transport_lock); + } req = rpcr_to_rdmar(rqst); req->rl_reply = rep; From ef739b2175dde9c05594f768cb78149f1ce2ac36 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:25:14 -0400 Subject: [PATCH 02/20] xprtrdma: Reset credit grant properly after a disconnect On a fresh connection, an RPC/RDMA client is supposed to send only one RPC Call until it gets a credit grant in the first RPC Reply from the server [RFC 8166, Section 3.3.3]. There is a bug in the Linux client's credit accounting mechanism introduced by commit e7ce710a8802 ("xprtrdma: Avoid deadlock when credit window is reset"). On connect, it simply dumps all pending RPC Calls onto the new connection. Servers have been tolerant of this bad behavior. Currently no server implementation ever changes its credit grant over reconnects, and servers always repost enough Receives before connections are fully established. To correct this issue, ensure that the client resets both the credit grant _and_ the congestion window when handling a reconnect. 
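For context, the relationship between the server's credit grant and the congestion window that both of these patches manipulate is a simple fixed-point mapping. A minimal sketch, not part of either patch, assuming the RPC_CWNDSHIFT constant from include/linux/sunrpc/xprt.h; the clamping mirrors the reply-handler hunk above:

	/* Sketch: how xprtrdma derives the generic RPC congestion
	 * window from the server's most recent RPC/RDMA credit grant.
	 */
	static void rpcrdma_cwnd_sketch(struct rpc_xprt *xprt, u32 credits)
	{
		if (credits == 0)
			credits = 1;	/* never allow zero: avoid deadlock */
		xprt->cwnd = credits << RPC_CWNDSHIFT;
	}

On reconnect, this patch simply returns both values to their initial state, before any RPC Reply carrying a fresh grant has arrived.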
Fixes: e7ce710a8802 ("xprtrdma: Avoid deadlock when credit ... ") Signed-off-by: Chuck Lever Cc: stable@kernel.org Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 1 + net/sunrpc/xprtrdma/transport.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index a68180090554f..b9827665ff355 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -248,6 +248,7 @@ static void xprt_rdma_bc_close(struct rpc_xprt *xprt) { dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); + xprt->cwnd = RPC_CWNDSHIFT; } static void diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 143ce2579ba90..98cbc7b060bad 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -468,6 +468,12 @@ xprt_rdma_close(struct rpc_xprt *xprt) xprt->reestablish_timeout = 0; xprt_disconnect_done(xprt); rpcrdma_ep_disconnect(ep, ia); + + /* Prepare @xprt for the next connection by reinitializing + * its credit grant to one (see RFC 8166, Section 3.3.3). + */ + r_xprt->rx_buf.rb_credits = 1; + xprt->cwnd = RPC_CWNDSHIFT; } /** From c421ece68f6952d4cc48ee81ebfc61ef0b83ad3b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:25:20 -0400 Subject: [PATCH 03/20] xprtrdma: Create more MRs at a time Some devices require more than 3 MRs to build a single 1MB I/O. Ensure that rpcrdma_mrs_create() will add enough MRs to build that I/O. In a subsequent patch I'm changing the MR recovery logic to just toss out the MRs. In that case it's possible for ->send_request to loop acquiring some MRs, not getting enough, getting called again, recycling the previous MRs, then not getting enough, lather rinse repeat. Thus first we need to ensure enough MRs are created to prevent that loop. I'm "reusing" ia->ri_max_segs. All of its accessors seem to want the maximum number of data segments plus two, so I'm going to bake that into the initial calculation. 
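To make the sizing concrete, a worked example (a sketch with an illustrative device limit; with 4KB pages, RPCRDMA_MAX_DATA_SEGS is 1MB / PAGE_SIZE == 256):

	/* Sketch: the MR budget after this patch, for a hypothetical
	 * FRWR device with ri_max_frwr_depth == 30:
	 *
	 *	256 / 30 == 8 MRs for the data payload (integer division),
	 *	plus 2 for the head and tail buffers, so ri_max_segs == 10.
	 *
	 * rpcrdma_mrs_create() then allocates ri_max_segs MRs per call
	 * rather than the previous fixed count of 3.
	 */
	ia->ri_max_segs = max_t(unsigned int, 1,
				RPCRDMA_MAX_DATA_SEGS / ia->ri_max_frwr_depth);
	ia->ri_max_segs += 2;	/* segments for head and tail buffers */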
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/fmr_ops.c | 1 + net/sunrpc/xprtrdma/frwr_ops.c | 1 + net/sunrpc/xprtrdma/rpc_rdma.c | 2 -- net/sunrpc/xprtrdma/verbs.c | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 0f7c465d9a5aa..db589a23682b3 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -187,6 +187,7 @@ fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES); + ia->ri_max_segs += 2; /* segments for head and tail buffers */ return 0; } diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 1bb00dd6ccdb8..1cc4db515c85f 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -276,6 +276,7 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / ia->ri_max_frwr_depth); + ia->ri_max_segs += 2; /* segments for head and tail buffers */ return 0; } diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 293b3d3e3e65d..15edc050ca934 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -71,7 +71,6 @@ static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs) size = RPCRDMA_HDRLEN_MIN; /* Maximum Read list size */ - maxsegs += 2; /* segment for head and tail buffers */ size = maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32); /* Minimal Read chunk size */ @@ -97,7 +96,6 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) size = RPCRDMA_HDRLEN_MIN; /* Maximum Write list size */ - maxsegs += 2; /* segment for head and tail buffers */ size = sizeof(__be32); /* segment count */ size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32); size += sizeof(__be32); /* list discriminator */ diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 956a5ea47b58e..5625a5089f96f 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1019,7 +1019,7 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) LIST_HEAD(free); LIST_HEAD(all); - for (count = 0; count < 3; count++) { + for (count = 0; count < ia->ri_max_segs; count++) { struct rpcrdma_mr *mr; int rc; From 61da886bf74e738995d359fa14d77f72d14cfb87 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:25:25 -0400 Subject: [PATCH 04/20] xprtrdma: Explicitly resetting MRs is no longer necessary When a memory operation fails, the MR's driver state might not match its hardware state. The only reliable recourse is to dereg the MR. This is done in ->ro_recover_mr, which then attempts to allocate a fresh MR to replace the released MR. Since commit e2ac236c0b651 ("xprtrdma: Allocate MRs on demand"), xprtrdma dynamically allocates MRs. It can add more MRs whenever they are needed. That makes it possible to simply release an MR when a memory operation fails, instead of "recovering" it. It will automatically be replaced by the on-demand MR allocator. This commit is a little larger than I wanted, but it replaces ->ro_recover_mr, rb_recovery_lock, rb_recovery_worker, and the rb_stale_mrs list with a generic work queue. Since MRs are no longer orphaned, the mrs_orphaned metric is no longer used. 
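The replacement mechanism is the stock Linux workqueue pattern, condensed here as a sketch (field and function names as they appear in this patch):

	/* Each MR embeds a work_struct; hot paths only schedule it. */
	struct rpcrdma_mr {
		/* ... */
		struct work_struct	mr_recycle;	/* set up by ->ro_init_mr */
	};

	static void mr_recycle_worker(struct work_struct *work)
	{
		struct rpcrdma_mr *mr =
			container_of(work, struct rpcrdma_mr, mr_recycle);

		/* Runs in process context, so it is safe to DMA-unmap,
		 * deregister, and free the MR here.
		 */
	}

	static inline void rpcrdma_mr_recycle(struct rpcrdma_mr *mr)
	{
		schedule_work(&mr->mr_recycle);
	}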
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 2 +- net/sunrpc/xprtrdma/fmr_ops.c | 124 ++++++++++++++---------------- net/sunrpc/xprtrdma/frwr_ops.c | 130 ++++++++++++-------------------- net/sunrpc/xprtrdma/rpc_rdma.c | 2 +- net/sunrpc/xprtrdma/transport.c | 2 +- net/sunrpc/xprtrdma/verbs.c | 38 ---------- net/sunrpc/xprtrdma/xprt_rdma.h | 14 ++-- 7 files changed, 114 insertions(+), 198 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 53df203b8057a..9906f4d7d9fb0 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -655,7 +655,7 @@ DEFINE_MR_EVENT(xprtrdma_localinv); DEFINE_MR_EVENT(xprtrdma_dma_map); DEFINE_MR_EVENT(xprtrdma_dma_unmap); DEFINE_MR_EVENT(xprtrdma_remoteinv); -DEFINE_MR_EVENT(xprtrdma_recover_mr); +DEFINE_MR_EVENT(xprtrdma_mr_recycle); /** ** Reply events diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index db589a23682b3..f1cf3a36a9926 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -49,46 +49,7 @@ fmr_is_supported(struct rpcrdma_ia *ia) return true; } -static int -fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) -{ - static struct ib_fmr_attr fmr_attr = { - .max_pages = RPCRDMA_MAX_FMR_SGES, - .max_maps = 1, - .page_shift = PAGE_SHIFT - }; - - mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, - sizeof(u64), GFP_KERNEL); - if (!mr->fmr.fm_physaddrs) - goto out_free; - - mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, - sizeof(*mr->mr_sg), GFP_KERNEL); - if (!mr->mr_sg) - goto out_free; - - sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES); - - mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, - &fmr_attr); - if (IS_ERR(mr->fmr.fm_mr)) - goto out_fmr_err; - - INIT_LIST_HEAD(&mr->mr_list); - return 0; - -out_fmr_err: - dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, - PTR_ERR(mr->fmr.fm_mr)); - -out_free: - kfree(mr->mr_sg); - kfree(mr->fmr.fm_physaddrs); - return -ENOMEM; -} - -static int +static void __fmr_unmap(struct rpcrdma_mr *mr) { LIST_HEAD(l); @@ -97,13 +58,16 @@ __fmr_unmap(struct rpcrdma_mr *mr) list_add(&mr->fmr.fm_mr->list, &l); rc = ib_unmap_fmr(&l); list_del(&mr->fmr.fm_mr->list); - return rc; + if (rc) + pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", + mr, rc); } +/* Release an MR. + */ static void fmr_op_release_mr(struct rpcrdma_mr *mr) { - LIST_HEAD(unmap_list); int rc; kfree(mr->fmr.fm_physaddrs); @@ -112,10 +76,7 @@ fmr_op_release_mr(struct rpcrdma_mr *mr) /* In case this one was left mapped, try to unmap it * to prevent dealloc_fmr from failing with EBUSY */ - rc = __fmr_unmap(mr); - if (rc) - pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", - mr, rc); + __fmr_unmap(mr); rc = ib_dealloc_fmr(mr->fmr.fm_mr); if (rc) @@ -125,28 +86,16 @@ fmr_op_release_mr(struct rpcrdma_mr *mr) kfree(mr); } -/* Reset of a single FMR. +/* MRs are dynamically allocated, so simply clean up and release the MR. + * A replacement MR will subsequently be allocated on demand. 
*/ static void -fmr_op_recover_mr(struct rpcrdma_mr *mr) +fmr_mr_recycle_worker(struct work_struct *work) { + struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle); struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - int rc; - /* ORDER: invalidate first */ - rc = __fmr_unmap(mr); - if (rc) - goto out_release; - - /* ORDER: then DMA unmap */ - rpcrdma_mr_unmap_and_put(mr); - - r_xprt->rx_stats.mrs_recovered++; - return; - -out_release: - pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mr); - r_xprt->rx_stats.mrs_orphaned++; + trace_xprtrdma_mr_recycle(mr); trace_xprtrdma_dma_unmap(mr); ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, @@ -154,11 +103,51 @@ fmr_op_recover_mr(struct rpcrdma_mr *mr) spin_lock(&r_xprt->rx_buf.rb_mrlock); list_del(&mr->mr_all); + r_xprt->rx_stats.mrs_recycled++; spin_unlock(&r_xprt->rx_buf.rb_mrlock); - fmr_op_release_mr(mr); } +static int +fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) +{ + static struct ib_fmr_attr fmr_attr = { + .max_pages = RPCRDMA_MAX_FMR_SGES, + .max_maps = 1, + .page_shift = PAGE_SHIFT + }; + + mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, + sizeof(u64), GFP_KERNEL); + if (!mr->fmr.fm_physaddrs) + goto out_free; + + mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, + sizeof(*mr->mr_sg), GFP_KERNEL); + if (!mr->mr_sg) + goto out_free; + + sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES); + + mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, + &fmr_attr); + if (IS_ERR(mr->fmr.fm_mr)) + goto out_fmr_err; + + INIT_LIST_HEAD(&mr->mr_list); + INIT_WORK(&mr->mr_recycle, fmr_mr_recycle_worker); + return 0; + +out_fmr_err: + dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, + PTR_ERR(mr->fmr.fm_mr)); + +out_free: + kfree(mr->mr_sg); + kfree(mr->fmr.fm_physaddrs); + return -ENOMEM; +} + /* On success, sets: * ep->rep_attr.cap.max_send_wr * ep->rep_attr.cap.max_recv_wr @@ -312,7 +301,7 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) r_xprt->rx_stats.local_inv_needed++; rc = ib_unmap_fmr(&unmap_list); if (rc) - goto out_reset; + goto out_release; /* ORDER: Now DMA unmap all of the req's MRs, and return * them to the free MW list. @@ -325,13 +314,13 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) return; -out_reset: +out_release: pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc); while (!list_empty(mrs)) { mr = rpcrdma_mr_pop(mrs); list_del(&mr->fmr.fm_mr->list); - fmr_op_recover_mr(mr); + rpcrdma_mr_recycle(mr); } } @@ -339,7 +328,6 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, .ro_send = fmr_op_send, .ro_unmap_sync = fmr_op_unmap_sync, - .ro_recover_mr = fmr_op_recover_mr, .ro_open = fmr_op_open, .ro_maxpages = fmr_op_maxpages, .ro_init_mr = fmr_op_init_mr, diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 1cc4db515c85f..6594627e3c7d8 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -97,6 +97,44 @@ frwr_is_supported(struct rpcrdma_ia *ia) return false; } +static void +frwr_op_release_mr(struct rpcrdma_mr *mr) +{ + int rc; + + rc = ib_dereg_mr(mr->frwr.fr_mr); + if (rc) + pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", + mr, rc); + kfree(mr->mr_sg); + kfree(mr); +} + +/* MRs are dynamically allocated, so simply clean up and release the MR. + * A replacement MR will subsequently be allocated on demand. 
+ */ +static void +frwr_mr_recycle_worker(struct work_struct *work) +{ + struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle); + enum rpcrdma_frwr_state state = mr->frwr.fr_state; + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; + + trace_xprtrdma_mr_recycle(mr); + + if (state != FRWR_FLUSHED_LI) { + trace_xprtrdma_dma_unmap(mr); + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mr->mr_sg, mr->mr_nents, mr->mr_dir); + } + + spin_lock(&r_xprt->rx_buf.rb_mrlock); + list_del(&mr->mr_all); + r_xprt->rx_stats.mrs_recycled++; + spin_unlock(&r_xprt->rx_buf.rb_mrlock); + frwr_op_release_mr(mr); +} + static int frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) { @@ -113,6 +151,7 @@ frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) goto out_list_err; INIT_LIST_HEAD(&mr->mr_list); + INIT_WORK(&mr->mr_recycle, frwr_mr_recycle_worker); sg_init_table(mr->mr_sg, depth); init_completion(&frwr->fr_linv_done); return 0; @@ -131,79 +170,6 @@ frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) return rc; } -static void -frwr_op_release_mr(struct rpcrdma_mr *mr) -{ - int rc; - - rc = ib_dereg_mr(mr->frwr.fr_mr); - if (rc) - pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", - mr, rc); - kfree(mr->mr_sg); - kfree(mr); -} - -static int -__frwr_mr_reset(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) -{ - struct rpcrdma_frwr *frwr = &mr->frwr; - int rc; - - rc = ib_dereg_mr(frwr->fr_mr); - if (rc) { - pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n", - rc, mr); - return rc; - } - - frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, - ia->ri_max_frwr_depth); - if (IS_ERR(frwr->fr_mr)) { - pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n", - PTR_ERR(frwr->fr_mr), mr); - return PTR_ERR(frwr->fr_mr); - } - - dprintk("RPC: %s: recovered FRWR %p\n", __func__, frwr); - frwr->fr_state = FRWR_IS_INVALID; - return 0; -} - -/* Reset of a single FRWR. Generate a fresh rkey by replacing the MR. 
- */ -static void -frwr_op_recover_mr(struct rpcrdma_mr *mr) -{ - enum rpcrdma_frwr_state state = mr->frwr.fr_state; - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int rc; - - rc = __frwr_mr_reset(ia, mr); - if (state != FRWR_FLUSHED_LI) { - trace_xprtrdma_dma_unmap(mr); - ib_dma_unmap_sg(ia->ri_device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - } - if (rc) - goto out_release; - - rpcrdma_mr_put(mr); - r_xprt->rx_stats.mrs_recovered++; - return; - -out_release: - pr_err("rpcrdma: FRWR reset failed %d, %p released\n", rc, mr); - r_xprt->rx_stats.mrs_orphaned++; - - spin_lock(&r_xprt->rx_buf.rb_mrlock); - list_del(&mr->mr_all); - spin_unlock(&r_xprt->rx_buf.rb_mrlock); - - frwr_op_release_mr(mr); -} - /* On success, sets: * ep->rep_attr.cap.max_send_wr * ep->rep_attr.cap.max_recv_wr @@ -385,7 +351,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mr = NULL; do { if (mr) - rpcrdma_mr_defer_recovery(mr); + rpcrdma_mr_recycle(mr); mr = rpcrdma_mr_get(r_xprt); if (!mr) return ERR_PTR(-EAGAIN); @@ -452,7 +418,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, out_mapmr_err: pr_err("rpcrdma: failed to map mr %p (%d/%d)\n", frwr->fr_mr, n, mr->mr_nents); - rpcrdma_mr_defer_recovery(mr); + rpcrdma_mr_recycle(mr); return ERR_PTR(-EIO); } @@ -571,7 +537,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) if (bad_wr != first) wait_for_completion(&frwr->fr_linv_done); if (rc) - goto reset_mrs; + goto out_release; /* ORDER: Now DMA unmap all of the MRs, and return * them to the free MR list. @@ -583,22 +549,21 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) } return; -reset_mrs: +out_release: pr_err("rpcrdma: FRWR invalidate ib_post_send returned %i\n", rc); - /* Find and reset the MRs in the LOCAL_INV WRs that did not + /* Unmap and release the MRs in the LOCAL_INV WRs that did not * get posted. 
*/ while (bad_wr) { frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); mr = container_of(frwr, struct rpcrdma_mr, frwr); - - __frwr_mr_reset(ia, mr); - bad_wr = bad_wr->next; + + list_del(&mr->mr_list); + frwr_op_release_mr(mr); } - goto unmap; } const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { @@ -606,7 +571,6 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_send = frwr_op_send, .ro_reminv = frwr_op_reminv, .ro_unmap_sync = frwr_op_unmap_sync, - .ro_recover_mr = frwr_op_recover_mr, .ro_open = frwr_op_open, .ro_maxpages = frwr_op_maxpages, .ro_init_mr = frwr_op_init_mr, diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 15edc050ca934..228aee8516672 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -803,7 +803,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) struct rpcrdma_mr *mr; mr = rpcrdma_mr_pop(&req->rl_registered); - rpcrdma_mr_defer_recovery(mr); + rpcrdma_mr_recycle(mr); } /* This implementation supports the following combinations diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 98cbc7b060bad..3ae73e6a5c939 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -792,7 +792,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) r_xprt->rx_stats.bad_reply_count, r_xprt->rx_stats.nomsg_call_count); seq_printf(seq, "%lu %lu %lu %lu %lu %lu\n", - r_xprt->rx_stats.mrs_recovered, + r_xprt->rx_stats.mrs_recycled, r_xprt->rx_stats.mrs_orphaned, r_xprt->rx_stats.mrs_allocated, r_xprt->rx_stats.local_inv_needed, diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 5625a5089f96f..88fe75e6d41a3 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -977,39 +977,6 @@ rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc) } } -static void -rpcrdma_mr_recovery_worker(struct work_struct *work) -{ - struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, - rb_recovery_worker.work); - struct rpcrdma_mr *mr; - - spin_lock(&buf->rb_recovery_lock); - while (!list_empty(&buf->rb_stale_mrs)) { - mr = rpcrdma_mr_pop(&buf->rb_stale_mrs); - spin_unlock(&buf->rb_recovery_lock); - - trace_xprtrdma_recover_mr(mr); - mr->mr_xprt->rx_ia.ri_ops->ro_recover_mr(mr); - - spin_lock(&buf->rb_recovery_lock); - } - spin_unlock(&buf->rb_recovery_lock); -} - -void -rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr) -{ - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - - spin_lock(&buf->rb_recovery_lock); - rpcrdma_mr_push(mr, &buf->rb_stale_mrs); - spin_unlock(&buf->rb_recovery_lock); - - schedule_delayed_work(&buf->rb_recovery_worker, 0); -} - static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) { @@ -1142,14 +1109,10 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) buf->rb_bc_srv_max_requests = 0; spin_lock_init(&buf->rb_mrlock); spin_lock_init(&buf->rb_lock); - spin_lock_init(&buf->rb_recovery_lock); INIT_LIST_HEAD(&buf->rb_mrs); INIT_LIST_HEAD(&buf->rb_all); - INIT_LIST_HEAD(&buf->rb_stale_mrs); INIT_DELAYED_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker); - INIT_DELAYED_WORK(&buf->rb_recovery_worker, - rpcrdma_mr_recovery_worker); rpcrdma_mrs_create(r_xprt); @@ -1233,7 +1196,6 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { - cancel_delayed_work_sync(&buf->rb_recovery_worker); 
cancel_delayed_work_sync(&buf->rb_refresh_worker); rpcrdma_sendctxs_destroy(buf); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 2ca14f7c2d51a..eae21668e6922 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -280,6 +280,7 @@ struct rpcrdma_mr { u32 mr_handle; u32 mr_length; u64 mr_offset; + struct work_struct mr_recycle; struct list_head mr_all; }; @@ -411,9 +412,6 @@ struct rpcrdma_buffer { u32 rb_bc_max_requests; - spinlock_t rb_recovery_lock; /* protect rb_stale_mrs */ - struct list_head rb_stale_mrs; - struct delayed_work rb_recovery_worker; struct delayed_work rb_refresh_worker; }; #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) @@ -452,7 +450,7 @@ struct rpcrdma_stats { unsigned long hardway_register_count; unsigned long failed_marshal_count; unsigned long bad_reply_count; - unsigned long mrs_recovered; + unsigned long mrs_recycled; unsigned long mrs_orphaned; unsigned long mrs_allocated; unsigned long empty_sendctx_q; @@ -481,7 +479,6 @@ struct rpcrdma_memreg_ops { struct list_head *mrs); void (*ro_unmap_sync)(struct rpcrdma_xprt *, struct list_head *); - void (*ro_recover_mr)(struct rpcrdma_mr *mr); int (*ro_open)(struct rpcrdma_ia *, struct rpcrdma_ep *, struct rpcrdma_create_data_internal *); @@ -578,7 +575,12 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf); struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); void rpcrdma_mr_put(struct rpcrdma_mr *mr); void rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr); -void rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr); + +static inline void +rpcrdma_mr_recycle(struct rpcrdma_mr *mr) +{ + schedule_work(&mr->mr_recycle); +} struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); void rpcrdma_buffer_put(struct rpcrdma_req *); From d379eaa838f1813ca906b946ad3cbb77781d2be7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:25:30 -0400 Subject: [PATCH 05/20] xprtrdma: Name MR trace events consistently Clean up the names of trace events related to MRs so that it's easy to enable these with a glob. 
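The rename works through ordinary token pasting. The TP_ARGS line below is inferred from the usual tracepoint pattern (it is elided in the hunk that follows):

	/* DEFINE_MR_EVENT(map) now emits an event named xprtrdma_mr_map;
	 * likewise unmap, localinv, remoteinv, and recycle, so all five
	 * match a single tracefs glob such as "xprtrdma_mr_*".
	 */
	#define DEFINE_MR_EVENT(name) \
			DEFINE_EVENT(xprtrdma_mr, xprtrdma_mr_##name, \
					TP_PROTO( \
						const struct rpcrdma_mr *mr \
					), \
					TP_ARGS(mr))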
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 12 ++++++------ net/sunrpc/xprtrdma/fmr_ops.c | 6 +++--- net/sunrpc/xprtrdma/frwr_ops.c | 8 ++++---- net/sunrpc/xprtrdma/verbs.c | 2 +- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 9906f4d7d9fb0..89e017b0f0ec6 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -263,7 +263,7 @@ DECLARE_EVENT_CLASS(xprtrdma_mr, ); #define DEFINE_MR_EVENT(name) \ - DEFINE_EVENT(xprtrdma_mr, name, \ + DEFINE_EVENT(xprtrdma_mr, xprtrdma_mr_##name, \ TP_PROTO( \ const struct rpcrdma_mr *mr \ ), \ @@ -651,11 +651,11 @@ DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_fastreg); DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li); DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li_wake); -DEFINE_MR_EVENT(xprtrdma_localinv); -DEFINE_MR_EVENT(xprtrdma_dma_map); -DEFINE_MR_EVENT(xprtrdma_dma_unmap); -DEFINE_MR_EVENT(xprtrdma_remoteinv); -DEFINE_MR_EVENT(xprtrdma_mr_recycle); +DEFINE_MR_EVENT(localinv); +DEFINE_MR_EVENT(map); +DEFINE_MR_EVENT(unmap); +DEFINE_MR_EVENT(remoteinv); +DEFINE_MR_EVENT(recycle); /** ** Reply events diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index f1cf3a36a9926..7f5632cd5a48a 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -97,7 +97,7 @@ fmr_mr_recycle_worker(struct work_struct *work) trace_xprtrdma_mr_recycle(mr); - trace_xprtrdma_dma_unmap(mr); + trace_xprtrdma_mr_unmap(mr); ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, mr->mr_sg, mr->mr_nents, mr->mr_dir); @@ -234,7 +234,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mr->mr_sg, i, mr->mr_dir); if (!mr->mr_nents) goto out_dmamap_err; - trace_xprtrdma_dma_map(mr); + trace_xprtrdma_mr_map(mr); for (i = 0, dma_pages = mr->fmr.fm_physaddrs; i < mr->mr_nents; i++) dma_pages[i] = sg_dma_address(&mr->mr_sg[i]); @@ -295,7 +295,7 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) list_for_each_entry(mr, mrs, mr_list) { dprintk("RPC: %s: unmapping fmr %p\n", __func__, &mr->fmr); - trace_xprtrdma_localinv(mr); + trace_xprtrdma_mr_localinv(mr); list_add_tail(&mr->fmr.fm_mr->list, &unmap_list); } r_xprt->rx_stats.local_inv_needed++; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 6594627e3c7d8..fc6378cc0c1c7 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -123,7 +123,7 @@ frwr_mr_recycle_worker(struct work_struct *work) trace_xprtrdma_mr_recycle(mr); if (state != FRWR_FLUSHED_LI) { - trace_xprtrdma_dma_unmap(mr); + trace_xprtrdma_mr_unmap(mr); ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, mr->mr_sg, mr->mr_nents, mr->mr_dir); } @@ -384,7 +384,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mr->mr_nents = ib_dma_map_sg(ia->ri_device, mr->mr_sg, i, mr->mr_dir); if (!mr->mr_nents) goto out_dmamap_err; - trace_xprtrdma_dma_map(mr); + trace_xprtrdma_mr_map(mr); ibmr = frwr->fr_mr; n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE); @@ -466,7 +466,7 @@ frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) list_for_each_entry(mr, mrs, mr_list) if (mr->mr_handle == rep->rr_inv_rkey) { list_del_init(&mr->mr_list); - trace_xprtrdma_remoteinv(mr); + trace_xprtrdma_mr_remoteinv(mr); mr->frwr.fr_state = FRWR_IS_INVALID; rpcrdma_mr_unmap_and_put(mr); break; /* only one invalidated MR per RPC */ @@ -503,7 +503,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct 
list_head *mrs) mr->frwr.fr_state = FRWR_IS_INVALID; frwr = &mr->frwr; - trace_xprtrdma_localinv(mr); + trace_xprtrdma_mr_localinv(mr); frwr->fr_cqe.done = frwr_wc_localinv; last = &frwr->fr_invwr; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 88fe75e6d41a3..3a9a62d282834 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1288,7 +1288,7 @@ rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr) { struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - trace_xprtrdma_dma_unmap(mr); + trace_xprtrdma_mr_unmap(mr); ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, mr->mr_sg, mr->mr_nents, mr->mr_dir); __rpcrdma_mr_put(&r_xprt->rx_buf, mr); From 3968a8a5310404c2f0b9e4d9f28cab13a12bc4fd Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:25:36 -0400 Subject: [PATCH 06/20] sunrpc: Fix connect metrics For TCP, the logic in xprt_connect_status is currently never invoked to record a successful connection. Commit 2a4919919a97 ("SUNRPC: Return EAGAIN instead of ENOTCONN when waking up xprt->pending") changed the way TCP xprt's are awoken after a connect succeeds. Instead, change connection-oriented transports to bump connect_count and compute connect_time the moment that XPRT_CONNECTED is set. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprt.c | 14 ++++---------- net/sunrpc/xprtrdma/transport.c | 6 +++++- net/sunrpc/xprtsock.c | 10 ++++++---- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index a8db2e3f89044..93c7a2f4a2664 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -789,17 +789,11 @@ void xprt_connect(struct rpc_task *task) static void xprt_connect_status(struct rpc_task *task) { - struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; - - if (task->tk_status == 0) { - xprt->stat.connect_count++; - xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; + switch (task->tk_status) { + case 0: dprintk("RPC: %5u xprt_connect_status: connection established\n", task->tk_pid); - return; - } - - switch (task->tk_status) { + break; case -ECONNREFUSED: case -ECONNRESET: case -ECONNABORTED: @@ -816,7 +810,7 @@ static void xprt_connect_status(struct rpc_task *task) default: dprintk("RPC: %5u xprt_connect_status: error %d connecting to " "server %s\n", task->tk_pid, -task->tk_status, - xprt->servername); + task->tk_rqstp->rq_xprt->servername); task->tk_status = -EIO; } } diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 3ae73e6a5c939..087acfce142a3 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -242,8 +242,12 @@ rpcrdma_connect_worker(struct work_struct *work) spin_lock_bh(&xprt->transport_lock); if (ep->rep_connected > 0) { - if (!xprt_test_and_set_connected(xprt)) + if (!xprt_test_and_set_connected(xprt)) { + xprt->stat.connect_count++; + xprt->stat.connect_time += (long)jiffies - + xprt->stat.connect_start; xprt_wake_pending_tasks(xprt, 0); + } } else { if (xprt_test_and_clear_connected(xprt)) xprt_wake_pending_tasks(xprt, -ENOTCONN); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 6b7539c0466e8..e146caacc494c 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1611,6 +1611,9 @@ static void xs_tcp_state_change(struct sock *sk) clear_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); xprt_clear_connecting(xprt); + xprt->stat.connect_count++; + xprt->stat.connect_time += (long)jiffies - + xprt->stat.connect_start; xprt_wake_pending_tasks(xprt, -EAGAIN); } 
spin_unlock(&xprt->transport_lock); @@ -2029,8 +2032,6 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, } /* Tell the socket layer to start connecting... */ - xprt->stat.connect_count++; - xprt->stat.connect_start = jiffies; return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, 0); } @@ -2062,6 +2063,9 @@ static int xs_local_setup_socket(struct sock_xprt *transport) case 0: dprintk("RPC: xprt %p connected to %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); + xprt->stat.connect_count++; + xprt->stat.connect_time += (long)jiffies - + xprt->stat.connect_start; xprt_set_connected(xprt); case -ENOBUFS: break; @@ -2387,8 +2391,6 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) xs_set_memalloc(xprt); /* Tell the socket layer to start connecting... */ - xprt->stat.connect_count++; - xprt->stat.connect_start = jiffies; set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); switch (ret) { From 8440a886112b46a8b402679dca9d8b5662a0d73e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:25:41 -0400 Subject: [PATCH 07/20] sunrpc: Report connect_time in seconds The way connection-oriented transports report connect_time is wrong: it's supposed to be in seconds, not in jiffies. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 2 +- net/sunrpc/xprtsock.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 087acfce142a3..289d13cad638c 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -776,7 +776,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) 0, /* need a local port? */ xprt->stat.bind_count, xprt->stat.connect_count, - xprt->stat.connect_time, + xprt->stat.connect_time / HZ, idle_time, xprt->stat.sends, xprt->stat.recvs, diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index e146caacc494c..9bbc395cfd55d 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2563,7 +2563,7 @@ static void xs_local_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) "%llu %llu %lu %llu %llu\n", xprt->stat.bind_count, xprt->stat.connect_count, - xprt->stat.connect_time, + xprt->stat.connect_time / HZ, idle_time, xprt->stat.sends, xprt->stat.recvs, @@ -2618,7 +2618,7 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) transport->srcport, xprt->stat.bind_count, xprt->stat.connect_count, - xprt->stat.connect_time, + xprt->stat.connect_time / HZ, idle_time, xprt->stat.sends, xprt->stat.recvs, From ae38288eb73c52e45917fe7d05d34b84a14a7930 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:25:47 -0400 Subject: [PATCH 08/20] xprtrdma: Rename rpcrdma_conn_upcall Clean up: Use a function name that is consistent with the RDMA core API and with other consumers. Because this is a function that is invoked from outside the rpcrdma.ko module, add an appropriate documenting comment. 
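A sketch of the contract the new name reflects (registration as in the hunk that follows): the handler is supplied when the rdma_cm_id is created, is invoked with @id's mutex held and may sleep, and returns non-zero only when it has handled destruction of the ID itself.

	/* Sketch: how the RDMA core learns about the handler. */
	id = rdma_create_id(xprt->rx_xprt.xprt_net,
			    rpcrdma_cm_event_handler,	/* the renamed upcall */
			    xprt, RDMA_PS_TCP, IB_QPT_RC);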
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 2 +- net/sunrpc/xprtrdma/verbs.c | 16 +++++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 89e017b0f0ec6..3b9de5b6b725d 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -306,7 +306,7 @@ DECLARE_EVENT_CLASS(xprtrdma_cb_event, ** Connection events **/ -TRACE_EVENT(xprtrdma_conn_upcall, +TRACE_EVENT(xprtrdma_cm_event, TP_PROTO( const struct rpcrdma_xprt *r_xprt, struct rdma_cm_event *event diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 3a9a62d282834..7bf0e6529f41d 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -219,15 +219,25 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, rpcrdma_set_max_header_sizes(r_xprt); } +/** + * rpcrdma_cm_event_handler - Handle RDMA CM events + * @id: rdma_cm_id on which an event has occurred + * @event: details of the event + * + * Called with @id's mutex held. Returns 1 if caller should + * destroy @id, otherwise 0. + */ static int -rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) +rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { struct rpcrdma_xprt *xprt = id->context; struct rpcrdma_ia *ia = &xprt->rx_ia; struct rpcrdma_ep *ep = &xprt->rx_ep; int connstate = 0; - trace_xprtrdma_conn_upcall(xprt, event); + might_sleep(); + + trace_xprtrdma_cm_event(xprt, event); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: case RDMA_CM_EVENT_ROUTE_RESOLVED: @@ -308,7 +318,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) init_completion(&ia->ri_done); init_completion(&ia->ri_remove_done); - id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_conn_upcall, + id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler, xprt, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(id)) { rc = PTR_ERR(id); From ed97f1f79be9868692d115a72c379900231efeb5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:25:52 -0400 Subject: [PATCH 09/20] xprtrdma: Conventional variable names in rpcrdma_conn_upcall Clean up: The convention throughout other parts of xprtrdma is to name variables of type struct rpcrdma_xprt "r_xprt", not "xprt". This convention enables the use of the name "xprt" for a "struct rpc_xprt" type variable, as in other parts of the RPC client. 
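The convention works because the RDMA-specific transport embeds the generic one; a two-line sketch of the relationship (rx_xprt as declared in xprt_rdma.h):

	struct rpcrdma_xprt *r_xprt = id->context;	/* RDMA-specific view */
	struct rpc_xprt *xprt = &r_xprt->rx_xprt;	/* generic RPC view */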
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 7bf0e6529f41d..f2c8c3c71d570 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -230,14 +230,15 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, static int rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { - struct rpcrdma_xprt *xprt = id->context; - struct rpcrdma_ia *ia = &xprt->rx_ia; - struct rpcrdma_ep *ep = &xprt->rx_ep; + struct rpcrdma_xprt *r_xprt = id->context; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpc_xprt *xprt = &r_xprt->rx_xprt; int connstate = 0; might_sleep(); - trace_xprtrdma_cm_event(xprt, event); + trace_xprtrdma_cm_event(r_xprt, event); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: case RDMA_CM_EVENT_ROUTE_RESOLVED: @@ -256,11 +257,11 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) pr_info("rpcrdma: removing device %s for %s:%s\n", ia->ri_device->name, - rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt)); + rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt)); #endif set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags); ep->rep_connected = -ENODEV; - xprt_force_disconnect(&xprt->rx_xprt); + xprt_force_disconnect(xprt); wait_for_completion(&ia->ri_remove_done); ia->ri_id = NULL; @@ -268,9 +269,9 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) /* Return 1 to ensure the core destroys the id. */ return 1; case RDMA_CM_EVENT_ESTABLISHED: - ++xprt->rx_xprt.connect_cookie; + ++xprt->connect_cookie; connstate = 1; - rpcrdma_update_connect_private(xprt, &event->param.conn); + rpcrdma_update_connect_private(r_xprt, &event->param.conn); goto connected; case RDMA_CM_EVENT_CONNECT_ERROR: connstate = -ENOTCONN; @@ -280,14 +281,14 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) goto connected; case RDMA_CM_EVENT_REJECTED: dprintk("rpcrdma: connection to %s:%s rejected: %s\n", - rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt), + rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), rdma_reject_msg(id, event->status)); connstate = -ECONNREFUSED; if (event->status == IB_CM_REJ_STALE_CONN) connstate = -EAGAIN; goto connected; case RDMA_CM_EVENT_DISCONNECTED: - ++xprt->rx_xprt.connect_cookie; + ++xprt->connect_cookie; connstate = -ECONNABORTED; connected: ep->rep_connected = connstate; @@ -297,7 +298,7 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) default: dprintk("RPC: %s: %s:%s on %s/%s (ep 0x%p): %s\n", __func__, - rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt), + rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), ia->ri_device->name, ia->ri_ops->ro_displayname, ep, rdma_event_msg(event->event)); break; From aadc5a94483b138c8d9ade6e8416b089733a34dd Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:25:57 -0400 Subject: [PATCH 10/20] xprtrdma: Eliminate "connstate" variable from rpcrdma_conn_upcall() Clean up. Since commit 173b8f49b3af ("xprtrdma: Demote "connect" log messages") there has been no need to initialize connstate to zero. In fact, in this code path there's now no reason not to set rep_connected directly.
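Writing rep_connected directly is safe because the only consumer sleeps until a verdict is stored and then reads it back. A sketch of that consumer, assuming the wait logic in rpcrdma_ep_connect():

	wait_event_interruptible(ep->rep_connect_wait,
				 ep->rep_connected != 0);
	if (ep->rep_connected <= 0)
		rc = ep->rep_connected;	/* e.g. -EAGAIN means retry */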
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index f2c8c3c71d570..422d3db7b9a2a 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -234,7 +234,6 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct rpc_xprt *xprt = &r_xprt->rx_xprt; - int connstate = 0; might_sleep(); @@ -270,28 +269,27 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) return 1; case RDMA_CM_EVENT_ESTABLISHED: ++xprt->connect_cookie; - connstate = 1; + ep->rep_connected = 1; rpcrdma_update_connect_private(r_xprt, &event->param.conn); goto connected; case RDMA_CM_EVENT_CONNECT_ERROR: - connstate = -ENOTCONN; + ep->rep_connected = -ENOTCONN; goto connected; case RDMA_CM_EVENT_UNREACHABLE: - connstate = -ENETUNREACH; + ep->rep_connected = -ENETUNREACH; goto connected; case RDMA_CM_EVENT_REJECTED: dprintk("rpcrdma: connection to %s:%s rejected: %s\n", rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), rdma_reject_msg(id, event->status)); - connstate = -ECONNREFUSED; + ep->rep_connected = -ECONNREFUSED; if (event->status == IB_CM_REJ_STALE_CONN) - connstate = -EAGAIN; + ep->rep_connected = -EAGAIN; goto connected; case RDMA_CM_EVENT_DISCONNECTED: ++xprt->connect_cookie; - connstate = -ECONNABORTED; + ep->rep_connected = -ECONNABORTED; connected: - ep->rep_connected = connstate; rpcrdma_conn_func(ep); wake_up_all(&ep->rep_connect_wait); /*FALLTHROUGH*/ From 316a616e7886583c03d00f8320458b713c1dd277 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:26:03 -0400 Subject: [PATCH 11/20] xprtrdma: Re-organize the switch() in rpcrdma_conn_upcall Clean up: Eliminate the FALLTHROUGH into the default arm to make the switch easier to understand. Also, as long as I'm here, do not display the memory address of the target rpcrdma_ep. A hashed memory address is of marginal use here. 
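(Since v4.15 the kernel hashes plain %p output, so the printed endpoint address was not even stable across boots.) The consolidated message after the switch becomes, as the diff below shows:

	dprintk("RPC: %s: %s:%s on %s/%s: %s\n", __func__,
		rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt),
		ia->ri_device->name, ia->ri_ops->ro_displayname,
		rdma_event_msg(event->event));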
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 422d3db7b9a2a..c60172f88a0d6 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -243,15 +243,15 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_ROUTE_RESOLVED: ia->ri_async_rc = 0; complete(&ia->ri_done); - break; + return 0; case RDMA_CM_EVENT_ADDR_ERROR: ia->ri_async_rc = -EPROTO; complete(&ia->ri_done); - break; + return 0; case RDMA_CM_EVENT_ROUTE_ERROR: ia->ri_async_rc = -ENETUNREACH; complete(&ia->ri_done); - break; + return 0; case RDMA_CM_EVENT_DEVICE_REMOVAL: #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) pr_info("rpcrdma: removing device %s for %s:%s\n", @@ -292,16 +292,15 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) connected: rpcrdma_conn_func(ep); wake_up_all(&ep->rep_connect_wait); - /*FALLTHROUGH*/ + break; default: - dprintk("RPC: %s: %s:%s on %s/%s (ep 0x%p): %s\n", - __func__, - rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), - ia->ri_device->name, ia->ri_ops->ro_displayname, - ep, rdma_event_msg(event->event)); break; } + dprintk("RPC: %s: %s:%s on %s/%s: %s\n", __func__, + rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), + ia->ri_device->name, ia->ri_ops->ro_displayname, + rdma_event_msg(event->event)); return 0; } From 31e62d25b5b8155b2ff6a7c6d31256475dbbcc7a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:26:08 -0400 Subject: [PATCH 12/20] xprtrdma: Simplify RPC wake-ups on connect Currently, when a connection is established, rpcrdma_conn_upcall invokes rpcrdma_conn_func and then wake_up_all(&ep->rep_connect_wait). The former wakes waiting RPCs, but the connect worker is not done yet, and that leads to races, double wakes, and difficulty understanding how this logic is supposed to work. Instead, collect all the "connection established" logic in the connect worker (xprt_rdma_connect_worker). A disconnect worker is retained to handle provider upcalls safely. Fixes: 254f91e2fa1f ("xprtrdma: RPC/RDMA must invoke ... ") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 54 +++++++++++---------------------- net/sunrpc/xprtrdma/verbs.c | 42 +++++++++++++++++++------ net/sunrpc/xprtrdma/xprt_rdma.h | 4 +-- 3 files changed, 52 insertions(+), 48 deletions(-) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 289d13cad638c..d7c4255e9d5db 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -225,51 +225,35 @@ xprt_rdma_free_addresses(struct rpc_xprt *xprt) } } -void -rpcrdma_conn_func(struct rpcrdma_ep *ep) -{ - schedule_delayed_work(&ep->rep_connect_worker, 0); -} - -void -rpcrdma_connect_worker(struct work_struct *work) +/** + * xprt_rdma_connect_worker - establish connection in the background + * @work: worker thread context + * + * Requester holds the xprt's send lock to prevent activity on this + * transport while a fresh connection is being established. RPC tasks + * sleep on the xprt's pending queue waiting for connect to complete. 
+ */ +static void +xprt_rdma_connect_worker(struct work_struct *work) { - struct rpcrdma_ep *ep = - container_of(work, struct rpcrdma_ep, rep_connect_worker.work); - struct rpcrdma_xprt *r_xprt = - container_of(ep, struct rpcrdma_xprt, rx_ep); + struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt, + rx_connect_worker.work); struct rpc_xprt *xprt = &r_xprt->rx_xprt; + int rc; - spin_lock_bh(&xprt->transport_lock); - if (ep->rep_connected > 0) { + rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); + xprt_clear_connecting(xprt); + if (r_xprt->rx_ep.rep_connected > 0) { if (!xprt_test_and_set_connected(xprt)) { xprt->stat.connect_count++; xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; - xprt_wake_pending_tasks(xprt, 0); + xprt_wake_pending_tasks(xprt, -EAGAIN); } } else { if (xprt_test_and_clear_connected(xprt)) - xprt_wake_pending_tasks(xprt, -ENOTCONN); + xprt_wake_pending_tasks(xprt, rc); } - spin_unlock_bh(&xprt->transport_lock); -} - -static void -xprt_rdma_connect_worker(struct work_struct *work) -{ - struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt, - rx_connect_worker.work); - struct rpc_xprt *xprt = &r_xprt->rx_xprt; - int rc = 0; - - xprt_clear_connected(xprt); - - rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); - if (rc) - xprt_wake_pending_tasks(xprt, rc); - - xprt_clear_connecting(xprt); } static void @@ -302,8 +286,6 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) cancel_delayed_work_sync(&r_xprt->rx_connect_worker); - xprt_clear_connected(xprt); - rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); rpcrdma_buffer_destroy(&r_xprt->rx_buf); rpcrdma_ia_close(&r_xprt->rx_ia); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index c60172f88a0d6..abbd3cdc259af 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -108,6 +108,25 @@ rpcrdma_destroy_wq(void) } } +/** + * rpcrdma_disconnect_worker - Force a disconnect + * @work: endpoint to be disconnected + * + * Provider callbacks can possibly run in an IRQ context. This function + * is invoked in a worker thread to guarantee that disconnect wake-up + * calls are always done in process context. 
+ */ +static void +rpcrdma_disconnect_worker(struct work_struct *work) +{ + struct rpcrdma_ep *ep = container_of(work, struct rpcrdma_ep, + rep_disconnect_worker.work); + struct rpcrdma_xprt *r_xprt = + container_of(ep, struct rpcrdma_xprt, rx_ep); + + xprt_force_disconnect(&r_xprt->rx_xprt); +} + static void rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) { @@ -121,7 +140,7 @@ rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) if (ep->rep_connected == 1) { ep->rep_connected = -EIO; - rpcrdma_conn_func(ep); + schedule_delayed_work(&ep->rep_disconnect_worker, 0); wake_up_all(&ep->rep_connect_wait); } } @@ -271,13 +290,14 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) ++xprt->connect_cookie; ep->rep_connected = 1; rpcrdma_update_connect_private(r_xprt, &event->param.conn); - goto connected; + wake_up_all(&ep->rep_connect_wait); + break; case RDMA_CM_EVENT_CONNECT_ERROR: ep->rep_connected = -ENOTCONN; - goto connected; + goto disconnected; case RDMA_CM_EVENT_UNREACHABLE: ep->rep_connected = -ENETUNREACH; - goto connected; + goto disconnected; case RDMA_CM_EVENT_REJECTED: dprintk("rpcrdma: connection to %s:%s rejected: %s\n", rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), @@ -285,12 +305,12 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) ep->rep_connected = -ECONNREFUSED; if (event->status == IB_CM_REJ_STALE_CONN) ep->rep_connected = -EAGAIN; - goto connected; + goto disconnected; case RDMA_CM_EVENT_DISCONNECTED: ++xprt->connect_cookie; ep->rep_connected = -ECONNABORTED; -connected: - rpcrdma_conn_func(ep); +disconnected: + xprt_force_disconnect(xprt); wake_up_all(&ep->rep_connect_wait); break; default: @@ -550,7 +570,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, cdata->max_requests >> 2); ep->rep_send_count = ep->rep_send_batch; init_waitqueue_head(&ep->rep_connect_wait); - INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); + INIT_DELAYED_WORK(&ep->rep_disconnect_worker, + rpcrdma_disconnect_worker); sendcq = ib_alloc_cq(ia->ri_device, NULL, ep->rep_attr.cap.max_send_wr + 1, @@ -623,7 +644,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, void rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { - cancel_delayed_work_sync(&ep->rep_connect_worker); + cancel_delayed_work_sync(&ep->rep_disconnect_worker); if (ia->ri_id && ia->ri_id->qp) { rpcrdma_ep_disconnect(ep, ia); @@ -736,6 +757,7 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); + struct rpc_xprt *xprt = &r_xprt->rx_xprt; int rc; retry: @@ -762,6 +784,8 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) } ep->rep_connected = 0; + xprt_clear_connected(xprt); + rpcrdma_post_recvs(r_xprt, true); rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index eae21668e6922..a13ccb643ce07 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -101,7 +101,7 @@ struct rpcrdma_ep { wait_queue_head_t rep_connect_wait; struct rpcrdma_connect_private rep_cm_private; struct rdma_conn_param rep_remote_cma; - struct delayed_work rep_connect_worker; + struct delayed_work rep_disconnect_worker; }; /* Pre-allocate extra Work Requests for handling backward receives @@ -556,7 +556,6 @@ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *, struct 
rpcrdma_create_data_internal *); void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); -void rpcrdma_conn_func(struct rpcrdma_ep *ep); void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, @@ -654,7 +653,6 @@ static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) extern unsigned int xprt_rdma_max_inline_read; void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap); void xprt_rdma_free_addresses(struct rpc_xprt *xprt); -void rpcrdma_connect_worker(struct work_struct *work); void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq); int xprt_rdma_init(void); void xprt_rdma_cleanup(void); From f9521d53e804b9721c2829858f6d5bf6f470e734 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:26:13 -0400 Subject: [PATCH 13/20] xprtrdma: Rename rpcrdma_qp_async_error_upcall Clean up: Use a function name that is consistent with the RDMA core API and with other consumers. Because this is a function that is invoked from outside the rpcrdma.ko module, add an appropriate documenting comment. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 2 +- net/sunrpc/xprtrdma/verbs.c | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 3b9de5b6b725d..a4d9ff9c36d48 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -377,7 +377,7 @@ DEFINE_RXPRT_EVENT(xprtrdma_reinsert); DEFINE_RXPRT_EVENT(xprtrdma_reconnect); DEFINE_RXPRT_EVENT(xprtrdma_inject_dsc); -TRACE_EVENT(xprtrdma_qp_error, +TRACE_EVENT(xprtrdma_qp_event, TP_PROTO( const struct rpcrdma_xprt *r_xprt, const struct ib_event *event diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index abbd3cdc259af..b62260aa348b4 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -127,14 +127,22 @@ rpcrdma_disconnect_worker(struct work_struct *work) xprt_force_disconnect(&r_xprt->rx_xprt); } +/** + * rpcrdma_qp_event_handler - Handle one QP event (error notification) + * @event: details of the event + * @context: ep that owns QP where event occurred + * + * Called from the RDMA provider (device driver) possibly in an interrupt + * context. + */ static void -rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) +rpcrdma_qp_event_handler(struct ib_event *event, void *context) { struct rpcrdma_ep *ep = context; struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt, rx_ep); - trace_xprtrdma_qp_error(r_xprt, event); + trace_xprtrdma_qp_event(r_xprt, event); pr_err("rpcrdma: %s on device %s ep %p\n", ib_event_msg(event->event), event->device->name, context); @@ -547,7 +555,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, if (rc) return rc; - ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; + ep->rep_attr.event_handler = rpcrdma_qp_event_handler; ep->rep_attr.qp_context = ep; ep->rep_attr.srq = NULL; ep->rep_attr.cap.max_send_sge = max_sge; From 83e301dd1347bb98419103685e48c2b4835937db Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:26:19 -0400 Subject: [PATCH 14/20] xprtrdma: Remove memory address of "ep" from an error message Clean up: Replace the hashed memory address of the target rpcrdma_ep with the server's IP address and port. 
The server address is more useful in an administrative error message. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index b62260aa348b4..9e044534b1b38 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -143,8 +143,9 @@ rpcrdma_qp_event_handler(struct ib_event *event, void *context) rx_ep); trace_xprtrdma_qp_event(r_xprt, event); - pr_err("rpcrdma: %s on device %s ep %p\n", - ib_event_msg(event->event), event->device->name, context); + pr_err("rpcrdma: %s on device %s connected to %s:%s\n", + ib_event_msg(event->event), event->device->name, + rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt)); if (ep->rep_connected == 1) { ep->rep_connected = -EIO; From f7d4668155245d90470f591576b6593e0f077fcb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:26:24 -0400 Subject: [PATCH 15/20] xprtrdma: Don't disable BH's in backchannel server Clean up: This code was copied from xprtsock.c and backchannel_rqst.c. For rpcrdma, the backchannel server runs exclusively in process context, thus disabling bottom-halves is unnecessary. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 90adeff4c06b8..675b5308b7c37 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -54,9 +54,9 @@ static int rpcrdma_bc_setup_reqs(struct rpcrdma_xprt *r_xprt, INIT_LIST_HEAD(&rqst->rq_list); INIT_LIST_HEAD(&rqst->rq_bc_list); __set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); - spin_lock_bh(&xprt->bc_pa_lock); + spin_lock(&xprt->bc_pa_lock); list_add(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); - spin_unlock_bh(&xprt->bc_pa_lock); + spin_unlock(&xprt->bc_pa_lock); size = r_xprt->rx_data.inline_rsize; rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL); @@ -228,16 +228,16 @@ void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpc_rqst *rqst, *tmp; - spin_lock_bh(&xprt->bc_pa_lock); + spin_lock(&xprt->bc_pa_lock); list_for_each_entry_safe(rqst, tmp, &xprt->bc_pa_list, rq_bc_pa_list) { list_del(&rqst->rq_bc_pa_list); - spin_unlock_bh(&xprt->bc_pa_lock); + spin_unlock(&xprt->bc_pa_lock); rpcrdma_bc_free_rqst(r_xprt, rqst); - spin_lock_bh(&xprt->bc_pa_lock); + spin_lock(&xprt->bc_pa_lock); } - spin_unlock_bh(&xprt->bc_pa_lock); + spin_unlock(&xprt->bc_pa_lock); } /** @@ -255,9 +255,9 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) rpcrdma_recv_buffer_put(req->rl_reply); req->rl_reply = NULL; - spin_lock_bh(&xprt->bc_pa_lock); + spin_lock(&xprt->bc_pa_lock); list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); - spin_unlock_bh(&xprt->bc_pa_lock); + spin_unlock(&xprt->bc_pa_lock); } /** From 512ccfb61a9b7281345992620c73ffed20272526 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:26:29 -0400 Subject: [PATCH 16/20] xprtrdma: Move rb_flags initialization Clean up: rb_flags might be used for other things besides RPCRDMA_BUF_F_EMPTY_SCQ, so initialize it in a generic spot instead of in a send-completion-queue-related helper. 
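rb_flags is an ordinary atomic bit mask (manipulated with set_bit() and test_and_clear_bit() elsewhere in verbs.c, assuming the usage in that file), which is why its initialization belongs with the other generic buffer state:

	buf->rb_flags = 0;	/* every flag, not only EMPTY_SCQ, starts clear */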
From 512ccfb61a9b7281345992620c73ffed20272526 Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Mon, 1 Oct 2018 14:26:29 -0400
Subject: [PATCH 16/20] xprtrdma: Move rb_flags initialization

Clean up: rb_flags might be used for other things besides
RPCRDMA_BUF_F_EMPTY_SCQ, so initialize it in a generic spot instead
of in a send-completion-queue-related helper.

Signed-off-by: Chuck Lever
Signed-off-by: Anna Schumaker
---
 net/sunrpc/xprtrdma/verbs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 9e044534b1b38..cfaab5e312778 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -918,7 +918,6 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt)
 		sc->sc_xprt = r_xprt;
 		buf->rb_sc_ctxs[i] = sc;
 	}
-	buf->rb_flags = 0;
 
 	return 0;
 
@@ -1146,6 +1145,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 	int i, rc;
 
+	buf->rb_flags = 0;
 	buf->rb_max_requests = r_xprt->rx_data.max_requests;
 	buf->rb_bc_srv_max_requests = 0;
 	spin_lock_init(&buf->rb_mrlock);

From 61c208a5ca94c526143830253d56de6fdb95edc2 Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Mon, 1 Oct 2018 14:26:35 -0400
Subject: [PATCH 17/20] xprtrdma: Report when there were zero posted Receives

To show that a caller did attempt to allocate and post more Receive
buffers, the trace point in rpcrdma_post_recvs() should report even
when no new Receive buffers were posted.

Signed-off-by: Chuck Lever
Signed-off-by: Anna Schumaker
---
 net/sunrpc/xprtrdma/verbs.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index cfaab5e312778..3ddba94c939f6 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1521,9 +1521,11 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
 	struct ib_recv_wr *wr, *bad_wr;
 	int needed, count, rc;
 
+	rc = 0;
+	count = 0;
 	needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1);
 	if (buf->rb_posted_receives > needed)
-		return;
+		goto out;
 	needed -= buf->rb_posted_receives;
 	count = 0;
 
@@ -1559,7 +1561,7 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
 		--needed;
 	}
 	if (!count)
-		return;
+		goto out;
 
 	rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr,
 			  (const struct ib_recv_wr **)&bad_wr);
@@ -1573,5 +1575,6 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
 		}
 	}
 	buf->rb_posted_receives += count;
+out:
 	trace_xprtrdma_post_recvs(r_xprt, count, rc);
 }
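The shape of the change in patch 17, reduced to a sketch with hypothetical
names and a pr_debug() standing in for the trace point: initialize the
reported values up front, then route early returns through a single exit
label so the instrumentation fires on every invocation, including the
do-nothing case.

	static void example_post_receives(int needed)
	{
		int count = 0;
		int rc = 0;

		if (needed <= 0)
			goto out;	/* was a bare "return" that
					 * skipped the report */

		/* ... try to post @needed buffers, setting @count to
		 * the number actually posted and @rc to the status ...
		 */
	out:
		pr_debug("posted %d receives, status %d\n", count, rc);
	}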
From f26c32fa5c6ab616e12d4ff0748c85927b092601 Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Mon, 1 Oct 2018 14:26:40 -0400
Subject: [PATCH 18/20] xprtrdma: Add documenting comments

Clean up: fill in or update documenting comments for transport
switch entry points.

For xprt_rdma_allocate: The first paragraph is no longer true since
commit 5a6d1db45569 ("SUNRPC: Add a transport-specific private field
in rpc_rqst"). The second paragraph is no longer true since commit
54cbd6b0c6b9 ("xprtrdma: Delay DMA mapping Send and Receive
buffers").

Signed-off-by: Chuck Lever
Signed-off-by: Anna Schumaker
---
 net/sunrpc/xprtrdma/transport.c | 43 ++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index d7c4255e9d5db..39b7991862d08 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -256,6 +256,13 @@ xprt_rdma_connect_worker(struct work_struct *work)
 	}
 }
 
+/**
+ * xprt_rdma_inject_disconnect - inject a connection fault
+ * @xprt: transport context
+ *
+ * If @xprt is connected, disconnect it to simulate spurious connection
+ * loss.
+ */
 static void
 xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
 {
@@ -266,16 +273,12 @@ xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
 	rdma_disconnect(r_xprt->rx_ia.ri_id);
 }
 
-/*
- * xprt_rdma_destroy
+/**
+ * xprt_rdma_destroy - Full tear down of transport
+ * @xprt: doomed transport context
  *
- * Destroy the xprt.
- * Free all memory associated with the object, including its own.
- * NOTE: none of the *destroy methods free memory for their top-level
- * objects, even though they may have allocated it (they do free
- * private memory). It's up to the caller to handle it. In this
- * case (RDMA transport), all structure memory is inlined with the
- * struct rpcrdma_xprt.
+ * Caller guarantees there will be no more calls to us with
+ * this @xprt.
  */
 static void
 xprt_rdma_destroy(struct rpc_xprt *xprt)
@@ -428,11 +431,12 @@ xprt_setup_rdma(struct xprt_create *args)
 }
 
 /**
- * xprt_rdma_close - Close down RDMA connection
- * @xprt: generic transport to be closed
+ * xprt_rdma_close - close a transport connection
+ * @xprt: transport context
  *
- * Called during transport shutdown reconnect, or device
- * removal. Caller holds the transport's write lock.
+ * Called during transport shutdown, reconnect, or device removal.
+ * Caller holds @xprt's send lock to prevent activity on this
+ * transport while the connection is torn down.
  */
 static void
 xprt_rdma_close(struct rpc_xprt *xprt)
@@ -511,6 +515,12 @@ xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task)
 	xprt_force_disconnect(xprt);
 }
 
+/**
+ * xprt_rdma_connect - try to establish a transport connection
+ * @xprt: transport state
+ * @task: RPC scheduler context
+ *
+ */
 static void
 xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 {
@@ -630,13 +640,6 @@ rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
  *        0:	Success; rq_buffer points to RPC buffer to use
  *   ENOMEM:	Out of memory, call again later
  *      EIO:	A permanent error occurred, do not retry
- *
- * The RDMA allocate/free functions need the task structure as a place
- * to hide the struct rpcrdma_req, which is necessary for the actual
- * send/recv sequence.
- *
- * xprt_rdma_allocate provides buffers that are already mapped for
- * DMA, and a local DMA lkey is provided for each.
  */
 static int
 xprt_rdma_allocate(struct rpc_task *task)
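For reference, the /** ... */ style that patch 18 converts these comments
to is the kernel-doc format, which scripts/kernel-doc can extract into the
rendered documentation. Its general shape, with hypothetical names:

	/**
	 * example_frob - one-line summary of the function
	 * @widget: what the first argument means
	 * @flags: what the second argument means
	 *
	 * Free-form description: calling context, locking
	 * requirements, and so on.
	 *
	 * Returns zero on success, or a negative errno.
	 */
	static int example_frob(struct example_widget *widget,
				unsigned long flags)
	{
		return 0;
	}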
From ad0911802cf6be48ddf5911ca1837d071b26e92d Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Mon, 1 Oct 2018 14:26:45 -0400
Subject: [PATCH 19/20] xprtrdma: Clean up xprt_rdma_disconnect_inject

Clean up: Use the appropriate C macro instead of open-coding
container_of().

Signed-off-by: Chuck Lever
Signed-off-by: Anna Schumaker
---
 net/sunrpc/xprtrdma/transport.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 39b7991862d08..0cfa7bf411183 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -266,8 +266,7 @@ xprt_rdma_connect_worker(struct work_struct *work)
 static void
 xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
 {
-	struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt,
-						   rx_xprt);
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 
 	trace_xprtrdma_inject_dsc(r_xprt);
 	rdma_disconnect(r_xprt->rx_ia.ri_id);

From 470443e0b379b070305629f911cc09562bdf324f Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Mon, 1 Oct 2018 14:26:51 -0400
Subject: [PATCH 20/20] xprtrdma: Squelch a sparse warning

linux/include/trace/events/rpcrdma.h:501:1: warning: expression using sizeof bool
linux/include/trace/events/rpcrdma.h:501:1: warning: odd constant _Bool cast (ffffffffffffffff becomes 1)

Fixes: ab03eff58eb5 ("xprtrdma: Add trace points in RPC Call ... ")
Signed-off-by: Chuck Lever
Signed-off-by: Anna Schumaker
---
 include/trace/events/rpcrdma.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index a4d9ff9c36d48..b093058f78aac 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -509,7 +509,7 @@ TRACE_EVENT(xprtrdma_post_send,
 	TP_STRUCT__entry(
 		__field(const void *, req)
 		__field(int, num_sge)
-		__field(bool, signaled)
+		__field(int, signaled)
 		__field(int, status)
 	),
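Finally, a sketch of the convention patch 20 adopts, with hypothetical
names: sparse objects to sizeof(bool) and to casting constants to _Bool
inside the __field() expansion, so trace events typically record flag
values in an int field; the bool argument promotes to int in
TP_fast_assign() without a cast.

	TRACE_EVENT(example_send,
		TP_PROTO(const void *req, bool signaled),

		TP_ARGS(req, signaled),

		TP_STRUCT__entry(
			__field(const void *, req)
			__field(int, signaled)	/* int, not bool */
		),

		TP_fast_assign(
			__entry->req = req;
			__entry->signaled = signaled;
		),

		TP_printk("req=%p signaled=%d",
			  __entry->req, __entry->signaled)
	);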