From e999ef15423b35473c0d94ff3dd2011a0f7ddb04 Mon Sep 17 00:00:00 2001 From: Justin Tee Date: Fri, 28 Jun 2024 10:20:04 -0700 Subject: [PATCH 1/8] scsi: lpfc: Cancel ELS WQE instead of issuing abort when SLI port is inactive During SLI port errata events, there should be no expectation that submitted outstanding WQEs will return back CQEs. In these situations, the driver should not rely on receiving CQEs from the SLI port to signal WQE resource clean up. Put an sli_flag LPFC_SLI_ACTIVE check in lpfc_els_flush_cmd() when walking the txcmplq. The sli_flag check helps determine whether to issue an abort or driver based cancel on outstanding WQEs. If !LPFC_SLI_ACTIVE, then there's no point to issue anything to the SLI port. Instead, let the driver based cancel logic clean up the submitted WQE resources. Also, enhance some abort log messages that help with future debugging. Signed-off-by: Justin Tee Link: https://lore.kernel.org/r/20240628172011.25921-2-justintee8345@gmail.com Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_els.c | 2 +- drivers/scsi/lpfc/lpfc_sli.c | 24 +++++++++++------------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index c32bc773ab29d..b27ebb574bfbb 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -9665,7 +9665,7 @@ lpfc_els_flush_cmd(struct lpfc_vport *vport) list_for_each_entry_safe(piocb, tmp_iocb, &abort_list, dlist) { spin_lock_irqsave(&phba->hbalock, iflags); list_del_init(&piocb->dlist); - if (mbx_tmo_err) + if (mbx_tmo_err || !(phba->sli.sli_flag & LPFC_SLI_ACTIVE)) list_move_tail(&piocb->list, &cancel_list); else lpfc_sli_issue_abort_iotag(phba, pring, piocb, NULL); diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index f475e7ece41a4..8bfac91433147 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -12301,18 +12301,16 @@ lpfc_sli_abort_els_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, goto release_iocb; } } - - lpfc_printf_log(phba, KERN_WARNING, LOG_ELS | LOG_SLI, - "0327 Cannot abort els iocb x%px " - "with io cmd xri %x abort tag : x%x, " - "abort status %x abort code %x\n", - cmdiocb, get_job_abtsiotag(phba, cmdiocb), - (phba->sli_rev == LPFC_SLI_REV4) ? - get_wqe_reqtag(cmdiocb) : - cmdiocb->iocb.un.acxri.abortContextTag, - ulp_status, ulp_word4); - } + + lpfc_printf_log(phba, KERN_INFO, LOG_ELS | LOG_SLI, + "0327 Abort els iocb complete x%px with io cmd xri %x " + "abort tag x%x abort status %x abort code %x\n", + cmdiocb, get_job_abtsiotag(phba, cmdiocb), + (phba->sli_rev == LPFC_SLI_REV4) ? + get_wqe_reqtag(cmdiocb) : + cmdiocb->iocb.ulpIoTag, + ulp_status, ulp_word4); release_iocb: lpfc_sli_release_iocbq(phba, cmdiocb); return; @@ -12509,10 +12507,10 @@ lpfc_sli_issue_abort_iotag(struct lpfc_hba *phba, struct lpfc_sli_ring *pring, lpfc_printf_vlog(vport, KERN_INFO, LOG_SLI, "0339 Abort IO XRI x%x, Original iotag x%x, " "abort tag x%x Cmdjob : x%px Abortjob : x%px " - "retval x%x : IA %d\n", + "retval x%x : IA %d cmd_cmpl %ps\n", ulp_context, (phba->sli_rev == LPFC_SLI_REV4) ? cmdiocb->iotag : iotag, iotag, cmdiocb, abtsiocbp, - retval, ia); + retval, ia, abtsiocbp->cmd_cmpl); if (retval) { cmdiocb->cmd_flag &= ~LPFC_DRIVER_ABORTED; __lpfc_sli_release_iocbq(phba, abtsiocbp); From 9609385dd91b26751019b22ca9bfa4bec7602ae1 Mon Sep 17 00:00:00 2001 From: Justin Tee Date: Fri, 28 Jun 2024 10:20:05 -0700 Subject: [PATCH 2/8] scsi: lpfc: Allow DEVICE_RECOVERY mode after RSCN receipt if in PRLI_ISSUE state Certain vendor specific targets initially register with the fabric as an initiator function first and then re-register as a target function afterwards. The timing of the target function re-registration can cause a race condition such that the driver is stuck assuming the remote port as an initiator function and never discovers the target's hosted LUNs. Expand the nlp_state qualifier to also include NLP_STE_PRLI_ISSUE because the state means that PRLI was issued but we have not quite reached MAPPED_NODE state yet. If we received an RSCN in the PRLI_ISSUE state, then we should restart discovery again by going into DEVICE_RECOVERY. Fixes: dded1dc31aa4 ("scsi: lpfc: Modify when a node should be put in device recovery mode during RSCN") Cc: # v6.6+ Signed-off-by: Justin Tee Link: https://lore.kernel.org/r/20240628172011.25921-3-justintee8345@gmail.com Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_hbadisc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c index 153770bdc56ab..13b08c85440fe 100644 --- a/drivers/scsi/lpfc/lpfc_hbadisc.c +++ b/drivers/scsi/lpfc/lpfc_hbadisc.c @@ -5725,7 +5725,7 @@ lpfc_setup_disc_node(struct lpfc_vport *vport, uint32_t did) return ndlp; if (ndlp->nlp_state > NLP_STE_UNUSED_NODE && - ndlp->nlp_state < NLP_STE_PRLI_ISSUE) { + ndlp->nlp_state <= NLP_STE_PRLI_ISSUE) { lpfc_disc_state_machine(vport, ndlp, NULL, NLP_EVT_DEVICE_RECOVERY); } From aeaf117cc7d28b994652491c87b929d853519d20 Mon Sep 17 00:00:00 2001 From: Justin Tee Date: Fri, 28 Jun 2024 10:20:06 -0700 Subject: [PATCH 3/8] scsi: lpfc: Relax PRLI issue conditions after GID_FT response If previously in REG_LOGIN_ISSUE state, then remove the requirement that PLOGI must have been received from the remote port before issuing a PRLI. After GID_FT completes, it does not matter whether the driver itself sent a PLOGI or received one. The fact that we're in REG_LOGIN_ISSUE state simply means that the next state should be issuing the PRLI to continue discovery of the remote port. Signed-off-by: Justin Tee Link: https://lore.kernel.org/r/20240628172011.25921-4-justintee8345@gmail.com Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_ct.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_ct.c b/drivers/scsi/lpfc/lpfc_ct.c index 376d0f958b723..2dedd1493e5ba 100644 --- a/drivers/scsi/lpfc/lpfc_ct.c +++ b/drivers/scsi/lpfc/lpfc_ct.c @@ -1553,22 +1553,14 @@ lpfc_cmpl_ct_cmd_gft_id(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, if (ndlp->nlp_state == NLP_STE_REG_LOGIN_ISSUE && ndlp->nlp_fc4_type) { ndlp->nlp_prev_state = NLP_STE_REG_LOGIN_ISSUE; - /* This is a fabric topology so if discovery - * started with an unsolicited PLOGI, don't - * send a PRLI. Targets don't issue PLOGI or - * PRLI when acting as a target. Likely this is - * an initiator function. - */ - if (!(ndlp->nlp_flag & NLP_RCV_PLOGI)) { - lpfc_nlp_set_state(vport, ndlp, - NLP_STE_PRLI_ISSUE); - lpfc_issue_els_prli(vport, ndlp, 0); - } + lpfc_nlp_set_state(vport, ndlp, + NLP_STE_PRLI_ISSUE); + lpfc_issue_els_prli(vport, ndlp, 0); } else if (!ndlp->nlp_fc4_type) { /* If fc4 type is still unknown, then LOGO */ lpfc_printf_vlog(vport, KERN_INFO, LOG_DISCOVERY | LOG_NODE, - "6443 Sending LOGO ndlp x%px," + "6443 Sending LOGO ndlp x%px, " "DID x%06x with fc4_type: " "x%08x, state: %d\n", ndlp, did, ndlp->nlp_fc4_type, From 15e21dc6d6b7d7feb6b0262422b45da5e3c4b34f Mon Sep 17 00:00:00 2001 From: Justin Tee Date: Fri, 28 Jun 2024 10:20:07 -0700 Subject: [PATCH 4/8] scsi: lpfc: Fix handling of fully recovered fabric node in dev_loss callbk In rare cases when a fabric node is recovered after a link bounce and before dev_loss_tmo callbk is reached, the driver may leave the fabric node in an inconsistent state with the NLP_IN_DEV_LOSS flag perpetually set. In lpfc_dev_loss_tmo_callbk, a check is added for a recovered fabric node. If the node is recovered, then don't queue the lpfc_dev_loss_tmo_handler work. In lpfc_dev_loss_tmo_handler, the path taken for the recovered fabric nodes is updated to clear the NLP_IN_DEV_LOSS flag. Signed-off-by: Justin Tee Link: https://lore.kernel.org/r/20240628172011.25921-5-justintee8345@gmail.com Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_hbadisc.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c index 13b08c85440fe..6943f6c6395c4 100644 --- a/drivers/scsi/lpfc/lpfc_hbadisc.c +++ b/drivers/scsi/lpfc/lpfc_hbadisc.c @@ -214,6 +214,11 @@ lpfc_dev_loss_tmo_callbk(struct fc_rport *rport) if (ndlp->nlp_state == NLP_STE_MAPPED_NODE) return; + /* check for recovered fabric node */ + if (ndlp->nlp_state == NLP_STE_UNMAPPED_NODE && + ndlp->nlp_DID == Fabric_DID) + return; + if (rport->port_name != wwn_to_u64(ndlp->nlp_portname.u.wwn)) lpfc_printf_vlog(vport, KERN_ERR, LOG_TRACE_EVENT, "6789 rport name %llx != node port name %llx", @@ -546,6 +551,9 @@ lpfc_dev_loss_tmo_handler(struct lpfc_nodelist *ndlp) ndlp->nlp_DID, kref_read(&ndlp->kref), ndlp, ndlp->nlp_flag, vport->port_state); + spin_lock_irqsave(&ndlp->lock, iflags); + ndlp->nlp_flag &= ~NLP_IN_DEV_LOSS; + spin_unlock_irqrestore(&ndlp->lock, iflags); return fcf_inuse; } From ede596b1434b57c0b3fd5c02b326efe5c54f6e48 Mon Sep 17 00:00:00 2001 From: Justin Tee Date: Fri, 28 Jun 2024 10:20:08 -0700 Subject: [PATCH 5/8] scsi: lpfc: Handle mailbox timeouts in lpfc_get_sfp_info The MBX_TIMEOUT return code is not handled in lpfc_get_sfp_info and the routine unconditionally frees submitted mailbox commands regardless of return status. The issue is that for MBX_TIMEOUT cases, when firmware returns SFP information at a later time, that same mailbox memory region references previously freed memory in its cmpl routine. Fix by adding checks for the MBX_TIMEOUT return code. During mailbox resource cleanup, check the mbox flag to make sure that the wait did not timeout. If the MBOX_WAKE flag is not set, then do not free the resources because it will be freed when firmware completes the mailbox at a later time in its cmpl routine. Also, increase the timeout from 30 to 60 seconds to accommodate boot scripts requiring longer timeouts. Signed-off-by: Justin Tee Link: https://lore.kernel.org/r/20240628172011.25921-6-justintee8345@gmail.com Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_els.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index b27ebb574bfbb..929cbfc95163b 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -7302,13 +7302,13 @@ int lpfc_get_sfp_info_wait(struct lpfc_hba *phba, mbox->u.mqe.un.mem_dump_type3.addr_hi = putPaddrHigh(mp->phys); } mbox->vport = phba->pport; - - rc = lpfc_sli_issue_mbox_wait(phba, mbox, 30); + rc = lpfc_sli_issue_mbox_wait(phba, mbox, LPFC_MBOX_SLI4_CONFIG_TMO); if (rc == MBX_NOT_FINISHED) { rc = 1; goto error; } - + if (rc == MBX_TIMEOUT) + goto error; if (phba->sli_rev == LPFC_SLI_REV4) mp = mbox->ctx_buf; else @@ -7361,7 +7361,10 @@ int lpfc_get_sfp_info_wait(struct lpfc_hba *phba, mbox->u.mqe.un.mem_dump_type3.addr_hi = putPaddrHigh(mp->phys); } - rc = lpfc_sli_issue_mbox_wait(phba, mbox, 30); + rc = lpfc_sli_issue_mbox_wait(phba, mbox, LPFC_MBOX_SLI4_CONFIG_TMO); + + if (rc == MBX_TIMEOUT) + goto error; if (bf_get(lpfc_mqe_status, &mbox->u.mqe)) { rc = 1; goto error; @@ -7372,8 +7375,10 @@ int lpfc_get_sfp_info_wait(struct lpfc_hba *phba, DMP_SFF_PAGE_A2_SIZE); error: - mbox->ctx_buf = mpsave; - lpfc_mbox_rsrc_cleanup(phba, mbox, MBOX_THD_UNLOCKED); + if (mbox->mbox_flag & LPFC_MBX_WAKE) { + mbox->ctx_buf = mpsave; + lpfc_mbox_rsrc_cleanup(phba, mbox, MBOX_THD_UNLOCKED); + } return rc; From f65f31ac120bd2c308bf16b18e5ee74445c06446 Mon Sep 17 00:00:00 2001 From: Justin Tee Date: Fri, 28 Jun 2024 10:20:09 -0700 Subject: [PATCH 6/8] scsi: lpfc: Fix incorrect request len mbox field when setting trunking via sysfs When setting trunk modes through sysfs, the SLI_CONFIG mailbox command's command payload length is incorrectly hardcoded to 12 bytes. SLI_CONFIG's payload length field should be specified large enough to encompass both the submailbox command header and the submailbox request itself. Thus, replace the hardcoded 12 bytes with a clearer calculation by way of sizeof(struct lpfc_mbx_set_trunk_mode) - sizeof(struct lpfc_sli4_cfg_mhdr). Signed-off-by: Justin Tee Link: https://lore.kernel.org/r/20240628172011.25921-7-justintee8345@gmail.com Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_attr.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index a46c73e8d7c40..62e517719e8ff 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -1831,6 +1831,7 @@ static int lpfc_set_trunking(struct lpfc_hba *phba, char *buff_out) { LPFC_MBOXQ_t *mbox = NULL; + u32 payload_len; unsigned long val = 0; char *pval = NULL; int rc = 0; @@ -1869,9 +1870,11 @@ lpfc_set_trunking(struct lpfc_hba *phba, char *buff_out) if (!mbox) return -ENOMEM; + payload_len = sizeof(struct lpfc_mbx_set_trunk_mode) - + sizeof(struct lpfc_sli4_cfg_mhdr); lpfc_sli4_config(phba, mbox, LPFC_MBOX_SUBSYSTEM_FCOE, LPFC_MBOX_OPCODE_FCOE_FC_SET_TRUNK_MODE, - 12, LPFC_SLI4_MBX_EMBED); + payload_len, LPFC_SLI4_MBX_EMBED); bf_set(lpfc_mbx_set_trunk_mode, &mbox->u.mqe.un.set_trunk_mode, From 8bc7c617642db6d8d20ee671fb6c4513017e7a7e Mon Sep 17 00:00:00 2001 From: Justin Tee Date: Fri, 28 Jun 2024 10:20:10 -0700 Subject: [PATCH 7/8] scsi: lpfc: Revise lpfc_prep_embed_io routine with proper endian macro usages On big endian architectures, it is possible to run into a memory out of bounds pointer dereference when FCP targets are zoned. In lpfc_prep_embed_io, the memcpy(ptr, fcp_cmnd, sgl->sge_len) is referencing a little endian formatted sgl->sge_len value. So, the memcpy can cause big endian systems to crash. Redefine the *sgl ptr as a struct sli4_sge_le to make it clear that we are referring to a little endian formatted data structure. And, update the routine with proper le32_to_cpu macro usages. Fixes: af20bb73ac25 ("scsi: lpfc: Add support for 32 byte CDBs") Signed-off-by: Justin Tee Link: https://lore.kernel.org/r/20240628172011.25921-8-justintee8345@gmail.com Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_sli.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 8bfac91433147..88debef2fb6db 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -10579,10 +10579,11 @@ lpfc_prep_embed_io(struct lpfc_hba *phba, struct lpfc_io_buf *lpfc_cmd) { struct lpfc_iocbq *piocb = &lpfc_cmd->cur_iocbq; union lpfc_wqe128 *wqe = &lpfc_cmd->cur_iocbq.wqe; - struct sli4_sge *sgl; + struct sli4_sge_le *sgl; + u32 type_size; /* 128 byte wqe support here */ - sgl = (struct sli4_sge *)lpfc_cmd->dma_sgl; + sgl = (struct sli4_sge_le *)lpfc_cmd->dma_sgl; if (phba->fcp_embed_io) { struct fcp_cmnd *fcp_cmnd; @@ -10591,9 +10592,9 @@ lpfc_prep_embed_io(struct lpfc_hba *phba, struct lpfc_io_buf *lpfc_cmd) fcp_cmnd = lpfc_cmd->fcp_cmnd; /* Word 0-2 - FCP_CMND */ - wqe->generic.bde.tus.f.bdeFlags = - BUFF_TYPE_BDE_IMMED; - wqe->generic.bde.tus.f.bdeSize = sgl->sge_len; + type_size = le32_to_cpu(sgl->sge_len); + type_size |= ULP_BDE64_TYPE_BDE_IMMED; + wqe->generic.bde.tus.w = type_size; wqe->generic.bde.addrHigh = 0; wqe->generic.bde.addrLow = 72; /* Word 18 */ @@ -10602,13 +10603,13 @@ lpfc_prep_embed_io(struct lpfc_hba *phba, struct lpfc_io_buf *lpfc_cmd) /* Word 18-29 FCP CMND Payload */ ptr = &wqe->words[18]; - memcpy(ptr, fcp_cmnd, sgl->sge_len); + lpfc_sli_pcimem_bcopy(fcp_cmnd, ptr, le32_to_cpu(sgl->sge_len)); } else { /* Word 0-2 - Inline BDE */ wqe->generic.bde.tus.f.bdeFlags = BUFF_TYPE_BDE_64; - wqe->generic.bde.tus.f.bdeSize = sgl->sge_len; - wqe->generic.bde.addrHigh = sgl->addr_hi; - wqe->generic.bde.addrLow = sgl->addr_lo; + wqe->generic.bde.tus.f.bdeSize = le32_to_cpu(sgl->sge_len); + wqe->generic.bde.addrHigh = le32_to_cpu(sgl->addr_hi); + wqe->generic.bde.addrLow = le32_to_cpu(sgl->addr_lo); /* Word 10 */ bf_set(wqe_dbde, &wqe->generic.wqe_com, 1); From 41972df1a56bd926a14d9670936f5278cf351968 Mon Sep 17 00:00:00 2001 From: Justin Tee Date: Fri, 28 Jun 2024 10:20:11 -0700 Subject: [PATCH 8/8] scsi: lpfc: Update lpfc version to 14.4.0.3 Update lpfc version to 14.4.0.3. Signed-off-by: Justin Tee Link: https://lore.kernel.org/r/20240628172011.25921-9-justintee8345@gmail.com Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/lpfc/lpfc_version.h b/drivers/scsi/lpfc/lpfc_version.h index f06087e478597..7ac9ef281881c 100644 --- a/drivers/scsi/lpfc/lpfc_version.h +++ b/drivers/scsi/lpfc/lpfc_version.h @@ -20,7 +20,7 @@ * included with this package. * *******************************************************************/ -#define LPFC_DRIVER_VERSION "14.4.0.2" +#define LPFC_DRIVER_VERSION "14.4.0.3" #define LPFC_DRIVER_NAME "lpfc" /* Used for SLI 2/3 */