From 3f674e7b670b7b7d9261935820e4eba3c059f835 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 6 Mar 2025 14:25:57 -0800 Subject: [PATCH 1/3] nvme-pci: fix stuck reset on concurrent DPC and HP The PCIe error handling has the nvme driver quiesce the device, attempt to restart it, then wait for that restart to complete. A PCIe DPC event also toggles the PCIe link. If the slot doesn't have out-of-band presence detection, this will trigger a pciehp re-enumeration. The error handling that calls nvme_error_resume is holding the device lock while this happens. This lock blocks pciehp's request to disconnect the driver from proceeding. Meanwhile the nvme's reset can't make forward progress because its device isn't there anymore with outstanding IO, and the timeout handler won't do anything to fix it because the device is undergoing error handling. End result: deadlocked. Fix this by having the timeout handler short cut the disabling for a disconnected PCIe device. The downside is that we're relying on an IO timeout to clean up this mess, which could be a minute by default. Tested-by: Nilay Shroff Reviewed-by: Nilay Shroff Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 640590b21728..e59aad269abf 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1411,9 +1411,20 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) struct nvme_dev *dev = nvmeq->dev; struct request *abort_req; struct nvme_command cmd = { }; + struct pci_dev *pdev = to_pci_dev(dev->dev); u32 csts = readl(dev->bar + NVME_REG_CSTS); u8 opcode; + /* + * Shutdown the device immediately if we see it is disconnected. This + * unblocks PCIe error handling if the nvme driver is waiting in + * error_resume for a device that has been removed. We can't unbind the + * driver while the driver's error callback is waiting to complete, so + * we're relying on a timeout to break that deadlock if a removal + * occurs while reset work is running. + */ + if (pci_dev_is_disconnected(pdev)) + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); if (nvme_state_terminal(&dev->ctrl)) goto disable; @@ -1421,7 +1432,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) * the recovery mechanism will surely fail. */ mb(); - if (pci_channel_offline(to_pci_dev(dev->dev))) + if (pci_channel_offline(pdev)) return BLK_EH_RESET_TIMER; /* From bf9b8020a80d32f6aa80591297a087d0519dc931 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 6 Mar 2025 17:55:48 +0900 Subject: [PATCH 2/3] nvmet: pci-epf: Set NVMET_PCI_EPF_Q_LIVE when a queue is fully created The function nvmet_pci_epf_create_sq() use test_and_set_bit() to check that a submission queue is not already live and if not, set the NVMET_PCI_EPF_Q_LIVE queue flag to declare the sq live (ready to use). However, this is done on entry to the function, before the submission queue is actually fully initialized and ready to use. This creates a race situation with the function nvmet_pci_epf_poll_sqs_work() which looks at the NVMET_PCI_EPF_Q_LIVE queue flag to poll the submission queue when it is live. This race can lead to invalid DMA transfers if nvmet_pci_epf_poll_sqs_work() runs after the NVMET_PCI_EPF_Q_LIVE flag is set but before setting the sq pci address and doorbell ofset. Avoid this race by only testing the NVMET_PCI_EPF_Q_LIVE flag on entry to nvmet_pci_epf_create_sq() and setting it after the submission queue is fully setup before nvmet_pci_epf_create_sq() returns success. Since the function nvmet_pci_epf_create_cq() also has the same racy flag setting pattern, also make a similar change in that function. Fixes: 0faa0fe6f90e ("nvmet: New NVMe PCI endpoint function target driver") Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/pci-epf.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c index 565d2bd36dcd..d55ad334670c 100644 --- a/drivers/nvme/target/pci-epf.c +++ b/drivers/nvme/target/pci-epf.c @@ -1265,7 +1265,7 @@ static u16 nvmet_pci_epf_create_cq(struct nvmet_ctrl *tctrl, struct nvmet_pci_epf_queue *cq = &ctrl->cq[cqid]; u16 status; - if (test_and_set_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags)) + if (test_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags)) return NVME_SC_QID_INVALID | NVME_STATUS_DNR; if (!(flags & NVME_QUEUE_PHYS_CONTIG)) @@ -1300,6 +1300,8 @@ static u16 nvmet_pci_epf_create_cq(struct nvmet_ctrl *tctrl, if (status != NVME_SC_SUCCESS) goto err; + set_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags); + dev_dbg(ctrl->dev, "CQ[%u]: %u entries of %zu B, IRQ vector %u\n", cqid, qsize, cq->qes, cq->vector); @@ -1307,7 +1309,6 @@ static u16 nvmet_pci_epf_create_cq(struct nvmet_ctrl *tctrl, err: clear_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags); - clear_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags); return status; } @@ -1333,7 +1334,7 @@ static u16 nvmet_pci_epf_create_sq(struct nvmet_ctrl *tctrl, struct nvmet_pci_epf_queue *sq = &ctrl->sq[sqid]; u16 status; - if (test_and_set_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags)) + if (test_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags)) return NVME_SC_QID_INVALID | NVME_STATUS_DNR; if (!(flags & NVME_QUEUE_PHYS_CONTIG)) @@ -1355,7 +1356,7 @@ static u16 nvmet_pci_epf_create_sq(struct nvmet_ctrl *tctrl, status = nvmet_sq_create(tctrl, &sq->nvme_sq, sqid, sq->depth); if (status != NVME_SC_SUCCESS) - goto out_clear_bit; + return status; sq->iod_wq = alloc_workqueue("sq%d_wq", WQ_UNBOUND, min_t(int, sq->depth, WQ_MAX_ACTIVE), sqid); @@ -1365,6 +1366,8 @@ static u16 nvmet_pci_epf_create_sq(struct nvmet_ctrl *tctrl, goto out_destroy_sq; } + set_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags); + dev_dbg(ctrl->dev, "SQ[%u]: %u entries of %zu B\n", sqid, qsize, sq->qes); @@ -1372,8 +1375,6 @@ static u16 nvmet_pci_epf_create_sq(struct nvmet_ctrl *tctrl, out_destroy_sq: nvmet_sq_destroy(&sq->nvme_sq); -out_clear_bit: - clear_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags); return status; } From 39393f5c5c795992507aa5005a9d58396a5b07f1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 6 Mar 2025 17:55:49 +0900 Subject: [PATCH 3/3] nvmet: pci-epf: Do not add an IRQ vector if not needed The function nvmet_pci_epf_create_cq() always unconditionally calls nvmet_pci_epf_add_irq_vector() to add an IRQ vector for a completion queue. But this is not correct if the host requested the creation of a completion queue for polling, without an IRQ vector specified (i.e. the flag NVME_CQ_IRQ_ENABLED is not set). Fix this by calling nvmet_pci_epf_add_irq_vector() and setting the queue flag NVMET_PCI_EPF_Q_IRQ_ENABLED for the cq only if NVME_CQ_IRQ_ENABLED is set. While at it, also fix the error path to add the missing removal of the added IRQ vector if nvmet_cq_create() fails. Fixes: 0faa0fe6f90e ("nvmet: New NVMe PCI endpoint function target driver") Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/pci-epf.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c index d55ad334670c..b1e31483f157 100644 --- a/drivers/nvme/target/pci-epf.c +++ b/drivers/nvme/target/pci-epf.c @@ -1271,9 +1271,6 @@ static u16 nvmet_pci_epf_create_cq(struct nvmet_ctrl *tctrl, if (!(flags & NVME_QUEUE_PHYS_CONTIG)) return NVME_SC_INVALID_QUEUE | NVME_STATUS_DNR; - if (flags & NVME_CQ_IRQ_ENABLED) - set_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags); - cq->pci_addr = pci_addr; cq->qid = cqid; cq->depth = qsize + 1; @@ -1290,10 +1287,11 @@ static u16 nvmet_pci_epf_create_cq(struct nvmet_ctrl *tctrl, cq->qes = ctrl->io_cqes; cq->pci_size = cq->qes * cq->depth; - cq->iv = nvmet_pci_epf_add_irq_vector(ctrl, vector); - if (!cq->iv) { - status = NVME_SC_INTERNAL | NVME_STATUS_DNR; - goto err; + if (flags & NVME_CQ_IRQ_ENABLED) { + cq->iv = nvmet_pci_epf_add_irq_vector(ctrl, vector); + if (!cq->iv) + return NVME_SC_INTERNAL | NVME_STATUS_DNR; + set_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags); } status = nvmet_cq_create(tctrl, &cq->nvme_cq, cqid, cq->depth); @@ -1308,7 +1306,8 @@ static u16 nvmet_pci_epf_create_cq(struct nvmet_ctrl *tctrl, return NVME_SC_SUCCESS; err: - clear_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags); + if (test_and_clear_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags)) + nvmet_pci_epf_remove_irq_vector(ctrl, cq->vector); return status; }