From c5c9f25b98a568451d665afe4aeefe17bf9f2995 Mon Sep 17 00:00:00 2001
From: Nishanth Aravamudan
Date: Tue, 24 Nov 2015 09:55:05 -0700
Subject: [PATCH 1/4] NVMe: default to 4k device page size

We received a bug report recently when DDW (64-bit direct DMA on Power)
is not enabled for NVMe devices. In that case, we fall back to 32-bit
DMA via the IOMMU, which is always done via 4K TCEs (Translation
Control Entries).

The NVMe device driver, though, assumes that the DMA alignment for the
PRP entries will match the device's page size, and that the DMA
alignment matches the kernel's page alignment. On Power, the IOMMU page
size, as mentioned above, can be 4K, while the device can have a page
size of 8K and the kernel a page size of 64K.

This eventually trips the BUG_ON in nvme_setup_prps(), as we have a
'dma_len' that is a multiple of 4K but not of 8K (e.g., 0xF000).

In this particular case of page sizes, we clearly want to use the
IOMMU's page size in the driver. And generally, the NVMe driver in this
function should be using the IOMMU's page size for the default device
page size, rather than the kernel's page size. There is currently no
API to obtain the IOMMU's page size across all architectures, so in the
interest of a stop-gap fix to this functional issue, default the NVMe
device page size to 4K, with the intent of adding such an API and
implementation across all architectures in the next merge window.

With the functionally equivalent v3 of this patch, our hardware test
exerciser survives when using 32-bit DMA; without the patch, the kernel
will BUG within a few minutes.

Signed-off-by: Nishanth Aravamudan
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/pci.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 930042fa2d694..d9d6229e9f3f0 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1728,9 +1728,13 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
 	u32 aqa;
 	u64 cap = lo_hi_readq(&dev->bar->cap);
 	struct nvme_queue *nvmeq;
-	unsigned page_shift = PAGE_SHIFT;
+	/*
+	 * default to a 4K page size, with the intention to update this
+	 * path in the future to accommodate architectures with differing
+	 * kernel and IO page sizes.
+	 */
+	unsigned page_shift = 12;
 	unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12;
-	unsigned dev_page_max = NVME_CAP_MPSMAX(cap) + 12;
 
 	if (page_shift < dev_page_min) {
 		dev_err(dev->dev,
@@ -1739,13 +1743,6 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
 				1 << page_shift);
 		return -ENODEV;
 	}
-	if (page_shift > dev_page_max) {
-		dev_info(dev->dev,
-			"Device maximum page size (%u) smaller than "
-			"host (%u); enabling work-around\n",
-			1 << dev_page_max, 1 << page_shift);
-		page_shift = dev_page_max;
-	}
 
 	dev->subsystem = readl(&dev->bar->vs) >= NVME_VS(1, 1) ?
 						NVME_CAP_NSSRC(cap) : 0;
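
For a concrete feel for the failure mode above, the sketch below is a
standalone userspace C program, not driver code: it shows that the 0xF000
segment from the commit message is 4K-aligned but not 8K-aligned, and it
mirrors the CAP.MPSMIN check the patch keeps (MPSMIN lives in CAP bits
51:48). The CAP value used here is invented for illustration.

/*
 * Standalone userspace sketch, not driver code. It illustrates why a
 * 0xF000-byte DMA segment satisfies a 4K page size but not an 8K one,
 * and the minimum-page-size check kept by the patch. The cap value is
 * made up.
 */
#include <stdio.h>
#include <stdint.h>

#define NVME_CAP_MPSMIN(cap)	(((cap) >> 48) & 0xf)

int main(void)
{
	uint64_t cap = 0;		/* hypothetical CAP: MPSMIN = 0, i.e. 4K minimum */
	unsigned page_shift = 12;	/* the patch's default: 4K */
	unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12;
	unsigned dma_len = 0xF000;

	printf("0x%x %% 4K = %u, 0x%x %% 8K = %u\n",
	       dma_len, dma_len % (1u << 12), dma_len, dma_len % (1u << 13));

	if (page_shift < dev_page_min)
		printf("device minimum page size (%u) larger than 4K: -ENODEV\n",
		       1u << dev_page_min);
	else
		printf("4K host page size accepted by the device\n");
	return 0;
}

Run as-is, it reports a zero remainder against 4K and a 4096-byte
remainder against 8K, which is exactly the mismatch nvme_setup_prps()
cannot tolerate once the device page size is assumed to be 8K.
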
From bf508e910b02a6107a5aa054e03c6fc8a65dae1e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 16 Oct 2015 07:58:31 +0200
Subject: [PATCH 2/4] nvme: add missing unmaps in nvme_queue_rq

When we fail various metadata-related operations in nvme_queue_rq, we
need to unmap the data SGL.

Cc: stable@vger.kernel.org
Signed-off-by: Christoph Hellwig
Signed-off-by: Keith Busch
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/pci.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d9d6229e9f3f0..f3b53af789efc 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -896,19 +896,28 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 			goto retry_cmd;
 		}
 		if (blk_integrity_rq(req)) {
-			if (blk_rq_count_integrity_sg(req->q, req->bio) != 1)
+			if (blk_rq_count_integrity_sg(req->q, req->bio) != 1) {
+				dma_unmap_sg(dev->dev, iod->sg, iod->nents,
+						dma_dir);
 				goto error_cmd;
+			}
 
 			sg_init_table(iod->meta_sg, 1);
 			if (blk_rq_map_integrity_sg(
-					req->q, req->bio, iod->meta_sg) != 1)
+					req->q, req->bio, iod->meta_sg) != 1) {
+				dma_unmap_sg(dev->dev, iod->sg, iod->nents,
+						dma_dir);
 				goto error_cmd;
+			}
 
 			if (rq_data_dir(req))
 				nvme_dif_remap(req, nvme_dif_prep);
 
-			if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir))
+			if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir)) {
+				dma_unmap_sg(dev->dev, iod->sg, iod->nents,
+						dma_dir);
 				goto error_cmd;
+			}
 		}
 	}
 

From 55ce0da1da287822e5ffb5fcd6e357180d5ba4cd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 30 Oct 2015 20:47:04 +0800
Subject: [PATCH 3/4] block: fix blk_abort_request for blk-mq drivers

We only added the request to the request list for the !blk-mq case, so
we should only delete it in that case as well.

Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 block/blk-timeout.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 246dfb16c3d98..aa40aa93381b6 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -158,11 +158,13 @@ void blk_abort_request(struct request *req)
 {
 	if (blk_mark_rq_complete(req))
 		return;
-	blk_delete_timer(req);
-	if (req->q->mq_ops)
+
+	if (req->q->mq_ops) {
 		blk_mq_rq_timed_out(req, false);
-	else
+	} else {
+		blk_delete_timer(req);
 		blk_rq_timed_out(req);
+	}
 }
 EXPORT_SYMBOL_GPL(blk_abort_request);
 
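
Patches 2 and 3 above share one rule: undo only what a given path
actually set up, and undo it on every failure path that follows the
setup. The sketch below is a standalone userspace mock of that pattern
as patch 2 applies it; the function names are invented stand-ins, not
kernel APIs.

/*
 * Standalone userspace sketch of the unwind rule: once the data
 * scatterlist has been mapped, every later failure path must unmap it
 * before bailing out. All names are invented for illustration.
 */
#include <stdio.h>
#include <stdbool.h>

static bool map_data_sgl(void)     { puts("map data SGL");    return true; }
static void unmap_data_sgl(void)   { puts("unmap data SGL"); }
static bool meta_sg_count_ok(void) { return false; /* simulate the failing check */ }
static bool map_meta_sgl(void)     { puts("map metadata SGL"); return true; }

static int mock_queue_rq(void)
{
	if (!map_data_sgl())
		return -1;		/* nothing mapped yet, nothing to undo */

	if (!meta_sg_count_ok())
		goto unmap;		/* data SGL is mapped: must unmap it */
	if (!map_meta_sgl())
		goto unmap;

	puts("submit command");
	return 0;

unmap:
	unmap_data_sgl();		/* the unmap the original code was missing */
	return -1;
}

int main(void)
{
	return mock_queue_rq() ? 1 : 0;
}
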
From dcd8376c369fa8fde8269e721b14f50475dd397b Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 25 Nov 2015 10:12:54 -0700
Subject: [PATCH 4/4] Revert "blk-flush: Queue through IO scheduler when flush not required"

This reverts commit 1b2ff19e6a957b1ef0f365ad331b608af80e932e.

Jan writes:

--

Thanks for the report! After some investigation I found out we allocate
elevator-specific data in __get_request() only for non-flush requests.
And this is actually required, since the flush machinery uses the space
in struct request for something else. Doh. So my patch is just wrong,
and it is not easy to fix, since at the time __get_request() is called
we are not sure whether the flush machinery will be used in the end.
Jens, please revert 1b2ff19e6a957b1ef0f365ad331b608af80e932e. Thanks!

I'm somewhat surprised that you can reliably hit the race where flushing
gets disabled for the device just while the request is in flight. But I
guess during boot it makes some sense.

--

So let's just revert it; we can fix the queue run manually after the
fact. This race is rare enough that it didn't trigger in testing, since
it requires the specific disable-while-in-flight scenario.
---
 block/blk-flush.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index c81d56ec308f9..9c423e53324a2 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -422,7 +422,7 @@ void blk_insert_flush(struct request *rq)
 		if (q->mq_ops) {
 			blk_mq_insert_request(rq, false, false, true);
 		} else
-			q->elevator->type->ops.elevator_add_req_fn(q, rq);
+			list_add_tail(&rq->queuelist, &q->queue_head);
 		return;
 	}
 
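
As background for the revert: on the legacy (non-blk-mq) path,
elevator-private data is allocated in __get_request() only for non-flush
requests, so handing a flush request to the I/O scheduler would let the
scheduler touch space the flush machinery is reusing. The mock below is
a standalone userspace sketch of that constraint; the structures and
names are invented and are not the block layer's.

/*
 * Standalone userspace mock, not block-layer code: flush requests carry
 * no elevator-private data, so they must bypass the scheduler and go to
 * the queue head directly, which is what the revert restores.
 */
#include <stdio.h>
#include <stddef.h>

struct mock_request {
	int is_flush;
	void *elv_priv;		/* allocated only for non-flush requests */
};

static void scheduler_insert(struct mock_request *rq)
{
	/* a real I/O scheduler would dereference its per-request data here */
	if (!rq->elv_priv)
		printf("  BUG: scheduler given a request without elevator data\n");
	else
		printf("  scheduler queues request, priv=%p\n", rq->elv_priv);
}

static void queue_head_insert(struct mock_request *rq)
{
	printf("  dispatched directly via queue_head (flush=%d)\n", rq->is_flush);
}

static void insert_request(struct mock_request *rq)
{
	if (rq->is_flush)
		queue_head_insert(rq);	/* the behavior the revert restores */
	else
		scheduler_insert(rq);	/* safe: elv_priv was allocated */
}

int main(void)
{
	int priv_stub;
	struct mock_request normal = { .is_flush = 0, .elv_priv = &priv_stub };
	struct mock_request flush  = { .is_flush = 1, .elv_priv = NULL };

	insert_request(&normal);
	insert_request(&flush);
	return 0;
}
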