From de7b75d82f70c5469675b99ad632983c50b6f7e7 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 9 Nov 2018 15:58:40 -0700
Subject: [PATCH 1/6] floppy: fix race condition in __floppy_read_block_0()

LKP recently reported a hang at bootup in the floppy code:

[  245.678853] INFO: task mount:580 blocked for more than 120 seconds.
[  245.679906]       Tainted: G                T 4.19.0-rc6-00172-ga9f38e1 #1
[  245.680959] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  245.682181] mount           D 6372   580      1 0x00000004
[  245.683023] Call Trace:
[  245.683425]  __schedule+0x2df/0x570
[  245.683975]  schedule+0x2d/0x80
[  245.684476]  schedule_timeout+0x19d/0x330
[  245.685090]  ? wait_for_common+0xa5/0x170
[  245.685735]  wait_for_common+0xac/0x170
[  245.686339]  ? do_sched_yield+0x90/0x90
[  245.686935]  wait_for_completion+0x12/0x20
[  245.687571]  __floppy_read_block_0+0xfb/0x150
[  245.688244]  ? floppy_resume+0x40/0x40
[  245.688844]  floppy_revalidate+0x20f/0x240
[  245.689486]  check_disk_change+0x43/0x60
[  245.690087]  floppy_open+0x1ea/0x360
[  245.690653]  __blkdev_get+0xb4/0x4d0
[  245.691212]  ? blkdev_get+0x1db/0x370
[  245.691777]  blkdev_get+0x1f3/0x370
[  245.692351]  ? path_put+0x15/0x20
[  245.692871]  ? lookup_bdev+0x4b/0x90
[  245.693539]  blkdev_get_by_path+0x3d/0x80
[  245.694165]  mount_bdev+0x2a/0x190
[  245.694695]  squashfs_mount+0x10/0x20
[  245.695271]  ? squashfs_alloc_inode+0x30/0x30
[  245.695960]  mount_fs+0xf/0x90
[  245.696451]  vfs_kern_mount+0x43/0x130
[  245.697036]  do_mount+0x187/0xc40
[  245.697563]  ? memdup_user+0x28/0x50
[  245.698124]  ksys_mount+0x60/0xc0
[  245.698639]  sys_mount+0x19/0x20
[  245.699167]  do_int80_syscall_32+0x61/0x130
[  245.699813]  entry_INT80_32+0xc7/0xc7

showing that we never complete that read request. The reason is that
the completion setup is racy - it initializes the completion event
AFTER submitting the IO, which means that the IO could complete
before/during the init. If it does, we are passing garbage to
complete() and we may sleep forever waiting for the event to
occur.

Fixes: 7b7b68bba5ef ("floppy: bail out in open() if drive is not responding to block0 read")
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/floppy.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index a8cfa011c2848..fb23578e9a416 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -4148,10 +4148,11 @@ static int __floppy_read_block_0(struct block_device *bdev, int drive)
 	bio.bi_end_io = floppy_rb0_cb;
 	bio_set_op_attrs(&bio, REQ_OP_READ, 0);
 
+	init_completion(&cbdata.complete);
+
 	submit_bio(&bio);
 	process_fd_request();
 
-	init_completion(&cbdata.complete);
 	wait_for_completion(&cbdata.complete);
 
 	__free_page(page);

From 18e962ac0781bcb70d433de3b2a325ff792b4288 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Mon, 12 Nov 2018 00:08:46 -0800
Subject: [PATCH 2/6] kyber: fix wrong strlcpy() size in trace_kyber_latency()

When copying to the latency type, we should be passing LATENCY_TYPE_LEN,
not DOMAIN_LEN (this isn't a problem in practice because we only pass
"total" or "I/O"). Fix it by changing all of the strlcpy() calls to use
sizeof().

Fixes: 6c3b7af1c975 ("kyber: add tracepoints")
Reported-by: Jordan Glover <Golden_Miller83@protonmail.ch>
Tested-by: Jordan Glover <Golden_Miller83@protonmail.ch>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/trace/events/kyber.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/trace/events/kyber.h b/include/trace/events/kyber.h
index a9834c37ac400..c0e7d24ca2568 100644
--- a/include/trace/events/kyber.h
+++ b/include/trace/events/kyber.h
@@ -31,8 +31,8 @@ TRACE_EVENT(kyber_latency,
 
 	TP_fast_assign(
 		__entry->dev		= disk_devt(dev_to_disk(kobj_to_dev(q->kobj.parent)));
-		strlcpy(__entry->domain, domain, DOMAIN_LEN);
-		strlcpy(__entry->type, type, DOMAIN_LEN);
+		strlcpy(__entry->domain, domain, sizeof(__entry->domain));
+		strlcpy(__entry->type, type, sizeof(__entry->type));
 		__entry->percentile	= percentile;
 		__entry->numerator	= numerator;
 		__entry->denominator	= denominator;
@@ -60,7 +60,7 @@ TRACE_EVENT(kyber_adjust,
 
 	TP_fast_assign(
 		__entry->dev		= disk_devt(dev_to_disk(kobj_to_dev(q->kobj.parent)));
-		strlcpy(__entry->domain, domain, DOMAIN_LEN);
+		strlcpy(__entry->domain, domain, sizeof(__entry->domain));
 		__entry->depth		= depth;
 	),
 
@@ -82,7 +82,7 @@ TRACE_EVENT(kyber_throttled,
 
 	TP_fast_assign(
 		__entry->dev		= disk_devt(dev_to_disk(kobj_to_dev(q->kobj.parent)));
-		strlcpy(__entry->domain, domain, DOMAIN_LEN);
+		strlcpy(__entry->domain, domain, sizeof(__entry->domain));
 	),
 
 	TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev),

From ca474b73896bf6e0c1eb8787eb217b0f80221610 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.com>
Date: Mon, 12 Nov 2018 10:35:25 -0700
Subject: [PATCH 3/6] block: copy ioprio in __bio_clone_fast() and bounce

We need to copy the io priority, too; otherwise the clone will run
with a different priority than the original one.

Fixes: 43b62ce3ff0a ("block: move bio io prio to a new field")
Signed-off-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jean Delvare <jdelvare@suse.de>

Fixed up subject, and ordered stores.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c    | 1 +
 block/bounce.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/block/bio.c b/block/bio.c
index a50d59236b197..4f4d9884443b6 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -605,6 +605,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
 	if (bio_flagged(bio_src, BIO_THROTTLED))
 		bio_set_flag(bio, BIO_THROTTLED);
 	bio->bi_opf = bio_src->bi_opf;
+	bio->bi_ioprio = bio_src->bi_ioprio;
 	bio->bi_write_hint = bio_src->bi_write_hint;
 	bio->bi_iter = bio_src->bi_iter;
 	bio->bi_io_vec = bio_src->bi_io_vec;
diff --git a/block/bounce.c b/block/bounce.c
index 36869afc258cc..559c55bda040e 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -248,6 +248,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
 		return NULL;
 	bio->bi_disk		= bio_src->bi_disk;
 	bio->bi_opf		= bio_src->bi_opf;
+	bio->bi_ioprio		= bio_src->bi_ioprio;
 	bio->bi_write_hint	= bio_src->bi_write_hint;
 	bio->bi_iter.bi_sector	= bio_src->bi_iter.bi_sector;
 	bio->bi_iter.bi_size	= bio_src->bi_iter.bi_size;

From 410b5c7b48368317af95f0113692561d01d8144e Mon Sep 17 00:00:00 2001
From: Diego Viola <diego.viola@gmail.com>
Date: Mon, 12 Nov 2018 17:22:52 -0200
Subject: [PATCH 4/6] libata: blacklist SAMSUNG MZ7TD256HAFV-000L9 SSD

med_power_with_dipm still causes freezes after updating the firmware to
the latest version (DXT04L5Q).

Set model_rev to NULL and blacklist the device.

Cc: stable@vger.kernel.org
Signed-off-by: Diego Viola <diego.viola@gmail.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/ata/libata-core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 6e594644cb1d3..a7f5202a48152 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -4553,7 +4553,7 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = {
 	/* These specific Samsung models/firmware-revs do not handle LPM well */
 	{ "SAMSUNG MZMPC128HBFU-000MV", "CXM14M1Q", ATA_HORKAGE_NOLPM, },
 	{ "SAMSUNG SSD PM830 mSATA *",  "CXM13D1Q", ATA_HORKAGE_NOLPM, },
-	{ "SAMSUNG MZ7TD256HAFV-000L9", "DXT02L5Q", ATA_HORKAGE_NOLPM, },
+	{ "SAMSUNG MZ7TD256HAFV-000L9", NULL,       ATA_HORKAGE_NOLPM, },
 
 	/* devices that don't properly handle queued TRIM commands */
 	{ "Micron_M500IT_*",		"MU01",	ATA_HORKAGE_NO_NCQ_TRIM |

From 4800bf7bc8c725e955fcbc6191cc872f43f506d3 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 14 Nov 2018 08:17:18 -0700
Subject: [PATCH 5/6] block: fix 32 bit overflow in __blkdev_issue_discard()

A discard cleanup merged into 4.20-rc2 causes fstests xfs/259 to
fall into an endless loop in the discard code. The test is creating
a device that is exactly 2^32 sectors in size to test mkfs boundary
conditions around the 32 bit sector overflow region.

mkfs issues a discard for the entire device size by default, and
hence this throws a sector count of 2^32 into
blkdev_issue_discard(). It takes the number of sectors to discard as
a sector_t - a 64 bit value.

The commit ba5d73851e71 ("block: cleanup __blkdev_issue_discard")
takes this sector count and casts it to a 32 bit value before
comapring it against the maximum allowed discard size the device
has. This truncates away the upper 32 bits, and so if the lower 32
bits of the sector count is zero, it starts issuing discards of
length 0. This causes the code to fall into an endless loop, issuing
a zero length discards over and over again on the same sector.

Fixes: ba5d73851e71 ("block: cleanup __blkdev_issue_discard")
Tested-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>

Killed pointless WARN_ON().

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-lib.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/block/blk-lib.c b/block/blk-lib.c
index e8b3bb9bf3759..5f2c429d43784 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -55,9 +55,11 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		return -EINVAL;
 
 	while (nr_sects) {
-		unsigned int req_sects = min_t(unsigned int, nr_sects,
+		sector_t req_sects = min_t(sector_t, nr_sects,
 				bio_allowed_max_sectors(q));
 
+		WARN_ON_ONCE((req_sects << 9) > UINT_MAX);
+
 		bio = blk_next_bio(bio, 0, gfp_mask);
 		bio->bi_iter.bi_sector = sector;
 		bio_set_dev(bio, bdev);

From 8dc765d438f1e42b3e8227b3b09fad7d73f4ec9a Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Wed, 14 Nov 2018 16:25:51 +0800
Subject: [PATCH 6/6] SCSI: fix queue cleanup race before queue initialization
 is done

c2856ae2f315d ("blk-mq: quiesce queue before freeing queue") has
already fixed this race, however the implied synchronize_rcu()
in blk_mq_quiesce_queue() can slow down LUN probe a lot, so caused
performance regression.

Then 1311326cf4755c7 ("blk-mq: avoid to synchronize rcu inside blk_cleanup_queue()")
tried to quiesce queue for avoiding unnecessary synchronize_rcu()
only when queue initialization is done, because it is usual to see
lots of inexistent LUNs which need to be probed.

However, turns out it isn't safe to quiesce queue only when queue
initialization is done. Because when one SCSI command is completed,
the user of sending command can be waken up immediately, then the
scsi device may be removed, meantime the run queue in scsi_end_request()
is still in-progress, so kernel panic can be caused.

In Red Hat QE lab, there are several reports about this kind of kernel
panic triggered during kernel booting.

This patch tries to address the issue by grabing one queue usage
counter during freeing one request and the following run queue.

Fixes: 1311326cf4755c7 ("blk-mq: avoid to synchronize rcu inside blk_cleanup_queue()")
Cc: Andrew Jones <drjones@redhat.com>
Cc: Bart Van Assche <bart.vanassche@wdc.com>
Cc: linux-scsi@vger.kernel.org
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: James E.J. Bottomley <jejb@linux.vnet.ibm.com>
Cc: stable <stable@vger.kernel.org>
Cc: jianchao.wang <jianchao.w.wang@oracle.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c        | 5 ++---
 drivers/scsi/scsi_lib.c | 8 ++++++++
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index ce12515f9b9b9..deb56932f8c46 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -798,9 +798,8 @@ void blk_cleanup_queue(struct request_queue *q)
 	 * dispatch may still be in-progress since we dispatch requests
 	 * from more than one contexts.
 	 *
-	 * No need to quiesce queue if it isn't initialized yet since
-	 * blk_freeze_queue() should be enough for cases of passthrough
-	 * request.
+	 * We rely on driver to deal with the race in case that queue
+	 * initialization isn't done.
 	 */
 	if (q->mq_ops && blk_queue_init_done(q))
 		blk_mq_quiesce_queue(q);
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index c7fccbb8f5545..fa6e0c3b3aa67 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -697,6 +697,12 @@ static bool scsi_end_request(struct request *req, blk_status_t error,
 		 */
 		scsi_mq_uninit_cmd(cmd);
 
+		/*
+		 * queue is still alive, so grab the ref for preventing it
+		 * from being cleaned up during running queue.
+		 */
+		percpu_ref_get(&q->q_usage_counter);
+
 		__blk_mq_end_request(req, error);
 
 		if (scsi_target(sdev)->single_lun ||
@@ -704,6 +710,8 @@ static bool scsi_end_request(struct request *req, blk_status_t error,
 			kblockd_schedule_work(&sdev->requeue_work);
 		else
 			blk_mq_run_hw_queues(q, true);
+
+		percpu_ref_put(&q->q_usage_counter);
 	} else {
 		unsigned long flags;