From 3db70a853202c252a8ebefa71ccb088ad149cdd2 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Wed, 25 Nov 2015 17:52:55 -0500 Subject: [PATCH 01/25] xen/blkfront: realloc ring info in blkif_resume Need to reallocate ring info in the resume path, because info->rinfo was freed in blkif_free(). And 'multi-queue-max-queues' backend reports may have been changed. Signed-off-by: Bob Liu Reported-and-Tested-by: Konrad Rzeszutek Wilk Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 74 ++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 29 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 8a8dc91c39f72..83eb9e6bf8b06 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1873,6 +1873,43 @@ static int talk_to_blkback(struct xenbus_device *dev, return err; } +static int negotiate_mq(struct blkfront_info *info) +{ + unsigned int backend_max_queues = 0; + int err; + unsigned int i; + + BUG_ON(info->nr_rings); + + /* Check if backend supports multiple queues. */ + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "multi-queue-max-queues", "%u", &backend_max_queues); + if (err < 0) + backend_max_queues = 1; + + info->nr_rings = min(backend_max_queues, xen_blkif_max_queues); + /* We need at least one ring. */ + if (!info->nr_rings) + info->nr_rings = 1; + + info->rinfo = kzalloc(sizeof(struct blkfront_ring_info) * info->nr_rings, GFP_KERNEL); + if (!info->rinfo) { + xenbus_dev_fatal(info->xbdev, -ENOMEM, "allocating ring_info structure"); + return -ENOMEM; + } + + for (i = 0; i < info->nr_rings; i++) { + struct blkfront_ring_info *rinfo; + + rinfo = &info->rinfo[i]; + INIT_LIST_HEAD(&rinfo->indirect_pages); + INIT_LIST_HEAD(&rinfo->grants); + rinfo->dev_info = info; + INIT_WORK(&rinfo->work, blkif_restart_queue); + spin_lock_init(&rinfo->ring_lock); + } + return 0; +} /** * Entry point to this code when a new device is created. Allocate the basic * structures and the ring buffer for communication with the backend, and @@ -1883,9 +1920,7 @@ static int blkfront_probe(struct xenbus_device *dev, const struct xenbus_device_id *id) { int err, vdevice; - unsigned int r_index; struct blkfront_info *info; - unsigned int backend_max_queues = 0; /* FIXME: Use dynamic device id if this is not set. */ err = xenbus_scanf(XBT_NIL, dev->nodename, @@ -1936,33 +1971,10 @@ static int blkfront_probe(struct xenbus_device *dev, } info->xbdev = dev; - /* Check if backend supports multiple queues. */ - err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, - "multi-queue-max-queues", "%u", &backend_max_queues); - if (err < 0) - backend_max_queues = 1; - - info->nr_rings = min(backend_max_queues, xen_blkif_max_queues); - /* We need at least one ring. */ - if (!info->nr_rings) - info->nr_rings = 1; - - info->rinfo = kzalloc(sizeof(struct blkfront_ring_info) * info->nr_rings, GFP_KERNEL); - if (!info->rinfo) { - xenbus_dev_fatal(dev, -ENOMEM, "allocating ring_info structure"); + err = negotiate_mq(info); + if (err) { kfree(info); - return -ENOMEM; - } - - for (r_index = 0; r_index < info->nr_rings; r_index++) { - struct blkfront_ring_info *rinfo; - - rinfo = &info->rinfo[r_index]; - INIT_LIST_HEAD(&rinfo->indirect_pages); - INIT_LIST_HEAD(&rinfo->grants); - rinfo->dev_info = info; - INIT_WORK(&rinfo->work, blkif_restart_queue); - spin_lock_init(&rinfo->ring_lock); + return err; } mutex_init(&info->mutex); @@ -2123,12 +2135,16 @@ static int blkif_recover(struct blkfront_info *info) static int blkfront_resume(struct xenbus_device *dev) { struct blkfront_info *info = dev_get_drvdata(&dev->dev); - int err; + int err = 0; dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename); blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); + err = negotiate_mq(info); + if (err) + return err; + err = talk_to_blkback(dev, info); /* From 7ddc971f86aa0a4cee9f6886c356a052461957ae Mon Sep 17 00:00:00 2001 From: Mike Krinkin Date: Sat, 30 Jan 2016 19:09:59 +0300 Subject: [PATCH 02/25] block: fix use-after-free in dio_bio_complete kasan reported the following error when i ran xfstest: [ 701.826854] ================================================================== [ 701.826864] BUG: KASAN: use-after-free in dio_bio_complete+0x41a/0x600 at addr ffff880080b95f94 [ 701.826870] Read of size 4 by task loop2/3874 [ 701.826879] page:ffffea000202e540 count:0 mapcount:0 mapping: (null) index:0x0 [ 701.826890] flags: 0x100000000000000() [ 701.826895] page dumped because: kasan: bad access detected [ 701.826904] CPU: 3 PID: 3874 Comm: loop2 Tainted: G B W L 4.5.0-rc1-next-20160129 #83 [ 701.826910] Hardware name: LENOVO 23205NG/23205NG, BIOS G2ET95WW (2.55 ) 07/09/2013 [ 701.826917] ffff88008fadf800 ffff88008fadf758 ffffffff81ca67bb 0000000041b58ab3 [ 701.826941] ffffffff830d1e74 ffffffff81ca6724 ffff88008fadf748 ffffffff8161c05c [ 701.826963] 0000000000000282 ffff88008fadf800 ffffed0010172bf2 ffffea000202e540 [ 701.826987] Call Trace: [ 701.826997] [] dump_stack+0x97/0xdc [ 701.827005] [] ? _atomic_dec_and_lock+0xc4/0xc4 [ 701.827014] [] ? __dump_page+0x32c/0x490 [ 701.827023] [] kasan_report_error+0x5f3/0x8b0 [ 701.827033] [] ? dio_bio_complete+0x41a/0x600 [ 701.827040] [] __asan_report_load4_noabort+0x59/0x80 [ 701.827048] [] ? dio_bio_complete+0x41a/0x600 [ 701.827053] [] dio_bio_complete+0x41a/0x600 [ 701.827057] [] ? blk_queue_exit+0x108/0x270 [ 701.827060] [] dio_bio_end_aio+0xa0/0x4d0 [ 701.827063] [] ? dio_bio_complete+0x600/0x600 [ 701.827067] [] ? blk_account_io_completion+0x316/0x5d0 [ 701.827070] [] bio_endio+0x79/0x200 [ 701.827074] [] blk_update_request+0x1df/0xc50 [ 701.827078] [] blk_mq_end_request+0x57/0x120 [ 701.827081] [] __blk_mq_complete_request+0x310/0x590 [ 701.827084] [] ? set_next_entity+0x2f8/0x2ed0 [ 701.827088] [] ? put_prev_entity+0x22d/0x2a70 [ 701.827091] [] blk_mq_complete_request+0x5b/0x80 [ 701.827094] [] loop_queue_work+0x273/0x19d0 [ 701.827098] [] ? finish_task_switch+0x1c8/0x8e0 [ 701.827101] [] ? trace_hardirqs_on_caller+0x18/0x6c0 [ 701.827104] [] ? lo_read_simple+0x890/0x890 [ 701.827108] [] ? debug_check_no_locks_freed+0x350/0x350 [ 701.827111] [] ? __hrtick_start+0x130/0x130 [ 701.827115] [] ? __schedule+0x936/0x20b0 [ 701.827118] [] ? kthread_worker_fn+0x3ed/0x8d0 [ 701.827121] [] ? kthread_worker_fn+0x21d/0x8d0 [ 701.827125] [] ? trace_hardirqs_on_caller+0x18/0x6c0 [ 701.827128] [] kthread_worker_fn+0x2af/0x8d0 [ 701.827132] [] ? __init_kthread_worker+0x170/0x170 [ 701.827135] [] ? _raw_spin_unlock_irqrestore+0x36/0x60 [ 701.827138] [] ? __init_kthread_worker+0x170/0x170 [ 701.827141] [] ? __init_kthread_worker+0x170/0x170 [ 701.827144] [] kthread+0x24b/0x3a0 [ 701.827148] [] ? kthread_create_on_node+0x4c0/0x4c0 [ 701.827151] [] ? trace_hardirqs_on+0xd/0x10 [ 701.827155] [] ? do_group_exit+0xdd/0x350 [ 701.827158] [] ? kthread_create_on_node+0x4c0/0x4c0 [ 701.827161] [] ret_from_fork+0x3f/0x70 [ 701.827165] [] ? kthread_create_on_node+0x4c0/0x4c0 [ 701.827167] Memory state around the buggy address: [ 701.827170] ffff880080b95e80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 701.827172] ffff880080b95f00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 701.827175] >ffff880080b95f80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 701.827177] ^ [ 701.827179] ffff880080b96000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 701.827182] ffff880080b96080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 701.827183] ================================================================== The problem is that bio_check_pages_dirty calls bio_put, so we must not access bio fields after bio_check_pages_dirty. Fixes: 9b81c842355ac96097ba ("block: don't access bio->bi_error after bio_put()"). Signed-off-by: Mike Krinkin Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- fs/direct-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index 1b2f7ffc8b841..d6a9012d42ad5 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -472,8 +472,8 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) dio->io_error = -EIO; if (dio->is_async && dio->rw == READ && dio->should_dirty) { - bio_check_pages_dirty(bio); /* transfers ownership */ err = bio->bi_error; + bio_check_pages_dirty(bio); /* transfers ownership */ } else { bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; From a0c80efe5956ccce9fe7ae5c78542578c07bc20a Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Mon, 1 Feb 2016 11:19:17 +0100 Subject: [PATCH 03/25] floppy: fix lock_fdc() signal handling floppy_revalidate() doesn't perform any error handling on lock_fdc() result. lock_fdc() might actually be interrupted by a signal (it waits for fdc becoming non-busy interruptibly). In such case, floppy_revalidate() proceeds as if it had claimed the lock, but it fact it doesn't. In case of multiple threads trying to open("/dev/fdX"), this leads to serious corruptions all over the place, because all of a sudden there is no critical section protection (that'd otherwise be guaranteed by locked fd) whatsoever. While at this, fix the fact that the 'interruptible' parameter to lock_fdc() doesn't make any sense whatsoever, because we always wait interruptibly anyway. Most of the lock_fdc() callsites do properly handle error (and propagate EINTR), but floppy_revalidate() and floppy_check_events() don't. Fix this. Spotted by 'syzkaller' tool. Reported-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Signed-off-by: Jiri Kosina --- drivers/block/floppy.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index fa9bb742df6e0..c1aacca88c8f6 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -866,7 +866,7 @@ static void set_fdc(int drive) } /* locks the driver */ -static int lock_fdc(int drive, bool interruptible) +static int lock_fdc(int drive) { if (WARN(atomic_read(&usage_count) == 0, "Trying to lock fdc while usage count=0\n")) @@ -2173,7 +2173,7 @@ static int do_format(int drive, struct format_descr *tmp_format_req) { int ret; - if (lock_fdc(drive, true)) + if (lock_fdc(drive)) return -EINTR; set_floppy(drive); @@ -2960,7 +2960,7 @@ static int user_reset_fdc(int drive, int arg, bool interruptible) { int ret; - if (lock_fdc(drive, interruptible)) + if (lock_fdc(drive)) return -EINTR; if (arg == FD_RESET_ALWAYS) @@ -3243,7 +3243,7 @@ static int set_geometry(unsigned int cmd, struct floppy_struct *g, if (!capable(CAP_SYS_ADMIN)) return -EPERM; mutex_lock(&open_lock); - if (lock_fdc(drive, true)) { + if (lock_fdc(drive)) { mutex_unlock(&open_lock); return -EINTR; } @@ -3263,7 +3263,7 @@ static int set_geometry(unsigned int cmd, struct floppy_struct *g, } else { int oldStretch; - if (lock_fdc(drive, true)) + if (lock_fdc(drive)) return -EINTR; if (cmd != FDDEFPRM) { /* notice a disk change immediately, else @@ -3349,7 +3349,7 @@ static int get_floppy_geometry(int drive, int type, struct floppy_struct **g) if (type) *g = &floppy_type[type]; else { - if (lock_fdc(drive, false)) + if (lock_fdc(drive)) return -EINTR; if (poll_drive(false, 0) == -EINTR) return -EINTR; @@ -3433,7 +3433,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int if (UDRS->fd_ref != 1) /* somebody else has this drive open */ return -EBUSY; - if (lock_fdc(drive, true)) + if (lock_fdc(drive)) return -EINTR; /* do the actual eject. Fails on @@ -3445,7 +3445,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int process_fd_request(); return ret; case FDCLRPRM: - if (lock_fdc(drive, true)) + if (lock_fdc(drive)) return -EINTR; current_type[drive] = NULL; floppy_sizes[drive] = MAX_DISK_SIZE << 1; @@ -3467,7 +3467,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int UDP->flags &= ~FTD_MSG; return 0; case FDFMTBEG: - if (lock_fdc(drive, true)) + if (lock_fdc(drive)) return -EINTR; if (poll_drive(true, FD_RAW_NEED_DISK) == -EINTR) return -EINTR; @@ -3484,7 +3484,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int return do_format(drive, &inparam.f); case FDFMTEND: case FDFLUSH: - if (lock_fdc(drive, true)) + if (lock_fdc(drive)) return -EINTR; return invalidate_drive(bdev); case FDSETEMSGTRESH: @@ -3507,7 +3507,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int outparam = UDP; break; case FDPOLLDRVSTAT: - if (lock_fdc(drive, true)) + if (lock_fdc(drive)) return -EINTR; if (poll_drive(true, FD_RAW_NEED_DISK) == -EINTR) return -EINTR; @@ -3530,7 +3530,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int case FDRAWCMD: if (type) return -EINVAL; - if (lock_fdc(drive, true)) + if (lock_fdc(drive)) return -EINTR; set_floppy(drive); i = raw_cmd_ioctl(cmd, (void __user *)param); @@ -3539,7 +3539,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int process_fd_request(); return i; case FDTWADDLE: - if (lock_fdc(drive, true)) + if (lock_fdc(drive)) return -EINTR; twaddle(); process_fd_request(); @@ -3747,7 +3747,8 @@ static unsigned int floppy_check_events(struct gendisk *disk, return DISK_EVENT_MEDIA_CHANGE; if (time_after(jiffies, UDRS->last_checked + UDP->checkfreq)) { - lock_fdc(drive, false); + if (lock_fdc(drive)) + return -EINTR; poll_drive(false, 0); process_fd_request(); } @@ -3845,7 +3846,9 @@ static int floppy_revalidate(struct gendisk *disk) "VFS: revalidate called on non-open device.\n")) return -EFAULT; - lock_fdc(drive, false); + res = lock_fdc(drive); + if (res) + return res; cf = (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) || test_bit(FD_VERIFY_BIT, &UDRS->flags)); if (!(cf || test_bit(drive, &fake_change) || drive_no_geom(drive))) { From e502fb8f8801c9561c57397e7fd917187762324e Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Thu, 14 Jan 2016 14:41:32 -0800 Subject: [PATCH 04/25] deadline: remove unused struct member commit 63de428b139d3d31d86ebe25ae97b33f6540fb7e ("deadline-iosched: allow non-sequential batching") removed last use of last_sector. Signed-off-by: Tahsin Erdogan Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/deadline-iosched.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index a753df2b3fc29..d0dd7882d8c7f 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -39,7 +39,6 @@ struct deadline_data { */ struct request *next_rq[2]; unsigned int batching; /* number of sequential requests made */ - sector_t last_sector; /* head position */ unsigned int starved; /* times reads have starved writes */ /* @@ -210,8 +209,6 @@ deadline_move_request(struct deadline_data *dd, struct request *rq) dd->next_rq[WRITE] = NULL; dd->next_rq[data_dir] = deadline_latter_request(rq); - dd->last_sector = rq_end_sector(rq); - /* * take it off the sort and fifo list, move * to dispatch queue From 16c6d048d7b74249a4387700887e8adb13028866 Mon Sep 17 00:00:00 2001 From: Wenwei Tao Date: Thu, 4 Feb 2016 15:13:23 +0100 Subject: [PATCH 05/25] lightnvm: put bio before return MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bio is not returned if the data page cannot be allocated. Signed-off-by: Wenwei Tao Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/rrpc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index d8c75958ced34..307db1ea22def 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -300,8 +300,10 @@ static int rrpc_move_valid_pages(struct rrpc *rrpc, struct rrpc_block *rblk) } page = mempool_alloc(rrpc->page_pool, GFP_NOIO); - if (!page) + if (!page) { + bio_put(bio); return -ENOMEM; + } while ((slot = find_first_zero_bit(rblk->invalid_pages, nr_pgs_per_blk)) < nr_pgs_per_blk) { From bba7f40a029c1e2966146e3a021b3deaf5639904 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Thu, 4 Feb 2016 15:13:24 +0100 Subject: [PATCH 06/25] lightnvm: warn if irqs are disabled in lock laddr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a warning if irqs are disabled when locking a new address in rrpc. The typical path to a new request does not disable irqs, but this is not guaranteed in the future. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/rrpc.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h index ef13ac7700c80..b0277cbf93d64 100644 --- a/drivers/lightnvm/rrpc.h +++ b/drivers/lightnvm/rrpc.h @@ -184,6 +184,8 @@ static int __rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr, sector_t laddr_end = laddr + pages - 1; struct rrpc_inflight_rq *rtmp; + WARN_ON(irqs_disabled()); + spin_lock_irq(&rrpc->inflights.lock); list_for_each_entry(rtmp, &rrpc->inflights.reqs, list) { if (unlikely(request_intersects(rtmp, laddr, laddr_end))) { From 3704e098cc1a4c2cabcf4e1cfbbff38b4bfb1ea7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Thu, 4 Feb 2016 15:13:25 +0100 Subject: [PATCH 07/25] lightnvm: fix request intersection locking in rrpc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes an error on the calculation of intersecting logical addresses; it contemplates the case where a new request including several addresses intersects with a single locked address. This case is typical when multiple pages are sent in a new request, while GC - which at the moment sends one address at the time - is running. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/rrpc.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h index b0277cbf93d64..f7b37336353fd 100644 --- a/drivers/lightnvm/rrpc.h +++ b/drivers/lightnvm/rrpc.h @@ -174,8 +174,7 @@ static inline sector_t rrpc_get_sector(sector_t laddr) static inline int request_intersects(struct rrpc_inflight_rq *r, sector_t laddr_start, sector_t laddr_end) { - return (laddr_end >= r->l_start && laddr_end <= r->l_end) && - (laddr_start >= r->l_start && laddr_start <= r->l_end); + return (laddr_end >= r->l_start) && (laddr_start <= r->l_end); } static int __rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr, From 6dde1d6c9094a7c20a680aa2196ad6d032ec7ded Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Thu, 4 Feb 2016 15:13:26 +0100 Subject: [PATCH 08/25] lightnvm: check overflow and correct mlc pairs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The specification currently limits the number of MLC pairs to 886. Make sure that a device is unable to be instantiate if more is configured. Also, previously the patch had the wrong math for copying MLC pairs, as it only copied half of the actual entries. Fixes: ca5927e7ab53 "lightnvm: introduce mlc lower page table mappings" Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/nvme/host/lightnvm.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 5cd3725e2fa44..6bb15e4926dc8 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -146,9 +146,10 @@ struct nvme_nvm_command { }; }; +#define NVME_NVM_LP_MLC_PAIRS 886 struct nvme_nvm_lp_mlc { __u16 num_pairs; - __u8 pairs[886]; + __u8 pairs[NVME_NVM_LP_MLC_PAIRS]; }; struct nvme_nvm_lp_tbl { @@ -282,9 +283,14 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) memcpy(dst->lptbl.id, src->lptbl.id, 8); dst->lptbl.mlc.num_pairs = le16_to_cpu(src->lptbl.mlc.num_pairs); - /* 4 bits per pair */ + + if (dst->lptbl.mlc.num_pairs > NVME_NVM_LP_MLC_PAIRS) { + pr_err("nvm: number of MLC pairs not supported\n"); + return -EINVAL; + } + memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs, - dst->lptbl.mlc.num_pairs >> 1); + dst->lptbl.mlc.num_pairs); } } From bf64318564c43385ffc3d3dfedab5287bdf3dfdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Thu, 4 Feb 2016 15:13:27 +0100 Subject: [PATCH 09/25] lightnvm: allow to force mm initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit System block allows the device to initialize with its configured media manager. The system blocks is written to disk, and read again when media manager is determined. For this to work, the backend must store the data. Device drivers, such as null_blk, does not have any backend storage. This patch allows the media manager to be initialized without a storage backend. It also fix incorrect configuration of capabilities in null_blk, as it does not support get/set bad block interface. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 2 +- drivers/lightnvm/core.c | 25 ++++++++++++++++--------- include/linux/lightnvm.h | 4 ++++ 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 8ba1e97d573c3..ae05d31c0559c 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -478,7 +478,7 @@ static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id) id->ver_id = 0x1; id->vmnt = 0; id->cgrps = 1; - id->cap = 0x3; + id->cap = 0x2; id->dom = 0x1; id->ppaf.blk_offset = 0; diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 33224cb91c5bb..9f6acd5d1d2e9 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -572,11 +572,13 @@ int nvm_register(struct request_queue *q, char *disk_name, } } - ret = nvm_get_sysblock(dev, &dev->sb); - if (!ret) - pr_err("nvm: device not initialized.\n"); - else if (ret < 0) - pr_err("nvm: err (%d) on device initialization\n", ret); + if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT) { + ret = nvm_get_sysblock(dev, &dev->sb); + if (!ret) + pr_err("nvm: device not initialized.\n"); + else if (ret < 0) + pr_err("nvm: err (%d) on device initialization\n", ret); + } /* register device with a supported media manager */ down_write(&nvm_lock); @@ -1055,9 +1057,11 @@ static long __nvm_ioctl_dev_init(struct nvm_ioctl_dev_init *init) strncpy(info.mmtype, init->mmtype, NVM_MMTYPE_LEN); info.fs_ppa.ppa = -1; - ret = nvm_init_sysblock(dev, &info); - if (ret) - return ret; + if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT) { + ret = nvm_init_sysblock(dev, &info); + if (ret) + return ret; + } memcpy(&dev->sb, &info, sizeof(struct nvm_sb_info)); @@ -1117,7 +1121,10 @@ static long nvm_ioctl_dev_factory(struct file *file, void __user *arg) dev->mt = NULL; } - return nvm_dev_factory(dev, fact.flags); + if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT) + return nvm_dev_factory(dev, fact.flags); + + return 0; } static long nvm_ctl_ioctl(struct file *file, uint cmd, unsigned long arg) diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index d6750111e48ec..2190419bdf0a0 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -135,6 +135,10 @@ enum { /* Memory types */ NVM_ID_FMTYPE_SLC = 0, NVM_ID_FMTYPE_MLC = 1, + + /* Device capabilities */ + NVM_ID_DCAP_BBLKMGMT = 0x1, + NVM_UD_DCAP_ECC = 0x2, }; struct nvm_id_lp_mlc { From 09954bad448791ef01202351d437abdd9497a804 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Sat, 6 Feb 2016 23:00:22 +0100 Subject: [PATCH 10/25] floppy: refactor open() flags handling In case /dev/fdX is open with O_NDELAY / O_NONBLOCK, floppy_open() immediately succeeds, without performing any further media / controller preparations. That's "correct" wrt. the NODELAY flag, but is hardly correct wrt. the rest of the floppy driver, that is not really O_NONBLOCK ready, at all. Therefore it's not too surprising, that subsequent attempts to work with the filedescriptor produce bad results. Namely, syzkaller tool has been able to livelock mmap() on the returned fd to keep waiting on the page unlock bit forever. Quite frankly, I have trouble defining what non-blocking behavior would be for floppies. Is waiting ages for the driver to actually succeed reading a sector blocking operation? Is waiting for drive motor to start blocking operation? How about in case of virtualized floppies? One option would be returning EWOULDBLOCK in case O_NDLEAY / O_NONBLOCK is being passed to open(). That has a theoretical potential of breaking some arcane and archaic userspace though. Let's take a more conservative aproach, and accept the O_NDLEAY flag, and let the driver behave as usual. While at it, clean up a bit handling of !(mode & (FMODE_READ|FMODE_WRITE)) case and return EINVAL instead of succeeding as well. Spotted by syzkaller tool. Reported-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Signed-off-by: Jiri Kosina --- drivers/block/floppy.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index b206115d761cb..84708a5f8c520 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3663,6 +3663,11 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) opened_bdev[drive] = bdev; + if (!(mode & (FMODE_READ|FMODE_WRITE))) { + res = -EINVAL; + goto out; + } + res = -ENXIO; if (!floppy_track_buffer) { @@ -3706,21 +3711,20 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) if (UFDCS->rawcmd == 1) UFDCS->rawcmd = 2; - if (!(mode & FMODE_NDELAY)) { - if (mode & (FMODE_READ|FMODE_WRITE)) { - UDRS->last_checked = 0; - clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags); - check_disk_change(bdev); - if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags)) - goto out; - if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags)) - goto out; - } - res = -EROFS; - if ((mode & FMODE_WRITE) && - !test_bit(FD_DISK_WRITABLE_BIT, &UDRS->flags)) - goto out; - } + UDRS->last_checked = 0; + clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags); + check_disk_change(bdev); + if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags)) + goto out; + if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags)) + goto out; + + res = -EROFS; + + if ((mode & FMODE_WRITE) && + !test_bit(FD_DISK_WRITABLE_BIT, &UDRS->flags)) + goto out; + mutex_unlock(&open_lock); mutex_unlock(&floppy_mutex); return 0; From d57d611505d911c6f9f81cd9bd6dbd293d66dd9f Mon Sep 17 00:00:00 2001 From: Stephane Gasparini Date: Tue, 9 Feb 2016 17:07:38 +0100 Subject: [PATCH 11/25] kernel/fs: fix I/O wait not accounted for RW O_DSYNC When a process is doing Random Write with O_DSYNC flag the I/O wait are not accounted in the kernel (get_cpu_iowait_time_us). This is preventing the governor or the cpufreq driver to account for I/O wait and thus use the right pstate Signed-off-by: Stephane Gasparini Signed-off-by: Philippe Longepe Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index dbabd48b19347..f53a691b65331 100644 --- a/block/bio.c +++ b/block/bio.c @@ -874,7 +874,7 @@ int submit_bio_wait(int rw, struct bio *bio) bio->bi_private = &ret; bio->bi_end_io = submit_bio_wait_endio; submit_bio(rw, bio); - wait_for_completion(&ret.event); + wait_for_completion_io(&ret.event); return ret.error; } From 21d147880e4895e645f890904784b079f1ba76f4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 9 Feb 2016 10:21:22 -0700 Subject: [PATCH 12/25] nvme: fix Kconfig description for BLK_DEV_NVME_SCSI Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/Kconfig | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 5d6237391dcd4..b586d84f25188 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -17,5 +17,6 @@ config BLK_DEV_NVME_SCSI and block devices nodes, as well a a translation for a small number of selected SCSI commands to NVMe commands to the NVMe driver. If you don't know what this means you probably want - to say N here, and if you know what it means you probably - want to say N as well. + to say N here, unless you run a distro that abuses the SCSI + emulation to provide stable device names for mount by id, like + some OpenSuSE and SLES versions. From 39a169b62b415390398291080dafe63aec751e0a Mon Sep 17 00:00:00 2001 From: Roman Pen Date: Tue, 9 Feb 2016 12:33:35 -0700 Subject: [PATCH 13/25] block: fix module reference leak on put_disk() call for cgroups throttle get_disk(),get_gendisk() calls have non explicit side effect: they increase the reference on the disk owner module. The following is the correct sequence how to get a disk reference and to put it: disk = get_gendisk(...); /* use disk */ owner = disk->fops->owner; put_disk(disk); module_put(owner); fs/block_dev.c is aware of this required module_put() call, but f.e. blkg_conf_finish(), which is located in block/blk-cgroup.c, does not put a module reference. To see a leakage in action cgroups throttle config can be used. In the following script I'm removing throttle for /dev/ram0 (actually this is NOP, because throttle was never set for this device): # lsmod | grep brd brd 5175 0 # i=100; while [ $i -gt 0 ]; do echo "1:0 0" > \ /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device; i=$(($i - 1)); \ done # lsmod | grep brd brd 5175 100 Now brd module has 100 references. The issue is fixed by calling module_put() just right away put_disk(). Signed-off-by: Roman Pen Cc: Gi-Oh Kim Cc: Tejun Heo Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 5a37188b559fb..66e6f1aae02ee 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -788,6 +788,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, { struct gendisk *disk; struct blkcg_gq *blkg; + struct module *owner; unsigned int major, minor; int key_len, part, ret; char *body; @@ -804,7 +805,9 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, if (!disk) return -ENODEV; if (part) { + owner = disk->fops->owner; put_disk(disk); + module_put(owner); return -ENODEV; } @@ -820,7 +823,9 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, ret = PTR_ERR(blkg); rcu_read_unlock(); spin_unlock_irq(disk->queue->queue_lock); + owner = disk->fops->owner; put_disk(disk); + module_put(owner); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). It isn't strictly necessary but queue @@ -851,9 +856,13 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep); void blkg_conf_finish(struct blkg_conf_ctx *ctx) __releases(ctx->disk->queue->queue_lock) __releases(rcu) { + struct module *owner; + spin_unlock_irq(ctx->disk->queue->queue_lock); rcu_read_unlock(); + owner = ctx->disk->fops->owner; put_disk(ctx->disk); + module_put(owner); } EXPORT_SYMBOL_GPL(blkg_conf_finish); From a514379b0c77085074abf01c525a850f6d99926e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Thu, 11 Feb 2016 14:49:13 +0100 Subject: [PATCH 14/25] null_blk: oops when initializing without lightnvm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the LightNVM subsystem is not compiled into the kernel, and the null_blk device driver requests lightnvm to be initialized. The call to nvm_register fails and the null_add_dev function cleans up the initialization. However, at this point the null block device has already been added to the nullb_list and thus a second cleanup will occur when the function has returned, that leads to a double call to blk_cleanup_queue. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index ae05d31c0559c..64a7b5971b570 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -707,9 +707,7 @@ static int null_add_dev(void) queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q); queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, nullb->q); - mutex_lock(&lock); - list_add_tail(&nullb->list, &nullb_list); nullb->index = nullb_indexes++; mutex_unlock(&lock); @@ -743,6 +741,10 @@ static int null_add_dev(void) strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); add_disk(disk); + + mutex_lock(&lock); + list_add_tail(&nullb->list, &nullb_list); + mutex_unlock(&lock); done: return 0; From 5f009d3f8e6685fe8c6215082c1696a08b411220 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 10 Feb 2016 16:52:47 -0700 Subject: [PATCH 15/25] block: Initialize max_dev_sectors to 0 The new queue limit is not used by the majority of block drivers, and should be initialized to 0 for the driver's requested settings to be used. Signed-off-by: Keith Busch Acked-by: Martin K. Petersen Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-settings.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index dd49735839789..c7bb666aafd10 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -91,8 +91,8 @@ void blk_set_default_limits(struct queue_limits *lim) lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; lim->virt_boundary_mask = 0; lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; - lim->max_sectors = lim->max_dev_sectors = lim->max_hw_sectors = - BLK_SAFE_MAX_SECTORS; + lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; + lim->max_dev_sectors = 0; lim->chunk_sectors = 0; lim->max_write_same_sectors = 0; lim->max_discard_sectors = 0; From a59e0f5795fe52dad42a99c00287e3766153b312 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 11 Feb 2016 13:05:38 -0700 Subject: [PATCH 16/25] blk-mq: End unstarted requests on dying queue Go directly to ending a request if it wasn't started. Previously, completing a request may invoke a driver callback for a request it didn't initialize. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Acked-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 4c0622fae4138..56c0a726b6193 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -599,8 +599,10 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, * If a request wasn't started before the queue was * marked dying, kill it here or it'll go unnoticed. */ - if (unlikely(blk_queue_dying(rq->q))) - blk_mq_complete_request(rq, -EIO); + if (unlikely(blk_queue_dying(rq->q))) { + rq->errors = -EIO; + blk_mq_end_request(rq, rq->errors); + } return; } From 4f76d0e49835d3da33aa54811157421f7061805e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 11 Feb 2016 13:05:39 -0700 Subject: [PATCH 17/25] NVMe: Fix io incapable return values The function returns true when the controller can't handle IO. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Acked-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/nvme.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 4fb5bb737868c..9664d07d807d8 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -139,9 +139,9 @@ static inline bool nvme_io_incapable(struct nvme_ctrl *ctrl) u32 val = 0; if (ctrl->ops->io_incapable(ctrl)) - return false; + return true; if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val)) - return false; + return true; return val & NVME_CSTS_CFS; } From ef2d4615c59efb312e531a5e949970f37ca1c841 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 11 Feb 2016 13:05:40 -0700 Subject: [PATCH 18/25] NVMe: Allow request merges It is generally more efficient to submit larger IO. Signed-off-by: Keith Busch Reviewed-by: Johannes Thumshirn Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c5bf001af5595..3cd921e6121ec 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1121,7 +1121,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) ns->queue = blk_mq_init_queue(ctrl->tagset); if (IS_ERR(ns->queue)) goto out_free_ns; - queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); ns->queue->queuedata = ns; ns->ctrl = ctrl; From ae1fba20015bca7401db2422fe18c9c049184163 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 11 Feb 2016 13:05:42 -0700 Subject: [PATCH 19/25] NVMe: Requeue requests on suspended queues It's possible a request may get to the driver after the nvme queue was disabled. This has the request requeue if that happens. Note the request is still "started" by the driver, but requeuing will clear the start state for timeout handling. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 72ef8322d32ac..e5c2bea01dbf5 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -678,6 +678,11 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(req); spin_lock_irq(&nvmeq->q_lock); + if (unlikely(nvmeq->cq_vector < 0)) { + ret = BLK_MQ_RQ_QUEUE_BUSY; + spin_unlock_irq(&nvmeq->q_lock); + goto out; + } __nvme_submit_cmd(nvmeq, &cmnd); nvme_process_cq(nvmeq); spin_unlock_irq(&nvmeq->q_lock); From ff23a2a15a2117245b4599c1352343c8b8fb4c43 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 11 Feb 2016 13:05:43 -0700 Subject: [PATCH 20/25] NVMe: Poll device while still active during remove A device failure or link down wouldn't have been detected during namespace removal. This patch keeps the device in the list for polling so that the thread may see such failure and initiate a reset. The device is removed from the list after disable, so we can safely flush the reset work as it can't be requeued when disable completes. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e5c2bea01dbf5..09cc4dafa32a9 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2116,16 +2116,12 @@ static void nvme_remove(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); - spin_lock(&dev_list_lock); - list_del_init(&dev->node); - spin_unlock(&dev_list_lock); - pci_set_drvdata(pdev, NULL); - flush_work(&dev->reset_work); flush_work(&dev->scan_work); nvme_remove_namespaces(&dev->ctrl); nvme_uninit_ctrl(&dev->ctrl); nvme_dev_disable(dev, true); + flush_work(&dev->reset_work); nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); nvme_release_cmb(dev); From f8e68a7c9af5f8047f7f8295874bedf306063709 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 11 Feb 2016 13:05:47 -0700 Subject: [PATCH 21/25] NVMe: Rate limit nvme IO warnings We don't need to spam the kernel logs with thousands of IO cancelling messages. We can infer all IO's are being cancelled with fewer, or even none at all. This patch rate limits the message and uses the debug log level as it is mainly used for testing purposes. Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 09cc4dafa32a9..a128672472ecf 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1004,7 +1004,7 @@ static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved if (!blk_mq_request_started(req)) return; - dev_warn(nvmeq->q_dmadev, + dev_dbg_ratelimited(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", req->tag, nvmeq->qid); status = NVME_SC_ABORT_REQ; From 2d99b55d378c996b9692a0c93dd25f4ed5d58934 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 12 Feb 2016 09:39:15 +0100 Subject: [PATCH 22/25] bio: return EINTR if copying to user space got interrupted Commit 35dc248383bbab0a7203fca4d722875bc81ef091 introduced a check for current->mm to see if we have a user space context and only copies data if we do. Now if an IO gets interrupted by a signal data isn't copied into user space any more (as we don't have a user space context) but user space isn't notified about it. This patch modifies the behaviour to return -EINTR from bio_uncopy_user() to notify userland that a signal has interrupted the syscall, otherwise it could lead to a situation where the caller may get a buffer with no data returned. This can be reproduced by issuing SG_IO ioctl()s in one thread while constantly sending signals to it. Fixes: 35dc248 [SCSI] sg: Fix user memory corruption when SG_IO is interrupted by a signal Signed-off-by: Johannes Thumshirn Signed-off-by: Hannes Reinecke Cc: stable@vger.kernel.org # v.3.11+ Signed-off-by: Jens Axboe --- block/bio.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/block/bio.c b/block/bio.c index f53a691b65331..cf7591551b171 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1090,9 +1090,12 @@ int bio_uncopy_user(struct bio *bio) if (!bio_flagged(bio, BIO_NULL_MAPPED)) { /* * if we're in a workqueue, the request is orphaned, so - * don't copy into a random user address space, just free. + * don't copy into a random user address space, just free + * and return -EINTR so user space doesn't expect any data. */ - if (current->mm && bio_data_dir(bio) == READ) + if (!current->mm) + ret = -EINTR; + else if (bio_data_dir(bio) == READ) ret = bio_copy_to_iter(bio, bmd->iter); if (bmd->is_our_pages) bio_free_pages(bio); From 5ff8eaac1636bf6deae86491f4818c4c69d1a9ac Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 16 Feb 2016 11:34:07 -0700 Subject: [PATCH 23/25] writeback: keep superblock pinned during cgroup writeback association switches If cgroup writeback is in use, an inode is associated with a cgroup for writeback. If the inode's main dirtier changes to another cgroup, the association gets updated asynchronously. Nothing was pinning the superblock while such switches are in progress and superblock could go away while async switching is pending or in progress leading to crashes like the following. kernel BUG at fs/jbd2/transaction.c:319! invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC CPU: 1 PID: 29158 Comm: kworker/1:10 Not tainted 4.5.0-rc3 #51 Hardware name: Google Google, BIOS Google 01/01/2011 Workqueue: events inode_switch_wbs_work_fn task: ffff880213dbbd40 ti: ffff880209264000 task.ti: ffff880209264000 RIP: 0010:[] [] start_this_handle+0x382/0x3e0 RSP: 0018:ffff880209267c30 EFLAGS: 00010202 ... Call Trace: [] jbd2__journal_start+0xf4/0x190 [] __ext4_journal_start_sb+0x4e/0x70 [] ext4_evict_inode+0x12c/0x3d0 [] evict+0xbb/0x190 [] iput+0x130/0x190 [] inode_switch_wbs_work_fn+0x343/0x4c0 [] process_one_work+0x129/0x300 [] worker_thread+0x126/0x480 [] kthread+0xc4/0xe0 [] ret_from_fork+0x3f/0x70 Fix it by bumping s_active while cgroup association switching is in flight. Signed-off-by: Tejun Heo Reported-and-tested-by: Tahsin Erdogan Link: http://lkml.kernel.org/g/CAAeU0aNCq7LGODvVGRU-oU_o-6enii5ey0p1c26D1ZzYwkDc5A@mail.gmail.com Fixes: d10c80955265 ("writeback: implement foreign cgroup inode bdi_writeback switching") Cc: stable@vger.kernel.org #v4.5+ Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6915c950e6e8a..1f76d8950a57f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -317,6 +317,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) struct inode_switch_wbs_context *isw = container_of(work, struct inode_switch_wbs_context, work); struct inode *inode = isw->inode; + struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; struct bdi_writeback *old_wb = inode->i_wb; struct bdi_writeback *new_wb = isw->new_wb; @@ -423,6 +424,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) wb_put(new_wb); iput(inode); + deactivate_super(sb); kfree(isw); } @@ -469,11 +471,14 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) /* while holding I_WB_SWITCH, no one else can update the association */ spin_lock(&inode->i_lock); + if (inode->i_state & (I_WB_SWITCH | I_FREEING) || - inode_to_wb(inode) == isw->new_wb) { - spin_unlock(&inode->i_lock); - goto out_free; - } + inode_to_wb(inode) == isw->new_wb) + goto out_unlock; + + if (!atomic_inc_not_zero(&inode->i_sb->s_active)) + goto out_unlock; + inode->i_state |= I_WB_SWITCH; spin_unlock(&inode->i_lock); @@ -489,6 +494,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); return; +out_unlock: + spin_unlock(&inode->i_lock); out_free: if (isw->new_wb) wb_put(isw->new_wb); From 3d65ae4634ed8350aee98a4e6f4e41fe40c7d282 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Tue, 16 Feb 2016 13:34:39 -0800 Subject: [PATCH 24/25] writeback: initialize inode members that track writeback history inode struct members that track cgroup writeback information should be reinitialized when inode gets allocated from kmem_cache. Otherwise, their values remain and get used by the new inode. Signed-off-by: Tahsin Erdogan Acked-by: Tejun Heo Fixes: d10c80955265 ("writeback: implement foreign cgroup inode bdi_writeback switching") Signed-off-by: Jens Axboe --- fs/inode.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/inode.c b/fs/inode.c index 9f62db3bcc3ef..69b8b526c1946 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -154,6 +154,12 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_rdev = 0; inode->dirtied_when = 0; +#ifdef CONFIG_CGROUP_WRITEBACK + inode->i_wb_frn_winner = 0; + inode->i_wb_frn_avg_time = 0; + inode->i_wb_frn_history = 0; +#endif + if (security_inode_alloc(inode)) goto out; spin_lock_init(&inode->i_lock); From 18f922d037211a15543af935861bf92161e697e9 Mon Sep 17 00:00:00 2001 From: Alan Date: Wed, 17 Feb 2016 14:15:30 +0000 Subject: [PATCH 25/25] blk: fix overflow in queue_discard_max_hw_show We get this right for queue_discard_max_show but not max_hw_show. Follow the same pattern as queue_discard_max_show instead so that we don't truncate. Signed-off-by: Alan Cox Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e140cc487ce11..dd93763057ce0 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -147,10 +147,9 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag static ssize_t queue_discard_max_hw_show(struct request_queue *q, char *page) { - unsigned long long val; - val = q->limits.max_hw_discard_sectors << 9; - return sprintf(page, "%llu\n", val); + return sprintf(page, "%llu\n", + (unsigned long long)q->limits.max_hw_discard_sectors << 9); } static ssize_t queue_discard_max_show(struct request_queue *q, char *page)