From be628be09563f8f6e81929efbd7cf3f45c344416 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 26 Oct 2016 20:31:17 -0700 Subject: [PATCH 1/6] bcache: Make gc wakeup sane, remove set_task_state() Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 4 ++-- drivers/md/bcache/btree.c | 39 +++++++++++++++++++------------------ drivers/md/bcache/btree.h | 3 +-- drivers/md/bcache/request.c | 4 +--- drivers/md/bcache/super.c | 2 ++ 5 files changed, 26 insertions(+), 26 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 6b420a55c7459..c3ea03c9a1a8e 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -425,7 +425,7 @@ struct cache { * until a gc finishes - otherwise we could pointlessly burn a ton of * cpu */ - unsigned invalidate_needs_gc:1; + unsigned invalidate_needs_gc; bool discard; /* Get rid of? */ @@ -593,8 +593,8 @@ struct cache_set { /* Counts how many sectors bio_insert has added to the cache */ atomic_t sectors_to_gc; + wait_queue_head_t gc_wait; - wait_queue_head_t moving_gc_wait; struct keybuf moving_gc_keys; /* Number of moving GC bios in flight */ struct semaphore moving_in_flight; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 6fdd8e252760c..a43eedd5804dd 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1757,32 +1757,34 @@ static void bch_btree_gc(struct cache_set *c) bch_moving_gc(c); } -static int bch_gc_thread(void *arg) +static bool gc_should_run(struct cache_set *c) { - struct cache_set *c = arg; struct cache *ca; unsigned i; - while (1) { -again: - bch_btree_gc(c); + for_each_cache(ca, c, i) + if (ca->invalidate_needs_gc) + return true; - set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) - break; + if (atomic_read(&c->sectors_to_gc) < 0) + return true; - mutex_lock(&c->bucket_lock); + return false; +} - for_each_cache(ca, c, i) - if (ca->invalidate_needs_gc) { - mutex_unlock(&c->bucket_lock); - set_current_state(TASK_RUNNING); - goto again; - } +static int bch_gc_thread(void *arg) +{ + struct cache_set *c = arg; - mutex_unlock(&c->bucket_lock); + while (1) { + wait_event_interruptible(c->gc_wait, + kthread_should_stop() || gc_should_run(c)); - schedule(); + if (kthread_should_stop()) + break; + + set_gc_sectors(c); + bch_btree_gc(c); } return 0; @@ -1790,11 +1792,10 @@ static int bch_gc_thread(void *arg) int bch_gc_thread_start(struct cache_set *c) { - c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc"); + c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc"); if (IS_ERR(c->gc_thread)) return PTR_ERR(c->gc_thread); - set_task_state(c->gc_thread, TASK_INTERRUPTIBLE); return 0; } diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 5c391fa01bedb..9b80417cd547f 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -260,8 +260,7 @@ void bch_initial_mark_key(struct cache_set *, int, struct bkey *); static inline void wake_up_gc(struct cache_set *c) { - if (c->gc_thread) - wake_up_process(c->gc_thread); + wake_up(&c->gc_wait); } #define MAP_DONE 0 diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index f49c5417527dc..76d20875503c1 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -196,10 +196,8 @@ static void bch_data_insert_start(struct closure *cl) struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); struct bio *bio = op->bio, *n; - if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { - set_gc_sectors(op->c); + if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) wake_up_gc(op->c); - } if (op->bypass) return bch_data_invalidate(cl); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 2fb5bfeb43e2e..b33dd3bd104ff 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1489,6 +1489,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) mutex_init(&c->bucket_lock); init_waitqueue_head(&c->btree_cache_wait); init_waitqueue_head(&c->bucket_wait); + init_waitqueue_head(&c->gc_wait); sema_init(&c->uuid_write_mutex, 1); spin_lock_init(&c->btree_gc_time.lock); @@ -1548,6 +1549,7 @@ static void run_cache_set(struct cache_set *c) for_each_cache(ca, c, i) c->nbuckets += ca->sb.nbuckets; + set_gc_sectors(c); if (CACHE_SYNC(&c->sb)) { LIST_HEAD(journal); From b8c0d911ac5285e6be8967713271a51bdc5a936a Mon Sep 17 00:00:00 2001 From: Eric Wheeler Date: Sun, 23 Oct 2016 18:19:20 -0700 Subject: [PATCH 2/6] bcache: partition support: add 16 minors per bcacheN device Signed-off-by: Eric Wheeler Tested-by: Wido den Hollander --- drivers/md/bcache/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index b33dd3bd104ff..3a19cbc8b230e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -58,6 +58,7 @@ static wait_queue_head_t unregister_wait; struct workqueue_struct *bcache_wq; #define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE) +#define BCACHE_MINORS 16 /* partition support */ /* Superblock */ @@ -783,8 +784,10 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, if (minor < 0) return minor; + minor *= BCACHE_MINORS; + if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || - !(d->disk = alloc_disk(1))) { + !(d->disk = alloc_disk(BCACHE_MINORS))) { ida_simple_remove(&bcache_minor, minor); return -ENOMEM; } From e8465447d2f3366069115f7453153561ac9a1220 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Fri, 16 Dec 2016 10:11:56 +0530 Subject: [PATCH 3/6] block: Remove unused member (busy) from struct blk_queue_tag Signed-off-by: Ritesh Harjani Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 286b2a2643833..83695641bd5ec 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -288,7 +288,6 @@ enum blk_queue_state { struct blk_queue_tag { struct request **tag_index; /* map of busy tags */ unsigned long *tag_map; /* bit map of free/busy tags */ - int busy; /* current depth */ int max_depth; /* what we will send to device */ int real_max_depth; /* what the array can hold */ atomic_t refcnt; /* map can be shared */ From 25cdb64510644f3e854d502d69c73f21c6df88a9 Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Thu, 15 Dec 2016 11:48:18 -0600 Subject: [PATCH 4/6] block: allow WRITE_SAME commands with the SG_IO ioctl The WRITE_SAME commands are not present in the blk_default_cmd_filter write_ok list, and thus are failed with -EPERM when the SG_IO ioctl() is executed without CAP_SYS_RAWIO capability (e.g., unprivileged users). [ sg_io() -> blk_fill_sghdr_rq() > blk_verify_command() -> -EPERM ] The problem can be reproduced with the sg_write_same command # sg_write_same --num 1 --xferlen 512 /dev/sda # # capsh --drop=cap_sys_rawio -- -c \ 'sg_write_same --num 1 --xferlen 512 /dev/sda' Write same: pass through os error: Operation not permitted # For comparison, the WRITE_VERIFY command does not observe this problem, since it is in that list: # capsh --drop=cap_sys_rawio -- -c \ 'sg_write_verify --num 1 --ilen 512 --lba 0 /dev/sda' # So, this patch adds the WRITE_SAME commands to the list, in order for the SG_IO ioctl to finish successfully: # capsh --drop=cap_sys_rawio -- -c \ 'sg_write_same --num 1 --xferlen 512 /dev/sda' # That case happens to be exercised by QEMU KVM guests with 'scsi-block' devices (qemu "-device scsi-block" [1], libvirt "" [2]), which employs the SG_IO ioctl() and runs as an unprivileged user (libvirt-qemu). In that scenario, when a filesystem (e.g., ext4) performs its zero-out calls, which are translated to write-same calls in the guest kernel, and then into SG_IO ioctls to the host kernel, SCSI I/O errors may be observed in the guest: [...] sd 0:0:0:0: [sda] tag#0 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE [...] sd 0:0:0:0: [sda] tag#0 Sense Key : Aborted Command [current] [...] sd 0:0:0:0: [sda] tag#0 Add. Sense: I/O process terminated [...] sd 0:0:0:0: [sda] tag#0 CDB: Write Same(10) 41 00 01 04 e0 78 00 00 08 00 [...] blk_update_request: I/O error, dev sda, sector 17096824 Links: [1] http://git.qemu.org/?p=qemu.git;a=commit;h=336a6915bc7089fb20fea4ba99972ad9a97c5f52 [2] https://libvirt.org/formatdomain.html#elementsDisks (see 'disk' -> 'device') Signed-off-by: Mauricio Faria de Oliveira Signed-off-by: Brahadambal Srinivasan Reported-by: Manjunatha H R Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/scsi_ioctl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 0774799942e06..c6fee7437be44 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -182,6 +182,9 @@ static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter) __set_bit(WRITE_16, filter->write_ok); __set_bit(WRITE_LONG, filter->write_ok); __set_bit(WRITE_LONG_2, filter->write_ok); + __set_bit(WRITE_SAME, filter->write_ok); + __set_bit(WRITE_SAME_16, filter->write_ok); + __set_bit(WRITE_SAME_32, filter->write_ok); __set_bit(ERASE, filter->write_ok); __set_bit(GPCMD_MODE_SELECT_10, filter->write_ok); __set_bit(MODE_SELECT, filter->write_ok); From c965809c669da004b660e5923b8add8fac5a7dc8 Mon Sep 17 00:00:00 2001 From: Stephen Bates Date: Fri, 16 Dec 2016 11:54:50 -0700 Subject: [PATCH 5/6] nvme : Use correct scnprintf in cmb show Make sure we are using the correct scnprintf in the sysfs show function for the CMB. Signed-off-by: Stephen Bates Reviewed-by Jon Derrick: Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 2fd7dc2e8fc4b..3d21a154dce79 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -50,7 +50,7 @@ #define NVME_AQ_DEPTH 256 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) - + /* * We handle AEN commands ourselves and don't even let the * block layer know about them. @@ -1349,7 +1349,7 @@ static ssize_t nvme_cmb_show(struct device *dev, { struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); - return snprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz : x%08x\n", + return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz : x%08x\n", ndev->cmbloc, ndev->cmbsz); } static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL); From 633395b67bb222f85bb8f825c7751a54b9ec84ee Mon Sep 17 00:00:00 2001 From: Stefan Haberland Date: Mon, 19 Dec 2016 17:15:50 +0100 Subject: [PATCH 6/6] block: check partition alignment Partitions that are not aligned to the blocksize of a device may cause invalid I/O requests because the blocklayer cares only about alignment within the partition when building requests on partitions. device |--------4096--------|--------4096--------|--------4096--------| partition offset 512byte |-512-|--------4096--------|--------4096--------|--------4096--------| When reading/writing one 4k block of the partition this maps to reading/writing with an offset of 512 byte of the device leading to unaligned requests for the device which in turn may cause unexpected behavior of the device driver. For DASD devices we have to translate the block number into a cylinder, head, record format. The unaligned requests lead to wrong calculation and therefore to misdirected I/O. In a "good" case this leads to I/O errors because the underlying hardware detects the wrong addressing. In a worst case scenario this might destroy data on the device. To prevent partitions that are not aligned to the physical blocksize of a device check for the alignment in the blkpg_ioctl. Signed-off-by: Stefan Haberland Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/ioctl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/ioctl.c b/block/ioctl.c index f856963204f49..656c8c6ed206f 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -45,6 +45,9 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user || pstart < 0 || plength < 0 || partno > 65535) return -EINVAL; } + /* check if partition is aligned to blocksize */ + if (p.start & (bdev_logical_block_size(bdev) - 1)) + return -EINVAL; mutex_lock(&bdev->bd_mutex);