Skip to content

Commit

Permalink
Merge tag 'for-5.14/dm-changes' of git://git.kernel.org/pub/scm/linux…
Browse files Browse the repository at this point in the history
…/kernel/git/device-mapper/linux-dm

Pull device mapper updates from Mike Snitzer:

 - Various DM persistent-data library improvements and fixes that
   benefit both the DM thinp and cache targets.

 - A few small DM kcopyd efficiency improvements.

 - Significant zoned related block core, DM core and DM zoned target
   changes that culminate with adding zoned append emulation (which is
   required to properly fix DM crypt's zoned support).

 - Various DM writecache target changes that improve efficiency. Adds an
   optional "metadata_only" feature that only promotes bios flagged with
   REQ_META. But the most significant improvement is writecache's
   ability to pause writeback, for a configurable time, if/when the
   working set is larger than the cache (and the cache is full) -- this
   ensures performance is no worse than the slower origin device.

* tag 'for-5.14/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (35 commits)
  dm writecache: make writeback pause configurable
  dm writecache: pause writeback if cache full and origin being written directly
  dm io tracker: factor out IO tracker
  dm btree remove: assign new_root only when removal succeeds
  dm zone: fix dm_revalidate_zones() memory allocation
  dm ps io affinity: remove redundant continue statement
  dm writecache: add optional "metadata_only" parameter
  dm writecache: add "cleaner" and "max_age" to Documentation
  dm writecache: write at least 4k when committing
  dm writecache: flush origin device when writing and cache is full
  dm writecache: have ssd writeback wait if the kcopyd workqueue is busy
  dm writecache: use list_move instead of list_del/list_add in writecache_writeback()
  dm writecache: commit just one block, not a full page
  dm writecache: remove unused gfp_t argument from wc_add_block()
  dm crypt: Fix zoned block device support
  dm: introduce zone append emulation
  dm: rearrange core declarations for extended use from dm-zone.c
  block: introduce BIO_ZONE_WRITE_LOCKED bio flag
  block: introduce bio zone helpers
  block: improve handling of all zones reset operation
  ...
  • Loading branch information
Linus Torvalds committed Jul 1, 2021
2 parents dbe69e4 + 5c0de3d commit 2cfa582
Show file tree
Hide file tree
Showing 38 changed files with 2,548 additions and 622 deletions.
25 changes: 23 additions & 2 deletions Documentation/admin-guide/device-mapper/writecache.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ first sector should contain valid superblock from previous invocation.
Constructor parameters:

1. type of the cache device - "p" or "s"

- p - persistent memory
- s - SSD
2. the underlying device that will be cached
Expand All @@ -21,7 +20,6 @@ Constructor parameters:
size)
5. the number of optional parameters (the parameters with an argument
count as two)

start_sector n (default: 0)
offset from the start of cache device in 512-byte sectors
high_watermark n (default: 50)
Expand Down Expand Up @@ -53,6 +51,27 @@ Constructor parameters:

- some underlying devices perform better with fua, some
with nofua. The user should test it
cleaner
when this option is activated (either in the constructor
arguments or by a message), the cache will not promote
new writes (however, writes to already cached blocks are
promoted, to avoid data corruption due to misordered
writes) and it will gradually writeback any cached
data. The userspace can then monitor the cleaning
process with "dmsetup status". When the number of cached
blocks drops to zero, userspace can unload the
dm-writecache target and replace it with dm-linear or
other targets.
max_age n
specifies the maximum age of a block in milliseconds. If
a block is stored in the cache for too long, it will be
written to the underlying device and cleaned up.
metadata_only
only metadata is promoted to the cache. This option
improves performance for heavier REQ_META workloads.
pause_writeback n (default: 3000)
pause writeback if there was some write I/O redirected to
the origin volume in the last n milliseconds

Status:
1. error indicator - 0 if there was no error, otherwise error number
Expand All @@ -77,3 +96,5 @@ Messages:
5. resume the device, so that it will use the linear
target
6. the cache device is now inactive and it can be deleted
cleaner
See above "cleaner" constructor documentation.
119 changes: 92 additions & 27 deletions block/blk-zoned.c
Original file line number Diff line number Diff line change
Expand Up @@ -161,18 +161,89 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);

static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev,
sector_t sector,
sector_t nr_sectors)
static inline unsigned long *blk_alloc_zone_bitmap(int node,
unsigned int nr_zones)
{
if (!blk_queue_zone_resetall(bdev_get_queue(bdev)))
return false;
return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long),
GFP_NOIO, node);
}

static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx,
void *data)
{
/*
* REQ_OP_ZONE_RESET_ALL can be executed only if the number of sectors
* of the applicable zone range is the entire disk.
* For an all-zones reset, ignore conventional, empty, read-only
* and offline zones.
*/
return !sector && nr_sectors == get_capacity(bdev->bd_disk);
switch (zone->cond) {
case BLK_ZONE_COND_NOT_WP:
case BLK_ZONE_COND_EMPTY:
case BLK_ZONE_COND_READONLY:
case BLK_ZONE_COND_OFFLINE:
return 0;
default:
set_bit(idx, (unsigned long *)data);
return 0;
}
}

/*
 * Reset every zone that needs it by issuing one REQ_OP_ZONE_RESET bio per
 * zone, instead of a single REQ_OP_ZONE_RESET_ALL.
 *
 * A bitmap with one bit per zone is allocated and filled by running
 * blk_zone_need_reset_cb() over the device's zone report. The device is
 * then walked zone by zone and a reset bio is prepared for each zone whose
 * bit is set.
 *
 * Returns 0 on success, -ENOMEM if the bitmap cannot be allocated, the
 * negative value returned by ->report_zones() if the report fails, or the
 * result of submit_bio_wait() on the last bio.
 */
static int blkdev_zone_reset_all_emulated(struct block_device *bdev,
					  gfp_t gfp_mask)
{
	struct request_queue *q = bdev_get_queue(bdev);
	sector_t capacity = get_capacity(bdev->bd_disk);
	sector_t zone_sectors = blk_queue_zone_sectors(q);
	unsigned long *reset_map;
	struct bio *last = NULL;
	sector_t pos;
	int ret;

	reset_map = blk_alloc_zone_bitmap(q->node, q->nr_zones);
	if (!reset_map)
		return -ENOMEM;

	ret = bdev->bd_disk->fops->report_zones(bdev->bd_disk, 0,
						q->nr_zones,
						blk_zone_need_reset_cb,
						reset_map);
	if (ret < 0)
		goto out_free_reset_map;

	/* A non-negative report result is not an error; start from success. */
	ret = 0;

	for (pos = 0; pos < capacity; pos += zone_sectors) {
		/* Zones whose bit is clear are left untouched. */
		if (!test_bit(blk_queue_zone_no(q, pos), reset_map))
			continue;

		/* Get the next bio, handing over the previous one (if any). */
		last = blk_next_bio(last, 0, gfp_mask);
		bio_set_dev(last, bdev);
		last->bi_opf = REQ_OP_ZONE_RESET | REQ_SYNC;
		last->bi_iter.bi_sector = pos;

		/* This may take a while, so be nice to others */
		cond_resched();
	}

	/* Nothing to wait for when no zone needed a reset. */
	if (last) {
		ret = submit_bio_wait(last);
		bio_put(last);
	}

out_free_reset_map:
	kfree(reset_map);
	return ret;
}

/*
 * Reset all zones of @bdev with a single REQ_OP_ZONE_RESET_ALL request.
 *
 * An on-stack bio with no data vectors is initialised, pointed at @bdev and
 * submitted synchronously. @gfp_mask is accepted but not used here, since
 * nothing is allocated.
 *
 * Returns the result of submit_bio_wait().
 */
static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask)
{
	struct bio reset_all_bio;
	int err;

	bio_init(&reset_all_bio, NULL, 0);
	bio_set_dev(&reset_all_bio, bdev);
	reset_all_bio.bi_opf = REQ_OP_ZONE_RESET_ALL | REQ_SYNC;

	err = submit_bio_wait(&reset_all_bio);
	return err;
}

/**
Expand Down Expand Up @@ -200,7 +271,7 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
sector_t capacity = get_capacity(bdev->bd_disk);
sector_t end_sector = sector + nr_sectors;
struct bio *bio = NULL;
int ret;
int ret = 0;

if (!blk_queue_is_zoned(q))
return -EOPNOTSUPP;
Expand All @@ -222,20 +293,21 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
if ((nr_sectors & (zone_sectors - 1)) && end_sector != capacity)
return -EINVAL;

/*
* In the case of a zone reset operation over all zones,
* REQ_OP_ZONE_RESET_ALL can be used with devices supporting this
* command. For other devices, we emulate this command behavior by
* identifying the zones needing a reset.
*/
if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) {
if (!blk_queue_zone_resetall(q))
return blkdev_zone_reset_all_emulated(bdev, gfp_mask);
return blkdev_zone_reset_all(bdev, gfp_mask);
}

while (sector < end_sector) {
bio = blk_next_bio(bio, 0, gfp_mask);
bio_set_dev(bio, bdev);

/*
* Special case for the zone reset operation that reset all
* zones, this is useful for applications like mkfs.
*/
if (op == REQ_OP_ZONE_RESET &&
blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) {
bio->bi_opf = REQ_OP_ZONE_RESET_ALL | REQ_SYNC;
break;
}

bio->bi_opf = op | REQ_SYNC;
bio->bi_iter.bi_sector = sector;
sector += zone_sectors;
Expand Down Expand Up @@ -396,13 +468,6 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
return ret;
}

static inline unsigned long *blk_alloc_zone_bitmap(int node,
unsigned int nr_zones)
{
return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long),
GFP_NOIO, node);
}

void blk_queue_free_zone_bitmaps(struct request_queue *q)
{
kfree(q->conv_zones_bitmap);
Expand Down
4 changes: 4 additions & 0 deletions drivers/md/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,10 @@ ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
endif

ifeq ($(CONFIG_BLK_DEV_ZONED),y)
dm-mod-objs += dm-zone.o
endif

ifeq ($(CONFIG_DM_VERITY_FEC),y)
dm-verity-objs += dm-verity-fec.o
endif
Expand Down
82 changes: 6 additions & 76 deletions drivers/md/dm-cache-target.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "dm-bio-prison-v2.h"
#include "dm-bio-record.h"
#include "dm-cache-metadata.h"
#include "dm-io-tracker.h"

#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
Expand Down Expand Up @@ -39,77 +40,6 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,

/*----------------------------------------------------------------*/

/*
 * Bookkeeping for the amount of IO currently in flight, used to decide
 * whether the device has been idle for a given period (see iot_idle_for()).
 */
struct io_tracker {
/* Held by the iot_* helpers while reading or updating the fields below. */
spinlock_t lock;

/*
 * Sectors of in-flight IO. Increased by iot_io_begin() and decreased
 * by iot_io_end().
 */
sector_t in_flight;

/*
 * The time, in jiffies, when this device became idle (if it is
 * indeed idle). Recorded whenever in_flight drops back to zero.
 */
unsigned long idle_time;
/* Set to the current jiffies value by iot_init(). */
unsigned long last_update_time;
};

/*
 * Put @iot into its initial state: lock initialised, no IO in flight,
 * idle time zeroed and the last-update timestamp set to the current jiffies.
 */
static void iot_init(struct io_tracker *iot)
{
	spin_lock_init(&iot->lock);

	/* Nothing has been submitted yet. */
	iot->in_flight = 0ul;
	iot->idle_time = 0ul;

	iot->last_update_time = jiffies;
}

/*
 * Lockless core of iot_idle_for().
 *
 * Returns true only when no IO is in flight and the current time is after
 * idle_time + @jifs.
 */
static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
{
	/* Any outstanding IO means the device is busy, whatever the time. */
	return !iot->in_flight && time_after(jiffies, iot->idle_time + jifs);
}

/*
 * Report whether the tracked device has been idle for more than @jifs
 * jiffies. Takes the tracker lock with interrupts disabled around the check.
 */
static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
{
	bool idle;

	spin_lock_irq(&iot->lock);
	idle = __iot_idle_for(iot, jifs);
	spin_unlock_irq(&iot->lock);

	return idle;
}

/*
 * Account for @len sectors of IO being started by adding them to the
 * in-flight total, under the tracker lock with interrupts disabled.
 */
static void iot_io_begin(struct io_tracker *iot, sector_t len)
{
	spin_lock_irq(&iot->lock);
	iot->in_flight = iot->in_flight + len;
	spin_unlock_irq(&iot->lock);
}

/*
 * Lockless core of iot_io_end().
 *
 * Subtracts @len sectors from the in-flight total; when the total reaches
 * zero the current jiffies value is recorded as the start of the idle
 * period. A zero @len is ignored and leaves idle_time untouched.
 */
static void __iot_io_end(struct io_tracker *iot, sector_t len)
{
	if (len) {
		iot->in_flight -= len;

		/* The last outstanding IO just finished: idle starts now. */
		if (iot->in_flight == 0)
			iot->idle_time = jiffies;
	}
}

/*
 * Account for @len sectors of IO having completed.
 *
 * The tracker lock is taken with the irqsave/irqrestore variants, so the
 * caller's interrupt state is preserved across the update.
 */
static void iot_io_end(struct io_tracker *iot, sector_t len)
{
	unsigned long irq_state;

	spin_lock_irqsave(&iot->lock, irq_state);
	__iot_io_end(iot, len);
	spin_unlock_irqrestore(&iot->lock, irq_state);
}

/*----------------------------------------------------------------*/

/*
* Represents a chunk of future work. 'input' allows continuations to pass
* values between themselves, typically error values.
Expand Down Expand Up @@ -470,7 +400,7 @@ struct cache {
struct batcher committer;
struct work_struct commit_ws;

struct io_tracker tracker;
struct dm_io_tracker tracker;

mempool_t migration_pool;

Expand Down Expand Up @@ -866,15 +796,15 @@ static void accounted_begin(struct cache *cache, struct bio *bio)
if (accountable_bio(cache, bio)) {
pb = get_per_bio_data(bio);
pb->len = bio_sectors(bio);
iot_io_begin(&cache->tracker, pb->len);
dm_iot_io_begin(&cache->tracker, pb->len);
}
}

static void accounted_complete(struct cache *cache, struct bio *bio)
{
struct per_bio_data *pb = get_per_bio_data(bio);

iot_io_end(&cache->tracker, pb->len);
dm_iot_io_end(&cache->tracker, pb->len);
}

static void accounted_request(struct cache *cache, struct bio *bio)
Expand Down Expand Up @@ -1642,7 +1572,7 @@ enum busy {

static enum busy spare_migration_bandwidth(struct cache *cache)
{
bool idle = iot_idle_for(&cache->tracker, HZ);
bool idle = dm_iot_idle_for(&cache->tracker, HZ);
sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
cache->sectors_per_block;

Expand Down Expand Up @@ -2603,7 +2533,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)

batcher_init(&cache->committer, commit_op, cache,
issue_op, cache, cache->wq);
iot_init(&cache->tracker);
dm_iot_init(&cache->tracker);

init_rwsem(&cache->background_work_lock);
prevent_background_work(cache);
Expand Down
Loading

0 comments on commit 2cfa582

Please sign in to comment.