From 0492b05e8843d26d3075940590642c72a6aa781d Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Mon, 26 Oct 2009 22:44:04 +0100 Subject: [PATCH] --- yaml --- r: 172923 b: refs/heads/master c: 5db5d64277bf390056b1a87d0bb288c8b8553f96 h: refs/heads/master i: 172921: b38f037d49d93616523ab7ab41c01e22f0194c77 172919: a98259fc22eb95eebafb5022fc0ecfd28e58c3a6 v: v3 --- [refs] | 2 +- trunk/block/cfq-iosched.c | 61 +++++++++++++++++++++++++--- trunk/drivers/block/drbd/drbd_req.c | 4 +- trunk/fs/aio.c | 62 +---------------------------- trunk/fs/block_dev.c | 12 +----- trunk/fs/direct-io.c | 10 ++--- trunk/include/linux/backing-dev.h | 13 ------ trunk/include/linux/bio.h | 8 +++- trunk/include/linux/blkdev.h | 13 ++++++ trunk/include/linux/fs.h | 2 + 10 files changed, 88 insertions(+), 99 deletions(-) diff --git a/[refs] b/[refs] index 4d9cb98dd0a8..20cbfec0e81d 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: 4f570f995f68ef77aae7e5a441222f59232f2d0e +refs/heads/master: 5db5d64277bf390056b1a87d0bb288c8b8553f96 diff --git a/trunk/block/cfq-iosched.c b/trunk/block/cfq-iosched.c index 757010d8fb7a..97d946585bc3 100644 --- a/trunk/block/cfq-iosched.c +++ b/trunk/block/cfq-iosched.c @@ -27,6 +27,8 @@ static const int cfq_slice_sync = HZ / 10; static int cfq_slice_async = HZ / 25; static const int cfq_slice_async_rq = 2; static int cfq_slice_idle = HZ / 125; +static const int cfq_target_latency = HZ * 3/10; /* 300 ms */ +static const int cfq_hist_divisor = 4; /* * offset from end of service tree @@ -148,6 +150,8 @@ struct cfq_data { struct rb_root prio_trees[CFQ_PRIO_LISTS]; unsigned int busy_queues; + unsigned int busy_rt_queues; + unsigned int busy_queues_avg[2]; int rq_in_driver[2]; int sync_flight; @@ -315,10 +319,52 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); } +/* + * get averaged number of queues of RT/BE priority. 
+ * average is updated, with a formula that gives more weight to higher numbers,
+ * to quickly follow sudden increases and decrease slowly
+ */
+
+static inline unsigned
+cfq_get_avg_queues(struct cfq_data *cfqd, bool rt) {
+	unsigned min_q, max_q;
+	unsigned mult = cfq_hist_divisor - 1;
+	unsigned round = cfq_hist_divisor / 2;
+	unsigned busy = cfqd->busy_rt_queues;
+
+	if (!rt)
+		busy = cfqd->busy_queues - cfqd->busy_rt_queues;
+
+	min_q = min(cfqd->busy_queues_avg[rt], busy);
+	max_q = max(cfqd->busy_queues_avg[rt], busy);
+	cfqd->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
+		cfq_hist_divisor;
+	return cfqd->busy_queues_avg[rt];
+}
+
 static inline void
 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
-	cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
+	unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
+	if (cfqd->cfq_latency) {
+		/* interested queues (we consider only the ones with the same
+		 * priority class) */
+		unsigned iq = cfq_get_avg_queues(cfqd, cfq_class_rt(cfqq));
+		unsigned sync_slice = cfqd->cfq_slice[1];
+		unsigned expect_latency = sync_slice * iq;
+		if (expect_latency > cfq_target_latency) {
+			unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
+			/* scale low_slice according to IO priority
+			 * and sync vs async */
+			unsigned low_slice =
+				min(slice, base_low_slice * slice / sync_slice);
+			/* the adapted slice value is scaled to fit all iqs
+			 * into the target latency */
+			slice = max(slice * cfq_target_latency / expect_latency,
+				    low_slice);
+		}
+	}
+	cfqq->slice_end = jiffies + slice;
 	cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
 }
 
@@ -669,7 +715,8 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	BUG_ON(cfq_cfqq_on_rr(cfqq));
 	cfq_mark_cfqq_on_rr(cfqq);
 	cfqd->busy_queues++;
-
+	if (cfq_class_rt(cfqq))
+		cfqd->busy_rt_queues++;
 	cfq_resort_rr_list(cfqd, cfqq);
 }
 
@@ -692,6 +739,8 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 
 	BUG_ON(!cfqd->busy_queues);
 	cfqd->busy_queues--;
+	if (cfq_class_rt(cfqq))
+		cfqd->busy_rt_queues--;
 }
 
 /*
@@ -2359,10 +2408,12 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
 		cfqq->ioprio = IOPRIO_NORM;
 	} else {
 		/*
-		 * unboost the queue (if needed)
+		 * check if we need to unboost the queue
 		 */
-		cfqq->ioprio_class = cfqq->org_ioprio_class;
-		cfqq->ioprio = cfqq->org_ioprio;
+		if (cfqq->ioprio_class != cfqq->org_ioprio_class)
+			cfqq->ioprio_class = cfqq->org_ioprio_class;
+		if (cfqq->ioprio != cfqq->org_ioprio)
+			cfqq->ioprio = cfqq->org_ioprio;
 	}
 }
 
diff --git a/trunk/drivers/block/drbd/drbd_req.c b/trunk/drivers/block/drbd/drbd_req.c
index 3678d3d66c6c..d3426ff405b3 100644
--- a/trunk/drivers/block/drbd/drbd_req.c
+++ b/trunk/drivers/block/drbd/drbd_req.c
@@ -40,7 +40,7 @@ static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req
 	part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
 	part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
 	part_stat_unlock();
-	mdev->vdisk->part0.in_flight[rw]++;
+	mdev->vdisk->part0.in_flight++;
 }
 
 /* Update disk stats when completing request upwards */
@@ -53,7 +53,7 @@ static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
 	part_stat_add(cpu, &mdev->vdisk->part0, ticks[rw], duration);
 	part_round_stats(cpu, &mdev->vdisk->part0);
 	part_stat_unlock();
-	mdev->vdisk->part0.in_flight[rw]--;
+	mdev->vdisk->part0.in_flight--;
 }
 
 static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw)
diff --git a/trunk/fs/aio.c b/trunk/fs/aio.c
index c30dfc006108..02a2c9340573 100644
--- a/trunk/fs/aio.c
+++ b/trunk/fs/aio.c
@@ -15,7 +15,6 @@
 #include <linux/aio_abi.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
-#include <linux/backing-dev.h>
 #include <linux/uio.h>
 
 #define DEBUG 0
@@ -33,9 +32,6 @@
 #include <linux/workqueue.h>
 #include <linux/security.h>
 #include <linux/eventfd.h>
-#include <linux/blkdev.h>
-#include <linux/mempool.h>
-#include <linux/hash.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -64,14 +60,6 @@ static DECLARE_WORK(fput_work, aio_fput_routine);
 static DEFINE_SPINLOCK(fput_lock);
 static LIST_HEAD(fput_head);
 
-#define AIO_BATCH_HASH_BITS	3 /* allocated on-stack, so don't go crazy */
-#define AIO_BATCH_HASH_SIZE	(1 << AIO_BATCH_HASH_BITS)
-struct aio_batch_entry {
-	struct hlist_node list;
-	struct address_space *mapping;
-};
-mempool_t *abe_pool;
-
 static void aio_kick_handler(struct work_struct *);
 static void aio_queue_work(struct kioctx *);
@@ -85,8 +73,6 @@ static int __init aio_setup(void)
 	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
 	aio_wq = create_workqueue("aio");
-	abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
-	BUG_ON(!abe_pool);
 
 	pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
@@ -1545,44 +1531,8 @@ static int aio_wake_function(wait_queue_t *wait, unsigned mode,
 	return 1;
 }
 
-static void aio_batch_add(struct address_space *mapping,
-			  struct hlist_head *batch_hash)
-{
-	struct aio_batch_entry *abe;
-	struct hlist_node *pos;
-	unsigned bucket;
-
-	bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS);
-	hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) {
-		if (abe->mapping == mapping)
-			return;
-	}
-
-	abe = mempool_alloc(abe_pool, GFP_KERNEL);
-	BUG_ON(!igrab(mapping->host));
-	abe->mapping = mapping;
-	hlist_add_head(&abe->list, &batch_hash[bucket]);
-	return;
-}
-
-static void aio_batch_free(struct hlist_head *batch_hash)
-{
-	struct aio_batch_entry *abe;
-	struct hlist_node *pos, *n;
-	int i;
-
-	for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) {
-		hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) {
-			blk_run_address_space(abe->mapping);
-			iput(abe->mapping->host);
-			hlist_del(&abe->list);
-			mempool_free(abe, abe_pool);
-		}
-	}
-}
-
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-			 struct iocb *iocb, struct hlist_head *batch_hash)
+			 struct iocb *iocb)
 {
 	struct kiocb *req;
 	struct file *file;
@@ -1658,12 +1608,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		;
 	}
 	spin_unlock_irq(&ctx->ctx_lock);
-	if (req->ki_opcode == IOCB_CMD_PREAD ||
-	    req->ki_opcode == IOCB_CMD_PREADV ||
-	    req->ki_opcode == IOCB_CMD_PWRITE ||
-	    req->ki_opcode == IOCB_CMD_PWRITEV)
-		aio_batch_add(file->f_mapping, batch_hash);
-
 	aio_put_req(req);	/* drop extra ref to req */
 	return 0;
@@ -1691,7 +1635,6 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
 	struct kioctx *ctx;
 	long ret = 0;
 	int i;
-	struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, };
 
 	if (unlikely(nr < 0))
 		return -EINVAL;
@@ -1723,11 +1666,10 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
 			break;
 		}
 
-		ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash);
+		ret = io_submit_one(ctx, user_iocb, &tmp);
 		if (ret)
 			break;
 	}
-	aio_batch_free(batch_hash);
 
 	put_ioctx(ctx);
 	return i ?
i : ret; diff --git a/trunk/fs/block_dev.c b/trunk/fs/block_dev.c index dde91e7e1c3a..9cf4b926f8e4 100644 --- a/trunk/fs/block_dev.c +++ b/trunk/fs/block_dev.c @@ -405,17 +405,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin) static int block_fsync(struct file *filp, struct dentry *dentry, int datasync) { - struct block_device *bdev = I_BDEV(filp->f_mapping->host); - int error; - - error = sync_blockdev(bdev); - if (error) - return error; - - error = blkdev_issue_flush(bdev, NULL); - if (error == -EOPNOTSUPP) - error = 0; - return error; + return sync_blockdev(I_BDEV(filp->f_mapping->host)); } /* diff --git a/trunk/fs/direct-io.c b/trunk/fs/direct-io.c index 3af761c8c5cc..8b10b87dc01a 100644 --- a/trunk/fs/direct-io.c +++ b/trunk/fs/direct-io.c @@ -1028,6 +1028,9 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, if (dio->bio) dio_bio_submit(dio); + /* All IO is now issued, send it on its way */ + blk_run_address_space(inode->i_mapping); + /* * It is possible that, we return short IO due to end of file. * In that case, we need to release all the pages we got hold on. @@ -1054,11 +1057,8 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, ((rw & READ) || (dio->result == dio->size))) ret = -EIOCBQUEUED; - if (ret != -EIOCBQUEUED) { - /* All IO is now issued, send it on its way */ - blk_run_address_space(inode->i_mapping); + if (ret != -EIOCBQUEUED) dio_await_completion(dio); - } /* * Sync will always be dropping the final ref and completing the @@ -1124,7 +1124,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, int acquire_i_mutex = 0; if (rw & WRITE) - rw = WRITE_SYNC_PLUG; + rw = WRITE_ODIRECT; if (bdev) bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); diff --git a/trunk/include/linux/backing-dev.h b/trunk/include/linux/backing-dev.h index fcbc26af00e4..b449e738533a 100644 --- a/trunk/include/linux/backing-dev.h +++ b/trunk/include/linux/backing-dev.h @@ -331,17 +331,4 @@ static inline int bdi_sched_wait(void *word) return 0; } -static inline void blk_run_backing_dev(struct backing_dev_info *bdi, - struct page *page) -{ - if (bdi && bdi->unplug_io_fn) - bdi->unplug_io_fn(bdi, page); -} - -static inline void blk_run_address_space(struct address_space *mapping) -{ - if (mapping) - blk_run_backing_dev(mapping->backing_dev_info, NULL); -} - #endif /* _LINUX_BACKING_DEV_H */ diff --git a/trunk/include/linux/bio.h b/trunk/include/linux/bio.h index 474792b825d0..5be93f18d842 100644 --- a/trunk/include/linux/bio.h +++ b/trunk/include/linux/bio.h @@ -450,8 +450,11 @@ extern struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly; /* * remember never ever reenable interrupts between a bvec_kmap_irq and * bvec_kunmap_irq! + * + * This function MUST be inlined - it plays with the CPU interrupt flags. 
*/ -static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) +static __always_inline char *bvec_kmap_irq(struct bio_vec *bvec, + unsigned long *flags) { unsigned long addr; @@ -467,7 +470,8 @@ static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) return (char *) addr + bvec->bv_offset; } -static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) +static __always_inline void bvec_kunmap_irq(char *buffer, + unsigned long *flags) { unsigned long ptr = (unsigned long) buffer & PAGE_MASK; diff --git a/trunk/include/linux/blkdev.h b/trunk/include/linux/blkdev.h index 39c601f783a0..221cecd86bd3 100644 --- a/trunk/include/linux/blkdev.h +++ b/trunk/include/linux/blkdev.h @@ -823,6 +823,19 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev) return bdev->bd_disk->queue; } +static inline void blk_run_backing_dev(struct backing_dev_info *bdi, + struct page *page) +{ + if (bdi && bdi->unplug_io_fn) + bdi->unplug_io_fn(bdi, page); +} + +static inline void blk_run_address_space(struct address_space *mapping) +{ + if (mapping) + blk_run_backing_dev(mapping->backing_dev_info, NULL); +} + /* * blk_rq_pos() : the current sector * blk_rq_bytes() : bytes left in the entire request diff --git a/trunk/include/linux/fs.h b/trunk/include/linux/fs.h index 2f5fca4147c2..2620a8c63571 100644 --- a/trunk/include/linux/fs.h +++ b/trunk/include/linux/fs.h @@ -129,6 +129,7 @@ struct inodes_stat_t { * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device * immediately after submission. The write equivalent * of READ_SYNC. + * WRITE_ODIRECT Special case write for O_DIRECT only. * SWRITE_SYNC * SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer. * See SWRITE. @@ -150,6 +151,7 @@ struct inodes_stat_t { #define READ_META (READ | (1 << BIO_RW_META)) #define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) #define WRITE_SYNC (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) +#define WRITE_ODIRECT (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) #define SWRITE_SYNC_PLUG \ (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) #define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG))
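
Note: the following standalone userspace sketch (not part of the patch) walks through
the slice-adaptation arithmetic from the cfq-iosched hunks above. It assumes HZ = 1000,
so cfq_slice_sync = 100 jiffies, cfq_slice_idle = 8 jiffies and cfq_target_latency =
300 jiffies; update_avg(), adapt_slice() and the main() driver are illustrative names,
not kernel interfaces.

#include <stdio.h>

#define HZ			1000
#define CFQ_SLICE_SYNC		(HZ / 10)	/* 100 jiffies */
#define CFQ_SLICE_IDLE		(HZ / 125)	/*   8 jiffies */
#define CFQ_TARGET_LATENCY	(HZ * 3 / 10)	/* 300 jiffies */
#define CFQ_HIST_DIVISOR	4

static unsigned avg;	/* mirrors cfqd->busy_queues_avg[] for one class */

/* same formula as cfq_get_avg_queues(): (3 * max + min + 2) / 4 */
static unsigned update_avg(unsigned busy)
{
	unsigned min_q = busy < avg ? busy : avg;
	unsigned max_q = busy < avg ? avg : busy;

	avg = ((CFQ_HIST_DIVISOR - 1) * max_q + min_q +
	       CFQ_HIST_DIVISOR / 2) / CFQ_HIST_DIVISOR;
	return avg;
}

/* same scaling as the cfq_set_prio_slice() hunk above */
static unsigned adapt_slice(unsigned slice, unsigned iq)
{
	unsigned expect_latency = CFQ_SLICE_SYNC * iq;

	if (expect_latency > CFQ_TARGET_LATENCY) {
		unsigned base_low_slice = 2 * CFQ_SLICE_IDLE;
		unsigned low_slice = base_low_slice * slice / CFQ_SLICE_SYNC;

		if (low_slice > slice)
			low_slice = slice;
		/* shrink the slice so all iqs fit the target latency ... */
		slice = slice * CFQ_TARGET_LATENCY / expect_latency;
		/* ... but never below the scaled lower bound */
		if (slice < low_slice)
			slice = low_slice;
	}
	return slice;
}

int main(void)
{
	/* e.g. 8 busy sync queues: the average jumps to 6 on the first update */
	unsigned iq = update_avg(8);

	/* 100 * 6 = 600 > 300, so a 100-jiffy slice is halved to 50 */
	printf("iq=%u slice=%u\n", iq, adapt_slice(CFQ_SLICE_SYNC, iq));
	return 0;
}

With eight busy sync queues, the history-weighted average jumps to 6 after one update
(following the increase quickly), and the 100-jiffy base slice is cut to 50 jiffies so
that 6 queues together fit the 300 ms latency target.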
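Similarly, a small sketch of how the fs.h write-flag macros in the last hunk compose.
The BIO_RW_* bit positions below are made up for illustration (the real values come
from the enum in include/linux/bio.h); the point is only that WRITE_ODIRECT sets
BIO_RW_UNPLUG like WRITE_SYNC but leaves BIO_RW_NOIDLE clear.

#include <stdio.h>

/* hypothetical bit positions, for illustration only */
enum { BIO_RW = 0, BIO_RW_SYNCIO = 3, BIO_RW_UNPLUG = 4, BIO_RW_NOIDLE = 5 };

#define WRITE			(1 << BIO_RW)
#define WRITE_SYNC_PLUG		(WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE))
#define WRITE_SYNC		(WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG))
#define WRITE_ODIRECT		(WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG))

int main(void)
{
	/* WRITE_ODIRECT unplugs the queue immediately but, unlike WRITE_SYNC,
	 * does not set BIO_RW_NOIDLE, so CFQ may still idle for O_DIRECT writers */
	printf("WRITE_SYNC=%#x WRITE_ODIRECT=%#x\n", WRITE_SYNC, WRITE_ODIRECT);
	return 0;
}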