From 2245a974910ae88497d3f70cffba47e287866e7f Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Fri, 20 Apr 2012 13:28:24 -0700 Subject: [PATCH] --- yaml --- r: 309382 b: refs/heads/master c: 09a46e739780aab2eadf47afdefa70c9cd69d83d h: refs/heads/master v: v3 --- [refs] | 2 +- .../bindings/iommu/nvidia,tegra20-gart.txt | 14 - trunk/Documentation/kernel-parameters.txt | 6 - trunk/MAINTAINERS | 6 - trunk/block/Kconfig.iosched | 4 + trunk/block/blk-cgroup.c | 2100 +++++++++++------ trunk/block/blk-cgroup.h | 647 +++-- trunk/block/blk-core.c | 281 +-- trunk/block/blk-ioc.c | 126 +- trunk/block/blk-sysfs.c | 6 +- trunk/block/blk-throttle.c | 697 +++--- trunk/block/blk.h | 32 +- trunk/block/cfq-iosched.c | 1072 +++------ trunk/block/cfq.h | 115 + trunk/block/deadline-iosched.c | 8 +- trunk/block/elevator.c | 121 +- trunk/block/noop-iosched.c | 8 +- trunk/drivers/block/drbd/drbd_actlog.c | 104 +- trunk/drivers/block/drbd/drbd_bitmap.c | 146 +- trunk/drivers/block/drbd/drbd_int.h | 90 +- trunk/drivers/block/drbd/drbd_main.c | 357 +-- trunk/drivers/block/drbd/drbd_nl.c | 48 +- trunk/drivers/block/drbd/drbd_proc.c | 2 +- trunk/drivers/block/drbd/drbd_receiver.c | 95 +- trunk/drivers/block/drbd/drbd_req.c | 132 +- trunk/drivers/block/drbd/drbd_req.h | 19 +- trunk/drivers/block/drbd/drbd_worker.c | 31 +- trunk/drivers/block/floppy.c | 161 +- trunk/drivers/block/xen-blkfront.c | 44 +- trunk/drivers/gpu/drm/drm_crtc.c | 8 +- trunk/drivers/gpu/drm/drm_edid.c | 22 +- trunk/drivers/gpu/drm/i915/i915_debugfs.c | 2 - trunk/drivers/gpu/drm/i915/i915_gem.c | 16 +- trunk/drivers/gpu/drm/i915/i915_irq.c | 37 +- trunk/drivers/gpu/drm/i915/intel_display.c | 56 +- trunk/drivers/gpu/drm/i915/intel_dp.c | 24 +- trunk/drivers/gpu/drm/i915/intel_i2c.c | 21 +- trunk/drivers/gpu/drm/i915/intel_lvds.c | 8 - trunk/drivers/gpu/drm/i915/intel_pm.c | 64 +- trunk/drivers/gpu/drm/i915/intel_sdvo.c | 12 +- trunk/drivers/gpu/drm/i915/intel_sdvo_regs.h | 5 - trunk/drivers/gpu/drm/i915/intel_tv.c | 53 - trunk/drivers/gpu/drm/radeon/ni.c | 2 +- trunk/drivers/gpu/drm/radeon/radeon.h | 1 + .../drivers/gpu/drm/radeon/radeon_atombios.c | 4 +- trunk/drivers/gpu/drm/radeon/radeon_cs.c | 27 +- trunk/drivers/gpu/drm/radeon/radeon_ring.c | 25 + trunk/drivers/gpu/drm/udl/udl_gem.c | 4 +- trunk/drivers/iommu/amd_iommu.c | 37 +- trunk/drivers/iommu/iommu.c | 5 +- trunk/drivers/iommu/omap-iommu.c | 32 +- trunk/drivers/iommu/tegra-gart.c | 20 +- trunk/drivers/iommu/tegra-smmu.c | 2 +- trunk/drivers/remoteproc/remoteproc_core.c | 4 +- trunk/drivers/watchdog/watchdog_dev.c | 2 + trunk/fs/bio.c | 61 - trunk/fs/ioprio.c | 2 +- trunk/fs/splice.c | 4 +- trunk/include/linux/bio.h | 8 - trunk/include/linux/blk_types.h | 10 - trunk/include/linux/blkdev.h | 20 +- trunk/include/linux/drbd.h | 6 +- trunk/include/linux/drbd_limits.h | 7 +- trunk/include/linux/drbd_nl.h | 5 +- trunk/include/linux/elevator.h | 8 +- trunk/include/linux/iocontext.h | 39 +- trunk/include/linux/iommu.h | 10 +- trunk/include/linux/ioprio.h | 22 +- trunk/init/Kconfig | 2 +- trunk/kernel/fork.c | 5 +- trunk/lib/dma-debug.c | 10 +- trunk/mm/hugetlb.c | 3 +- 72 files changed, 3524 insertions(+), 3665 deletions(-) delete mode 100644 trunk/Documentation/devicetree/bindings/iommu/nvidia,tegra20-gart.txt create mode 100644 trunk/block/cfq.h diff --git a/[refs] b/[refs] index 99f4c0c3581b..0be9afcd6611 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: 6bb340c7868fbfd7bd0e8a0e23397a2bcb528429 +refs/heads/master: 09a46e739780aab2eadf47afdefa70c9cd69d83d diff --git 
a/trunk/Documentation/devicetree/bindings/iommu/nvidia,tegra20-gart.txt b/trunk/Documentation/devicetree/bindings/iommu/nvidia,tegra20-gart.txt deleted file mode 100644 index 099d9362ebc1..000000000000 --- a/trunk/Documentation/devicetree/bindings/iommu/nvidia,tegra20-gart.txt +++ /dev/null @@ -1,14 +0,0 @@ -NVIDIA Tegra 20 GART - -Required properties: -- compatible: "nvidia,tegra20-gart" -- reg: Two pairs of cells specifying the physical address and size of - the memory controller registers and the GART aperture respectively. - -Example: - - gart { - compatible = "nvidia,tegra20-gart"; - reg = <0x7000f024 0x00000018 /* controller registers */ - 0x58000000 0x02000000>; /* GART aperture */ - }; diff --git a/trunk/Documentation/kernel-parameters.txt b/trunk/Documentation/kernel-parameters.txt index c45513d806ab..b40b413db88e 100644 --- a/trunk/Documentation/kernel-parameters.txt +++ b/trunk/Documentation/kernel-parameters.txt @@ -335,12 +335,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted. requirements as needed. This option does not override iommu=pt - amd_iommu_dump= [HW,X86-64] - Enable AMD IOMMU driver option to dump the ACPI table - for AMD IOMMU. With this option enabled, AMD IOMMU - driver will print ACPI tables for AMD IOMMU during - IOMMU initialization. - amijoy.map= [HW,JOY] Amiga joystick support Map of devices attached to JOY0DAT and JOY1DAT Format: , diff --git a/trunk/MAINTAINERS b/trunk/MAINTAINERS index 64e675d6d478..a246490c95eb 100644 --- a/trunk/MAINTAINERS +++ b/trunk/MAINTAINERS @@ -2818,12 +2818,6 @@ F: Documentation/firmware_class/ F: drivers/base/firmware*.c F: include/linux/firmware.h -FLOPPY DRIVER -M: Jiri Kosina -T: git git://git.kernel.org/pub/scm/linux/kernel/git/jikos/floppy.git -S: Odd fixes -F: drivers/block/floppy.c - FPU EMULATOR M: Bill Metzenthen W: http://floatingpoint.sourceforge.net/emulator/index.html diff --git a/trunk/block/Kconfig.iosched b/trunk/block/Kconfig.iosched index 421bef9c4c48..3199b76f795d 100644 --- a/trunk/block/Kconfig.iosched +++ b/trunk/block/Kconfig.iosched @@ -23,6 +23,8 @@ config IOSCHED_DEADLINE config IOSCHED_CFQ tristate "CFQ I/O scheduler" + # If BLK_CGROUP is a module, CFQ has to be built as module. + depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y default y ---help--- The CFQ I/O scheduler tries to distribute bandwidth equally @@ -32,6 +34,8 @@ config IOSCHED_CFQ This is the default I/O scheduler. + Note: If BLK_CGROUP=m, then CFQ can be built only as module. 
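The new dependency line above relies on Kconfig's tristate arithmetic: symbols order n < m < y, "&&" takes the minimum, "||" the maximum, and "!" reflects around m, so with BLK_CGROUP=m the expression evaluates to m and caps CFQ at module-only, exactly as the added help text says. A toy C model of that evaluation (illustrative only, not kconfig's real implementation):

#include <stdio.h>

/* Toy model of Kconfig tristate logic: n < m < y, "&&" is min, "||" is max,
 * "!" is (y - x), and "SYM=VAL" comparisons yield plain y/n.  Illustrative
 * only; the real evaluator lives in scripts/kconfig. */
enum tristate { n = 0, m = 1, y = 2 };

static enum tristate ts_and(enum tristate a, enum tristate b) { return a < b ? a : b; }
static enum tristate ts_or(enum tristate a, enum tristate b)  { return a > b ? a : b; }
static enum tristate ts_not(enum tristate a) { return (enum tristate)(y - a); }

/* depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y */
static enum tristate cfq_limit(enum tristate blk_cgroup)
{
	enum tristate eq_m = (blk_cgroup == m) ? y : n;
	enum tristate eq_y = (blk_cgroup == y) ? y : n;

	return ts_or(ts_or(ts_and(eq_m, m), ts_not(blk_cgroup)), eq_y);
}

int main(void)
{
	/* BLK_CGROUP=n -> y, BLK_CGROUP=m -> m (module only), BLK_CGROUP=y -> y */
	printf("n->%d m->%d y->%d\n", cfq_limit(n), cfq_limit(m), cfq_limit(y));
	return 0;
}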
+ config CFQ_GROUP_IOSCHED bool "CFQ Group Scheduling support" depends on IOSCHED_CFQ && BLK_CGROUP diff --git a/trunk/block/blk-cgroup.c b/trunk/block/blk-cgroup.c index 02cf6335e9bd..126c341955de 100644 --- a/trunk/block/blk-cgroup.c +++ b/trunk/block/blk-cgroup.c @@ -11,906 +11,1668 @@ * Nauman Rafique */ #include +#include #include #include #include #include #include -#include -#include -#include #include "blk-cgroup.h" -#include "blk.h" +#include #define MAX_KEY_LEN 100 -static DEFINE_MUTEX(blkcg_pol_mutex); +static DEFINE_SPINLOCK(blkio_list_lock); +static LIST_HEAD(blkio_list); -struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT }; -EXPORT_SYMBOL_GPL(blkcg_root); +struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; +EXPORT_SYMBOL_GPL(blkio_root_cgroup); -static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; +/* for encoding cft->private value on file */ +#define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val)) +/* What policy owns the file, proportional or throttle */ +#define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff) +#define BLKIOFILE_ATTR(val) ((val) & 0xffff) -struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) +static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg, + struct blkio_policy_node *pn) { - return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), - struct blkcg, css); + list_add(&pn->node, &blkcg->policy_list); } -EXPORT_SYMBOL_GPL(cgroup_to_blkcg); -static struct blkcg *task_blkcg(struct task_struct *tsk) +static inline bool cftype_blkg_same_policy(struct cftype *cft, + struct blkio_group *blkg) { - return container_of(task_subsys_state(tsk, blkio_subsys_id), - struct blkcg, css); + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + + if (blkg->plid == plid) + return 1; + + return 0; } -struct blkcg *bio_blkcg(struct bio *bio) +/* Determines if policy node matches cgroup file being accessed */ +static inline bool pn_matches_cftype(struct cftype *cft, + struct blkio_policy_node *pn) { - if (bio && bio->bi_css) - return container_of(bio->bi_css, struct blkcg, css); - return task_blkcg(current); + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int fileid = BLKIOFILE_ATTR(cft->private); + + return (plid == pn->plid && fileid == pn->fileid); } -EXPORT_SYMBOL_GPL(bio_blkcg); -static bool blkcg_policy_enabled(struct request_queue *q, - const struct blkcg_policy *pol) +/* Must be called with blkcg->lock held */ +static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) { - return pol && test_bit(pol->plid, q->blkcg_pols); + list_del(&pn->node); } -/** - * blkg_free - free a blkg - * @blkg: blkg to free - * - * Free @blkg which may be partially allocated. 
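The BLKIOFILE_PRIVATE()/BLKIOFILE_POLICY()/BLKIOFILE_ATTR() macros introduced above let one read/write handler serve many cgroup files by packing a (policy id, attribute) pair into the 32-bit cftype->private value. A standalone round-trip of that encoding (macros copied from the patch; the sample plid/attr values are arbitrary):

#include <assert.h>
#include <stdio.h>

/* Same encoding as the patch: policy id in the high 16 bits, attr in the low. */
#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
#define BLKIOFILE_ATTR(val)		((val) & 0xffff)

int main(void)
{
	int plid = 1;	/* e.g. a throttling policy id */
	int attr = 3;	/* e.g. a per-device file id */
	int priv = BLKIOFILE_PRIVATE(plid, attr);

	assert(BLKIOFILE_POLICY(priv) == plid);
	assert(BLKIOFILE_ATTR(priv) == attr);
	printf("private=0x%x policy=%d attr=%d\n",
	       priv, BLKIOFILE_POLICY(priv), BLKIOFILE_ATTR(priv));
	return 0;
}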
- */ -static void blkg_free(struct blkcg_gq *blkg) +/* Must be called with blkcg->lock held */ +static struct blkio_policy_node * +blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev, + enum blkio_policy_id plid, int fileid) { - int i; + struct blkio_policy_node *pn; - if (!blkg) - return; + list_for_each_entry(pn, &blkcg->policy_list, node) { + if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid) + return pn; + } - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - struct blkg_policy_data *pd = blkg->pd[i]; + return NULL; +} - if (!pd) - continue; +struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) +{ + return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), + struct blkio_cgroup, css); +} +EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); + +struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk) +{ + return container_of(task_subsys_state(tsk, blkio_subsys_id), + struct blkio_cgroup, css); +} +EXPORT_SYMBOL_GPL(task_blkio_cgroup); - if (pol && pol->pd_exit_fn) - pol->pd_exit_fn(blkg); +static inline void +blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) +{ + struct blkio_policy_type *blkiop; - kfree(pd); + list_for_each_entry(blkiop, &blkio_list, list) { + /* If this policy does not own the blkg, do not send updates */ + if (blkiop->plid != blkg->plid) + continue; + if (blkiop->ops.blkio_update_group_weight_fn) + blkiop->ops.blkio_update_group_weight_fn(blkg->key, + blkg, weight); } - - kfree(blkg); } -/** - * blkg_alloc - allocate a blkg - * @blkcg: block cgroup the new blkg is associated with - * @q: request_queue the new blkg is associated with - * - * Allocate a new blkg assocating @blkcg and @q. - */ -static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q) +static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, + int fileid) { - struct blkcg_gq *blkg; - int i; + struct blkio_policy_type *blkiop; - /* alloc and init base part */ - blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node); - if (!blkg) - return NULL; + list_for_each_entry(blkiop, &blkio_list, list) { - blkg->q = q; - INIT_LIST_HEAD(&blkg->q_node); - blkg->blkcg = blkcg; - blkg->refcnt = 1; + /* If this policy does not own the blkg, do not send updates */ + if (blkiop->plid != blkg->plid) + continue; + + if (fileid == BLKIO_THROTL_read_bps_device + && blkiop->ops.blkio_update_group_read_bps_fn) + blkiop->ops.blkio_update_group_read_bps_fn(blkg->key, + blkg, bps); + + if (fileid == BLKIO_THROTL_write_bps_device + && blkiop->ops.blkio_update_group_write_bps_fn) + blkiop->ops.blkio_update_group_write_bps_fn(blkg->key, + blkg, bps); + } +} + +static inline void blkio_update_group_iops(struct blkio_group *blkg, + unsigned int iops, int fileid) +{ + struct blkio_policy_type *blkiop; - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - struct blkg_policy_data *pd; + list_for_each_entry(blkiop, &blkio_list, list) { - if (!blkcg_policy_enabled(q, pol)) + /* If this policy does not own the blkg, do not send updates */ + if (blkiop->plid != blkg->plid) continue; - /* alloc per-policy data and attach it to blkg */ - pd = kzalloc_node(pol->pd_size, GFP_ATOMIC, q->node); - if (!pd) { - blkg_free(blkg); - return NULL; - } + if (fileid == BLKIO_THROTL_read_iops_device + && blkiop->ops.blkio_update_group_read_iops_fn) + blkiop->ops.blkio_update_group_read_iops_fn(blkg->key, + blkg, iops); - blkg->pd[i] = pd; - pd->blkg = blkg; + if (fileid == 
BLKIO_THROTL_write_iops_device + && blkiop->ops.blkio_update_group_write_iops_fn) + blkiop->ops.blkio_update_group_write_iops_fn(blkg->key, + blkg,iops); } +} - /* invoke per-policy init */ - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; +/* + * Add to the appropriate stat variable depending on the request type. + * This should be called with the blkg->stats_lock held. + */ +static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction, + bool sync) +{ + if (direction) + stat[BLKIO_STAT_WRITE] += add; + else + stat[BLKIO_STAT_READ] += add; + if (sync) + stat[BLKIO_STAT_SYNC] += add; + else + stat[BLKIO_STAT_ASYNC] += add; +} - if (blkcg_policy_enabled(blkg->q, pol)) - pol->pd_init_fn(blkg); +/* + * Decrements the appropriate stat variable if non-zero depending on the + * request type. Panics on value being zero. + * This should be called with the blkg->stats_lock held. + */ +static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync) +{ + if (direction) { + BUG_ON(stat[BLKIO_STAT_WRITE] == 0); + stat[BLKIO_STAT_WRITE]--; + } else { + BUG_ON(stat[BLKIO_STAT_READ] == 0); + stat[BLKIO_STAT_READ]--; + } + if (sync) { + BUG_ON(stat[BLKIO_STAT_SYNC] == 0); + stat[BLKIO_STAT_SYNC]--; + } else { + BUG_ON(stat[BLKIO_STAT_ASYNC] == 0); + stat[BLKIO_STAT_ASYNC]--; } - - return blkg; } -static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, - struct request_queue *q) +#ifdef CONFIG_DEBUG_BLK_CGROUP +/* This should be called with the blkg->stats_lock held. */ +static void blkio_set_start_group_wait_time(struct blkio_group *blkg, + struct blkio_group *curr_blkg) { - struct blkcg_gq *blkg; + if (blkio_blkg_waiting(&blkg->stats)) + return; + if (blkg == curr_blkg) + return; + blkg->stats.start_group_wait_time = sched_clock(); + blkio_mark_blkg_waiting(&blkg->stats); +} - blkg = rcu_dereference(blkcg->blkg_hint); - if (blkg && blkg->q == q) - return blkg; +/* This should be called with the blkg->stats_lock held. */ +static void blkio_update_group_wait_time(struct blkio_group_stats *stats) +{ + unsigned long long now; - /* - * Hint didn't match. Look up from the radix tree. Note that we - * may not be holding queue_lock and thus are not sure whether - * @blkg from blkg_tree has already been removed or not, so we - * can't update hint to the lookup result. Leave it to the caller. - */ - blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); - if (blkg && blkg->q == q) - return blkg; + if (!blkio_blkg_waiting(stats)) + return; - return NULL; + now = sched_clock(); + if (time_after64(now, stats->start_group_wait_time)) + stats->group_wait_time += now - stats->start_group_wait_time; + blkio_clear_blkg_waiting(stats); } -/** - * blkg_lookup - lookup blkg for the specified blkcg - q pair - * @blkcg: blkcg of interest - * @q: request_queue of interest - * - * Lookup blkg for the @blkcg - @q pair. This function should be called - * under RCU read lock and is guaranteed to return %NULL if @q is bypassing - * - see blk_queue_bypass_start() for details. - */ -struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) +/* This should be called with the blkg->stats_lock held. 
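blkio_add_stat() and blkio_check_and_dec_stat() above double-account every event, once by direction and once by synchronicity, so READ+WRITE and SYNC+ASYNC both sum to the same total. A minimal userspace rendering of that bucketing (locking omitted):

#include <stdint.h>
#include <stdio.h>

enum { ST_READ, ST_WRITE, ST_SYNC, ST_ASYNC, ST_NR };

/* Mirrors blkio_add_stat(): every sample lands in exactly one of
 * {READ,WRITE} and one of {SYNC,ASYNC}, so both pairs sum to the total. */
static void add_stat(uint64_t *stat, uint64_t add, int direction, int sync)
{
	stat[direction ? ST_WRITE : ST_READ] += add;
	stat[sync ? ST_SYNC : ST_ASYNC] += add;
}

int main(void)
{
	uint64_t stat[ST_NR] = { 0 };

	add_stat(stat, 4096, 1, 1);	/* sync write */
	add_stat(stat, 512, 0, 0);	/* async read */

	printf("R=%llu W=%llu S=%llu A=%llu\n",
	       (unsigned long long)stat[ST_READ],
	       (unsigned long long)stat[ST_WRITE],
	       (unsigned long long)stat[ST_SYNC],
	       (unsigned long long)stat[ST_ASYNC]);
	return 0;
}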
*/ +static void blkio_end_empty_time(struct blkio_group_stats *stats) { - WARN_ON_ONCE(!rcu_read_lock_held()); + unsigned long long now; + + if (!blkio_blkg_empty(stats)) + return; - if (unlikely(blk_queue_bypass(q))) - return NULL; - return __blkg_lookup(blkcg, q); + now = sched_clock(); + if (time_after64(now, stats->start_empty_time)) + stats->empty_time += now - stats->start_empty_time; + blkio_clear_blkg_empty(stats); } -EXPORT_SYMBOL_GPL(blkg_lookup); -static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) - __releases(q->queue_lock) __acquires(q->queue_lock) +void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) { - struct blkcg_gq *blkg; - int ret; + unsigned long flags; - WARN_ON_ONCE(!rcu_read_lock_held()); - lockdep_assert_held(q->queue_lock); + spin_lock_irqsave(&blkg->stats_lock, flags); + BUG_ON(blkio_blkg_idling(&blkg->stats)); + blkg->stats.start_idle_time = sched_clock(); + blkio_mark_blkg_idling(&blkg->stats); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats); - /* lookup and update hint on success, see __blkg_lookup() for details */ - blkg = __blkg_lookup(blkcg, q); - if (blkg) { - rcu_assign_pointer(blkcg->blkg_hint, blkg); - return blkg; +void blkiocg_update_idle_time_stats(struct blkio_group *blkg) +{ + unsigned long flags; + unsigned long long now; + struct blkio_group_stats *stats; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + if (blkio_blkg_idling(stats)) { + now = sched_clock(); + if (time_after64(now, stats->start_idle_time)) + stats->idle_time += now - stats->start_idle_time; + blkio_clear_blkg_idling(stats); } + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats); - /* blkg holds a reference to blkcg */ - if (!css_tryget(&blkcg->css)) - return ERR_PTR(-EINVAL); +void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) +{ + unsigned long flags; + struct blkio_group_stats *stats; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + stats->avg_queue_size_sum += + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] + + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]; + stats->avg_queue_size_samples++; + blkio_update_group_wait_time(stats); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats); - /* allocate */ - ret = -ENOMEM; - blkg = blkg_alloc(blkcg, q); - if (unlikely(!blkg)) - goto err_put; +void blkiocg_set_start_empty_time(struct blkio_group *blkg) +{ + unsigned long flags; + struct blkio_group_stats *stats; - /* insert */ - ret = radix_tree_preload(GFP_ATOMIC); - if (ret) - goto err_free; + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; - spin_lock(&blkcg->lock); - ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); - if (likely(!ret)) { - hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); - list_add(&blkg->q_node, &q->blkg_list); + if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] || + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) { + spin_unlock_irqrestore(&blkg->stats_lock, flags); + return; } - spin_unlock(&blkcg->lock); - radix_tree_preload_end(); + /* + * group is already marked empty. This can happen if cfqq got new + * request in parent group and moved to this group while being added + * to service tree. Just ignore the event and move on. 
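All of the wait/idle/empty accounting here follows one shape: mark a state flag and stamp a start time from sched_clock(), then on the closing event add now - start, but only when time_after64() says the clock moved forward, since sched_clock() readings taken on different CPUs can appear to go backwards. A userspace sketch of that guard with CLOCK_MONOTONIC standing in for sched_clock() (the struct and function names are mine):

#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

struct interval_stat {
	int running;		/* analogue of the blkg waiting/idling flags */
	uint64_t start;
	uint64_t total;
};

static void interval_start(struct interval_stat *s)
{
	if (s->running)
		return;
	s->start = now_ns();
	s->running = 1;
}

static void interval_end(struct interval_stat *s)
{
	uint64_t now = now_ns();

	if (!s->running)
		return;
	if (now > s->start)	/* the time_after64() guard in the patch */
		s->total += now - s->start;
	s->running = 0;
}

int main(void)
{
	struct interval_stat s = { 0 };

	interval_start(&s);
	interval_end(&s);
	printf("accumulated %llu ns\n", (unsigned long long)s.total);
	return 0;
}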
+ */ + if(blkio_blkg_empty(stats)) { + spin_unlock_irqrestore(&blkg->stats_lock, flags); + return; + } - if (!ret) - return blkg; -err_free: - blkg_free(blkg); -err_put: - css_put(&blkcg->css); - return ERR_PTR(ret); + stats->start_empty_time = sched_clock(); + blkio_mark_blkg_empty(stats); + spin_unlock_irqrestore(&blkg->stats_lock, flags); } +EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time); -struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) +void blkiocg_update_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) { - /* - * This could be the first entry point of blkcg implementation and - * we shouldn't allow anything to go through for a bypassing queue. - */ - if (unlikely(blk_queue_bypass(q))) - return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY); - return __blkg_lookup_create(blkcg, q); + blkg->stats.dequeue += dequeue; +} +EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats); +#else +static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg, + struct blkio_group *curr_blkg) {} +static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {} +#endif + +void blkiocg_update_io_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, + bool sync) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction, + sync); + blkio_end_empty_time(&blkg->stats); + blkio_set_start_group_wait_time(blkg, curr_blkg); + spin_unlock_irqrestore(&blkg->stats_lock, flags); } -EXPORT_SYMBOL_GPL(blkg_lookup_create); +EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats); -static void blkg_destroy(struct blkcg_gq *blkg) +void blkiocg_update_io_remove_stats(struct blkio_group *blkg, + bool direction, bool sync) { - struct request_queue *q = blkg->q; - struct blkcg *blkcg = blkg->blkcg; + unsigned long flags; - lockdep_assert_held(q->queue_lock); - lockdep_assert_held(&blkcg->lock); + spin_lock_irqsave(&blkg->stats_lock, flags); + blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], + direction, sync); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); - /* Something wrong if we are trying to remove same group twice */ - WARN_ON_ONCE(list_empty(&blkg->q_node)); - WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); +void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time, + unsigned long unaccounted_time) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkg->stats.time += time; +#ifdef CONFIG_DEBUG_BLK_CGROUP + blkg->stats.unaccounted_time += unaccounted_time; +#endif + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); - radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); - list_del_init(&blkg->q_node); - hlist_del_init_rcu(&blkg->blkcg_node); +/* + * should be called under rcu read lock or queue lock to make sure blkg pointer + * is valid. + */ +void blkiocg_update_dispatch_stats(struct blkio_group *blkg, + uint64_t bytes, bool direction, bool sync) +{ + struct blkio_group_stats_cpu *stats_cpu; + unsigned long flags; /* - * Both setting lookup hint to and clearing it from @blkg are done - * under queue_lock. If it's not pointing to @blkg now, it never - * will. Hint assignment itself can race safely. + * Disabling interrupts to provide mutual exclusion between two + * writes on same cpu. It probably is not needed for 64bit. Not + * optimizing that case yet. 
*/ - if (rcu_dereference_raw(blkcg->blkg_hint) == blkg) - rcu_assign_pointer(blkcg->blkg_hint, NULL); + local_irq_save(flags); + + stats_cpu = this_cpu_ptr(blkg->stats_cpu); + + u64_stats_update_begin(&stats_cpu->syncp); + stats_cpu->sectors += bytes >> 9; + blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED], + 1, direction, sync); + blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES], + bytes, direction, sync); + u64_stats_update_end(&stats_cpu->syncp); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); + +void blkiocg_update_completion_stats(struct blkio_group *blkg, + uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) +{ + struct blkio_group_stats *stats; + unsigned long flags; + unsigned long long now = sched_clock(); + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + if (time_after64(now, io_start_time)) + blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME], + now - io_start_time, direction, sync); + if (time_after64(io_start_time, start_time)) + blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME], + io_start_time - start_time, direction, sync); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); + +/* Merged stats are per cpu. */ +void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, + bool sync) +{ + struct blkio_group_stats_cpu *stats_cpu; + unsigned long flags; /* - * Put the reference taken at the time of creation so that when all - * queues are gone, group can be destroyed. + * Disabling interrupts to provide mutual exclusion between two + * writes on same cpu. It probably is not needed for 64bit. Not + * optimizing that case yet. */ - blkg_put(blkg); + local_irq_save(flags); + + stats_cpu = this_cpu_ptr(blkg->stats_cpu); + + u64_stats_update_begin(&stats_cpu->syncp); + blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1, + direction, sync); + u64_stats_update_end(&stats_cpu->syncp); + local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); -/** - * blkg_destroy_all - destroy all blkgs associated with a request_queue - * @q: request_queue of interest - * - * Destroy all blkgs associated with @q. +/* + * This function allocates the per cpu stats for blkio_group. Should be called + * from sleepable context as alloc_per_cpu() requires that. */ -static void blkg_destroy_all(struct request_queue *q) +int blkio_alloc_blkg_stats(struct blkio_group *blkg) { - struct blkcg_gq *blkg, *n; + /* Allocate memory for per cpu stats */ + blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu); + if (!blkg->stats_cpu) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats); - lockdep_assert_held(q->queue_lock); +void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key, dev_t dev, + enum blkio_policy_id plid) +{ + unsigned long flags; + + spin_lock_irqsave(&blkcg->lock, flags); + spin_lock_init(&blkg->stats_lock); + rcu_assign_pointer(blkg->key, key); + blkg->blkcg_id = css_id(&blkcg->css); + hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); + blkg->plid = plid; + spin_unlock_irqrestore(&blkcg->lock, flags); + /* Need to take css reference ? 
*/ + cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); + blkg->dev = dev; +} +EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); - list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { - struct blkcg *blkcg = blkg->blkcg; +static void __blkiocg_del_blkio_group(struct blkio_group *blkg) +{ + hlist_del_init_rcu(&blkg->blkcg_node); + blkg->blkcg_id = 0; +} - spin_lock(&blkcg->lock); - blkg_destroy(blkg); - spin_unlock(&blkcg->lock); +/* + * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1 + * indicating that blk_group was unhashed by the time we got to it. + */ +int blkiocg_del_blkio_group(struct blkio_group *blkg) +{ + struct blkio_cgroup *blkcg; + unsigned long flags; + struct cgroup_subsys_state *css; + int ret = 1; + + rcu_read_lock(); + css = css_lookup(&blkio_subsys, blkg->blkcg_id); + if (css) { + blkcg = container_of(css, struct blkio_cgroup, css); + spin_lock_irqsave(&blkcg->lock, flags); + if (!hlist_unhashed(&blkg->blkcg_node)) { + __blkiocg_del_blkio_group(blkg); + ret = 0; + } + spin_unlock_irqrestore(&blkcg->lock, flags); } + + rcu_read_unlock(); + return ret; } +EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group); -static void blkg_rcu_free(struct rcu_head *rcu_head) +/* called under rcu_read_lock(). */ +struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { - blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head)); + struct blkio_group *blkg; + struct hlist_node *n; + void *__key; + + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { + __key = blkg->key; + if (__key == key) + return blkg; + } + + return NULL; } +EXPORT_SYMBOL_GPL(blkiocg_lookup_group); -void __blkg_release(struct blkcg_gq *blkg) +static void blkio_reset_stats_cpu(struct blkio_group *blkg) { - /* release the extra blkcg reference this blkg has been holding */ - css_put(&blkg->blkcg->css); - + struct blkio_group_stats_cpu *stats_cpu; + int i, j, k; /* - * A group is freed in rcu manner. But having an rcu lock does not - * mean that one can access all the fields of blkg and assume these - * are valid. For example, don't try to follow throtl_data and - * request queue links. + * Note: On 64 bit arch this should not be an issue. This has the + * possibility of returning some inconsistent value on 32bit arch + * as 64bit update on 32bit is non atomic. Taking care of this + * corner case makes code very complicated, like sending IPIs to + * cpus, taking care of stats of offline cpus etc. * - * Having a reference to blkg under an rcu allows acess to only - * values local to groups like group stats and group rate limits + * reset stats is anyway more of a debug feature and this sounds a + * corner case. So I am not complicating the code yet until and + * unless this becomes a real issue. 
*/ - call_rcu(&blkg->rcu_head, blkg_rcu_free); + for_each_possible_cpu(i) { + stats_cpu = per_cpu_ptr(blkg->stats_cpu, i); + stats_cpu->sectors = 0; + for(j = 0; j < BLKIO_STAT_CPU_NR; j++) + for (k = 0; k < BLKIO_STAT_TOTAL; k++) + stats_cpu->stat_arr_cpu[j][k] = 0; + } } -EXPORT_SYMBOL_GPL(__blkg_release); -static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, - u64 val) +static int +blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) { - struct blkcg *blkcg = cgroup_to_blkcg(cgroup); - struct blkcg_gq *blkg; + struct blkio_cgroup *blkcg; + struct blkio_group *blkg; + struct blkio_group_stats *stats; struct hlist_node *n; + uint64_t queued[BLKIO_STAT_TOTAL]; int i; +#ifdef CONFIG_DEBUG_BLK_CGROUP + bool idling, waiting, empty; + unsigned long long now = sched_clock(); +#endif - mutex_lock(&blkcg_pol_mutex); + blkcg = cgroup_to_blkio_cgroup(cgroup); spin_lock_irq(&blkcg->lock); - - /* - * Note that stat reset is racy - it doesn't synchronize against - * stat updates. This is a debug feature which shouldn't exist - * anyway. If you get hit by a race, retry. - */ hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - - if (blkcg_policy_enabled(blkg->q, pol) && - pol->pd_reset_stats_fn) - pol->pd_reset_stats_fn(blkg); + spin_lock(&blkg->stats_lock); + stats = &blkg->stats; +#ifdef CONFIG_DEBUG_BLK_CGROUP + idling = blkio_blkg_idling(stats); + waiting = blkio_blkg_waiting(stats); + empty = blkio_blkg_empty(stats); +#endif + for (i = 0; i < BLKIO_STAT_TOTAL; i++) + queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i]; + memset(stats, 0, sizeof(struct blkio_group_stats)); + for (i = 0; i < BLKIO_STAT_TOTAL; i++) + stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i]; +#ifdef CONFIG_DEBUG_BLK_CGROUP + if (idling) { + blkio_mark_blkg_idling(stats); + stats->start_idle_time = now; + } + if (waiting) { + blkio_mark_blkg_waiting(stats); + stats->start_group_wait_time = now; } + if (empty) { + blkio_mark_blkg_empty(stats); + stats->start_empty_time = now; + } +#endif + spin_unlock(&blkg->stats_lock); + + /* Reset Per cpu stats which don't take blkg->stats_lock */ + blkio_reset_stats_cpu(blkg); } spin_unlock_irq(&blkcg->lock); - mutex_unlock(&blkcg_pol_mutex); return 0; } -static const char *blkg_dev_name(struct blkcg_gq *blkg) +static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str, + int chars_left, bool diskname_only) { - /* some drivers (floppy) instantiate a queue w/o disk registered */ - if (blkg->q->backing_dev_info.dev) - return dev_name(blkg->q->backing_dev_info.dev); - return NULL; + snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev)); + chars_left -= strlen(str); + if (chars_left <= 0) { + printk(KERN_WARNING + "Possibly incorrect cgroup stat display format"); + return; + } + if (diskname_only) + return; + switch (type) { + case BLKIO_STAT_READ: + strlcat(str, " Read", chars_left); + break; + case BLKIO_STAT_WRITE: + strlcat(str, " Write", chars_left); + break; + case BLKIO_STAT_SYNC: + strlcat(str, " Sync", chars_left); + break; + case BLKIO_STAT_ASYNC: + strlcat(str, " Async", chars_left); + break; + case BLKIO_STAT_TOTAL: + strlcat(str, " Total", chars_left); + break; + default: + strlcat(str, " Invalid", chars_left); + } } -/** - * blkcg_print_blkgs - helper for printing per-blkg data - * @sf: seq_file to print to - * @blkcg: blkcg of interest - * @prfill: fill function to print out a blkg - * @pol: policy in question - * 
@data: data to be passed to @prfill - * @show_total: to print out sum of prfill return values or not - * - * This function invokes @prfill on each blkg of @blkcg if pd for the - * policy specified by @pol exists. @prfill is invoked with @sf, the - * policy data and @data. If @show_total is %true, the sum of the return - * values from @prfill is printed with "Total" label at the end. - * - * This is to be used to construct print functions for - * cftype->read_seq_string method. - */ -void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, - u64 (*prfill)(struct seq_file *, - struct blkg_policy_data *, int), - const struct blkcg_policy *pol, int data, - bool show_total) +static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val, + struct cgroup_map_cb *cb, dev_t dev) { - struct blkcg_gq *blkg; - struct hlist_node *n; - u64 total = 0; - - spin_lock_irq(&blkcg->lock); - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) - if (blkcg_policy_enabled(blkg->q, pol)) - total += prfill(sf, blkg->pd[pol->plid], data); - spin_unlock_irq(&blkcg->lock); - - if (show_total) - seq_printf(sf, "Total %llu\n", (unsigned long long)total); + blkio_get_key_name(0, dev, str, chars_left, true); + cb->fill(cb, str, val); + return val; } -EXPORT_SYMBOL_GPL(blkcg_print_blkgs); -/** - * __blkg_prfill_u64 - prfill helper for a single u64 value - * @sf: seq_file to print to - * @pd: policy private data of interest - * @v: value to print - * - * Print @v to @sf for the device assocaited with @pd. - */ -u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) -{ - const char *dname = blkg_dev_name(pd->blkg); - if (!dname) - return 0; +static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, + enum stat_type_cpu type, enum stat_sub_type sub_type) +{ + int cpu; + struct blkio_group_stats_cpu *stats_cpu; + u64 val = 0, tval; + + for_each_possible_cpu(cpu) { + unsigned int start; + stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu); + + do { + start = u64_stats_fetch_begin(&stats_cpu->syncp); + if (type == BLKIO_STAT_CPU_SECTORS) + tval = stats_cpu->sectors; + else + tval = stats_cpu->stat_arr_cpu[type][sub_type]; + } while(u64_stats_fetch_retry(&stats_cpu->syncp, start)); + + val += tval; + } - seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v); - return v; + return val; } -EXPORT_SYMBOL_GPL(__blkg_prfill_u64); -/** - * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat - * @sf: seq_file to print to - * @pd: policy private data of interest - * @rwstat: rwstat to print - * - * Print @rwstat to @sf for the device assocaited with @pd. 
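blkio_read_stat_cpu() above sums per-CPU counters with the u64_stats_fetch_begin()/u64_stats_fetch_retry() loop: the writer brackets each update with a sequence-count bump, and a reader retries whenever it observed an in-flight or intervening update, which is what makes unlocked 64-bit counters safe on 32-bit hosts. A single-instance userspace analogue of the retry protocol in C11 atomics (per-CPU replication and the for_each_possible_cpu() summation are omitted, and real seqlock fencing is subtler than this sketch):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Sketch of the u64_stats_sync retry protocol (single writer, many readers).
 * Illustrative, not production code. */
static atomic_uint seq;
static _Atomic uint64_t sectors, serviced;

static void writer_account(uint64_t bytes)
{
	atomic_fetch_add_explicit(&seq, 1, memory_order_acq_rel);  /* odd: update in flight */
	atomic_fetch_add_explicit(&sectors, bytes >> 9, memory_order_relaxed);
	atomic_fetch_add_explicit(&serviced, 1, memory_order_relaxed);
	atomic_fetch_add_explicit(&seq, 1, memory_order_release);  /* even: stable */
}

static void reader_snapshot(uint64_t *sec, uint64_t *ios)
{
	unsigned int start;

	do {	/* mirrors u64_stats_fetch_begin()/u64_stats_fetch_retry() */
		start = atomic_load_explicit(&seq, memory_order_acquire);
		*sec = atomic_load_explicit(&sectors, memory_order_relaxed);
		*ios = atomic_load_explicit(&serviced, memory_order_relaxed);
	} while ((start & 1) ||
		 start != atomic_load_explicit(&seq, memory_order_acquire));
}

int main(void)
{
	uint64_t sec, ios;

	writer_account(4096);
	reader_snapshot(&sec, &ios);
	printf("sectors=%llu ios=%llu\n",
	       (unsigned long long)sec, (unsigned long long)ios);
	return 0;
}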
- */ -u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - const struct blkg_rwstat *rwstat) -{ - static const char *rwstr[] = { - [BLKG_RWSTAT_READ] = "Read", - [BLKG_RWSTAT_WRITE] = "Write", - [BLKG_RWSTAT_SYNC] = "Sync", - [BLKG_RWSTAT_ASYNC] = "Async", - }; - const char *dname = blkg_dev_name(pd->blkg); - u64 v; - int i; +static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, + struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type) +{ + uint64_t disk_total, val; + char key_str[MAX_KEY_LEN]; + enum stat_sub_type sub_type; + + if (type == BLKIO_STAT_CPU_SECTORS) { + val = blkio_read_stat_cpu(blkg, type, 0); + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev); + } - if (!dname) - return 0; + for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; + sub_type++) { + blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); + val = blkio_read_stat_cpu(blkg, type, sub_type); + cb->fill(cb, key_str, val); + } - for (i = 0; i < BLKG_RWSTAT_NR; i++) - seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], - (unsigned long long)rwstat->cnt[i]); + disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) + + blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE); - v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE]; - seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); - return v; + blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); + cb->fill(cb, key_str, disk_total); + return disk_total; } -/** - * blkg_prfill_stat - prfill callback for blkg_stat - * @sf: seq_file to print to - * @pd: policy private data of interest - * @off: offset to the blkg_stat in @pd - * - * prfill callback for printing a blkg_stat. - */ -u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off) +/* This should be called with blkg->stats_lock held */ +static uint64_t blkio_get_stat(struct blkio_group *blkg, + struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) { - return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off)); + uint64_t disk_total; + char key_str[MAX_KEY_LEN]; + enum stat_sub_type sub_type; + + if (type == BLKIO_STAT_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.time, cb, dev); +#ifdef CONFIG_DEBUG_BLK_CGROUP + if (type == BLKIO_STAT_UNACCOUNTED_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.unaccounted_time, cb, dev); + if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { + uint64_t sum = blkg->stats.avg_queue_size_sum; + uint64_t samples = blkg->stats.avg_queue_size_samples; + if (samples) + do_div(sum, samples); + else + sum = 0; + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev); + } + if (type == BLKIO_STAT_GROUP_WAIT_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.group_wait_time, cb, dev); + if (type == BLKIO_STAT_IDLE_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.idle_time, cb, dev); + if (type == BLKIO_STAT_EMPTY_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.empty_time, cb, dev); + if (type == BLKIO_STAT_DEQUEUE) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.dequeue, cb, dev); +#endif + + for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; + sub_type++) { + blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); + cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]); + } + disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] + + blkg->stats.stat_arr[type][BLKIO_STAT_WRITE]; + 
blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); + cb->fill(cb, key_str, disk_total); + return disk_total; } -EXPORT_SYMBOL_GPL(blkg_prfill_stat); -/** - * blkg_prfill_rwstat - prfill callback for blkg_rwstat - * @sf: seq_file to print to - * @pd: policy private data of interest - * @off: offset to the blkg_rwstat in @pd - * - * prfill callback for printing a blkg_rwstat. - */ -u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - int off) +static int blkio_policy_parse_and_set(char *buf, + struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid) { - struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off); + struct gendisk *disk = NULL; + char *s[4], *p, *major_s = NULL, *minor_s = NULL; + unsigned long major, minor; + int i = 0, ret = -EINVAL; + int part; + dev_t dev; + u64 temp; + + memset(s, 0, sizeof(s)); + + while ((p = strsep(&buf, " ")) != NULL) { + if (!*p) + continue; - return __blkg_prfill_rwstat(sf, pd, &rwstat); + s[i++] = p; + + /* Prevent from inputing too many things */ + if (i == 3) + break; + } + + if (i != 2) + goto out; + + p = strsep(&s[0], ":"); + if (p != NULL) + major_s = p; + else + goto out; + + minor_s = s[0]; + if (!minor_s) + goto out; + + if (strict_strtoul(major_s, 10, &major)) + goto out; + + if (strict_strtoul(minor_s, 10, &minor)) + goto out; + + dev = MKDEV(major, minor); + + if (strict_strtoull(s[1], 10, &temp)) + goto out; + + /* For rule removal, do not check for device presence. */ + if (temp) { + disk = get_gendisk(dev, &part); + if (!disk || part) { + ret = -ENODEV; + goto out; + } + } + + newpn->dev = dev; + + switch (plid) { + case BLKIO_POLICY_PROP: + if ((temp < BLKIO_WEIGHT_MIN && temp > 0) || + temp > BLKIO_WEIGHT_MAX) + goto out; + + newpn->plid = plid; + newpn->fileid = fileid; + newpn->val.weight = temp; + break; + case BLKIO_POLICY_THROTL: + switch(fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + newpn->plid = plid; + newpn->fileid = fileid; + newpn->val.bps = temp; + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + if (temp > THROTL_IOPS_MAX) + goto out; + + newpn->plid = plid; + newpn->fileid = fileid; + newpn->val.iops = (unsigned int)temp; + break; + } + break; + default: + BUG(); + } + ret = 0; +out: + put_disk(disk); + return ret; } -EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); -/** - * blkg_conf_prep - parse and prepare for per-blkg config update - * @blkcg: target block cgroup - * @pol: target policy - * @input: input string - * @ctx: blkg_conf_ctx to be filled - * - * Parse per-blkg config update from @input and initialize @ctx with the - * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new - * value. This function returns with RCU read lock and queue lock held and - * must be paired with blkg_conf_finish(). 
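blkio_policy_parse_and_set() above accepts writes of the form "major:minor value": strsep() splits on spaces into at most two tokens, the first token splits again on ':', and anything else is rejected. The same tokenizing as a standalone program (glibc strsep(); strict_strtoul()/strict_strtoull() replaced by strtoul()/strtoull(), and the get_gendisk() device-existence check dropped):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Parse "MAJ:MIN VALUE", e.g. "8:16 1048576", the same way
 * blkio_policy_parse_and_set() does; returns -1 on malformed input. */
static int parse_rule(char *buf, unsigned long *major, unsigned long *minor,
		      unsigned long long *val)
{
	char *s[3] = { NULL, NULL, NULL }, *p, *end;
	int i = 0;

	while ((p = strsep(&buf, " ")) != NULL) {
		if (!*p)
			continue;	/* skip runs of spaces */
		s[i++] = p;
		if (i == 3)
			break;		/* too many tokens */
	}
	if (i != 2)
		return -1;

	p = strsep(&s[0], ":");
	if (!p || !s[0])
		return -1;

	errno = 0;
	*major = strtoul(p, &end, 10);
	if (errno || *end)
		return -1;
	*minor = strtoul(s[0], &end, 10);
	if (errno || *end)
		return -1;
	*val = strtoull(s[1], &end, 10);
	if (errno || *end)
		return -1;
	return 0;
}

int main(void)
{
	char buf[] = "8:16 1048576";
	unsigned long maj, min;
	unsigned long long val;

	if (parse_rule(buf, &maj, &min, &val) == 0)
		printf("dev %lu:%lu -> %llu\n", maj, min, val);
	return 0;
}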
- */ -int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - const char *input, struct blkg_conf_ctx *ctx) - __acquires(rcu) __acquires(disk->queue->queue_lock) +unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, + dev_t dev) { - struct gendisk *disk; - struct blkcg_gq *blkg; - unsigned int major, minor; - unsigned long long v; - int part, ret; + struct blkio_policy_node *pn; + unsigned long flags; + unsigned int weight; - if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3) - return -EINVAL; + spin_lock_irqsave(&blkcg->lock, flags); - disk = get_gendisk(MKDEV(major, minor), &part); - if (!disk || part) - return -EINVAL; + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP, + BLKIO_PROP_weight_device); + if (pn) + weight = pn->val.weight; + else + weight = blkcg->weight; - rcu_read_lock(); - spin_lock_irq(disk->queue->queue_lock); + spin_unlock_irqrestore(&blkcg->lock, flags); - if (blkcg_policy_enabled(disk->queue, pol)) - blkg = blkg_lookup_create(blkcg, disk->queue); - else - blkg = ERR_PTR(-EINVAL); + return weight; +} +EXPORT_SYMBOL_GPL(blkcg_get_weight); - if (IS_ERR(blkg)) { - ret = PTR_ERR(blkg); - rcu_read_unlock(); - spin_unlock_irq(disk->queue->queue_lock); - put_disk(disk); - /* - * If queue was bypassing, we should retry. Do so after a - * short msleep(). It isn't strictly necessary but queue - * can be bypassing for some time and it's always nice to - * avoid busy looping. - */ - if (ret == -EBUSY) { - msleep(10); - ret = restart_syscall(); +uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + unsigned long flags; + uint64_t bps = -1; + + spin_lock_irqsave(&blkcg->lock, flags); + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_bps_device); + if (pn) + bps = pn->val.bps; + spin_unlock_irqrestore(&blkcg->lock, flags); + + return bps; +} + +uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + unsigned long flags; + uint64_t bps = -1; + + spin_lock_irqsave(&blkcg->lock, flags); + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_bps_device); + if (pn) + bps = pn->val.bps; + spin_unlock_irqrestore(&blkcg->lock, flags); + + return bps; +} + +unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + unsigned long flags; + unsigned int iops = -1; + + spin_lock_irqsave(&blkcg->lock, flags); + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_iops_device); + if (pn) + iops = pn->val.iops; + spin_unlock_irqrestore(&blkcg->lock, flags); + + return iops; +} + +unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + unsigned long flags; + unsigned int iops = -1; + + spin_lock_irqsave(&blkcg->lock, flags); + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_iops_device); + if (pn) + iops = pn->val.iops; + spin_unlock_irqrestore(&blkcg->lock, flags); + + return iops; +} + +/* Checks whether user asked for deleting a policy rule */ +static bool blkio_delete_rule_command(struct blkio_policy_node *pn) +{ + switch(pn->plid) { + case BLKIO_POLICY_PROP: + if (pn->val.weight == 0) + return 1; + break; + case BLKIO_POLICY_THROTL: + switch(pn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + if (pn->val.bps == 0) + return 1; + break; + case BLKIO_THROTL_read_iops_device: + case 
BLKIO_THROTL_write_iops_device: + if (pn->val.iops == 0) + return 1; } - return ret; + break; + default: + BUG(); } - ctx->disk = disk; - ctx->blkg = blkg; - ctx->v = v; return 0; } -EXPORT_SYMBOL_GPL(blkg_conf_prep); -/** - * blkg_conf_finish - finish up per-blkg config update - * @ctx: blkg_conf_ctx intiailized by blkg_conf_prep() - * - * Finish up after per-blkg config update. This function must be paired - * with blkg_conf_prep(). - */ -void blkg_conf_finish(struct blkg_conf_ctx *ctx) - __releases(ctx->disk->queue->queue_lock) __releases(rcu) +static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, + struct blkio_policy_node *newpn) { - spin_unlock_irq(ctx->disk->queue->queue_lock); - rcu_read_unlock(); - put_disk(ctx->disk); + switch(oldpn->plid) { + case BLKIO_POLICY_PROP: + oldpn->val.weight = newpn->val.weight; + break; + case BLKIO_POLICY_THROTL: + switch(newpn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + oldpn->val.bps = newpn->val.bps; + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + oldpn->val.iops = newpn->val.iops; + } + break; + default: + BUG(); + } } -EXPORT_SYMBOL_GPL(blkg_conf_finish); -struct cftype blkcg_files[] = { - { - .name = "reset_stats", - .write_u64 = blkcg_reset_stats, - }, - { } /* terminate */ -}; +/* + * Some rules/values in blkg have changed. Propagate those to respective + * policies. + */ +static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, struct blkio_policy_node *pn) +{ + unsigned int weight, iops; + u64 bps; + + switch(pn->plid) { + case BLKIO_POLICY_PROP: + weight = pn->val.weight ? pn->val.weight : + blkcg->weight; + blkio_update_group_weight(blkg, weight); + break; + case BLKIO_POLICY_THROTL: + switch(pn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + bps = pn->val.bps ? pn->val.bps : (-1); + blkio_update_group_bps(blkg, bps, pn->fileid); + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + iops = pn->val.iops ? pn->val.iops : (-1); + blkio_update_group_iops(blkg, iops, pn->fileid); + break; + } + break; + default: + BUG(); + } +} -/** - * blkcg_pre_destroy - cgroup pre_destroy callback - * @cgroup: cgroup of interest - * - * This function is called when @cgroup is about to go away and responsible - * for shooting down all blkgs associated with @cgroup. blkgs should be - * removed while holding both q and blkcg locks. As blkcg lock is nested - * inside q lock, this function performs reverse double lock dancing. - * - * This is the blkcg counterpart of ioc_release_fn(). +/* + * A policy node rule has been updated. Propagate this update to all the + * block groups which might be affected by this update. 
*/ -static int blkcg_pre_destroy(struct cgroup *cgroup) +static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg, + struct blkio_policy_node *pn) { - struct blkcg *blkcg = cgroup_to_blkcg(cgroup); + struct blkio_group *blkg; + struct hlist_node *n; + spin_lock(&blkio_list_lock); spin_lock_irq(&blkcg->lock); - while (!hlist_empty(&blkcg->blkg_list)) { - struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first, - struct blkcg_gq, blkcg_node); - struct request_queue *q = blkg->q; - - if (spin_trylock(q->queue_lock)) { - blkg_destroy(blkg); - spin_unlock(q->queue_lock); - } else { - spin_unlock_irq(&blkcg->lock); - cpu_relax(); - spin_lock_irq(&blkcg->lock); - } + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + if (pn->dev != blkg->dev || pn->plid != blkg->plid) + continue; + blkio_update_blkg_policy(blkcg, blkg, pn); } spin_unlock_irq(&blkcg->lock); - return 0; + spin_unlock(&blkio_list_lock); } -static void blkcg_destroy(struct cgroup *cgroup) +static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) { - struct blkcg *blkcg = cgroup_to_blkcg(cgroup); + int ret = 0; + char *buf; + struct blkio_policy_node *newpn, *pn; + struct blkio_cgroup *blkcg; + int keep_newpn = 0; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int fileid = BLKIOFILE_ATTR(cft->private); + + buf = kstrdup(buffer, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + newpn = kzalloc(sizeof(*newpn), GFP_KERNEL); + if (!newpn) { + ret = -ENOMEM; + goto free_buf; + } - if (blkcg != &blkcg_root) - kfree(blkcg); -} + ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid); + if (ret) + goto free_newpn; -static struct cgroup_subsys_state *blkcg_create(struct cgroup *cgroup) -{ - static atomic64_t id_seq = ATOMIC64_INIT(0); - struct blkcg *blkcg; - struct cgroup *parent = cgroup->parent; + blkcg = cgroup_to_blkio_cgroup(cgrp); - if (!parent) { - blkcg = &blkcg_root; - goto done; + spin_lock_irq(&blkcg->lock); + + pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid); + if (!pn) { + if (!blkio_delete_rule_command(newpn)) { + blkio_policy_insert_node(blkcg, newpn); + keep_newpn = 1; + } + spin_unlock_irq(&blkcg->lock); + goto update_io_group; } - blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); - if (!blkcg) - return ERR_PTR(-ENOMEM); + if (blkio_delete_rule_command(newpn)) { + blkio_policy_delete_node(pn); + kfree(pn); + spin_unlock_irq(&blkcg->lock); + goto update_io_group; + } + spin_unlock_irq(&blkcg->lock); - blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT; - blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */ -done: - spin_lock_init(&blkcg->lock); - INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); - INIT_HLIST_HEAD(&blkcg->blkg_list); + blkio_update_policy_rule(pn, newpn); - return &blkcg->css; +update_io_group: + blkio_update_policy_node_blkg(blkcg, newpn); + +free_newpn: + if (!keep_newpn) + kfree(newpn); +free_buf: + kfree(buf); + return ret; } -/** - * blkcg_init_queue - initialize blkcg part of request queue - * @q: request_queue to initialize - * - * Called from blk_alloc_queue_node(). Responsible for initializing blkcg - * part of new request_queue @q. - * - * RETURNS: - * 0 on success, -errno on failure. 
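The write path above (blkiocg_file_write() -> parse -> search -> insert/update/delete) treats a written value of 0 as the deletion command, per blkio_delete_rule_command(). A compact single-threaded model of that insert/update/delete-on-zero behavior over a per-device rule list (the names and the dev encoding are made up for the example, and the blkcg->lock is dropped since there is one thread):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* One rule per dev; writing value 0 deletes the rule, mirroring
 * blkio_delete_rule_command() in the patch. */
struct rule {
	unsigned int dev;
	uint64_t val;
	struct rule *next;
};

static struct rule *rules;

static void rule_write(unsigned int dev, uint64_t val)
{
	struct rule **pp, *r;

	for (pp = &rules; (r = *pp) != NULL; pp = &r->next) {
		if (r->dev != dev)
			continue;
		if (val == 0) {		/* delete the existing rule */
			*pp = r->next;
			free(r);
		} else {		/* update it in place */
			r->val = val;
		}
		return;
	}
	if (val == 0)			/* deleting a non-existent rule: no-op */
		return;
	r = malloc(sizeof(*r));
	r->dev = dev;
	r->val = val;
	r->next = rules;
	rules = r;
}

int main(void)
{
	unsigned int dev = 0x800010;	/* opaque stand-in for dev 8:16 */

	rule_write(dev, 1048576);	/* add:    8:16 -> 1 MB/s */
	rule_write(dev, 2097152);	/* update: 8:16 -> 2 MB/s */
	rule_write(dev, 0);		/* delete: 8:16 */
	printf("rules empty: %s\n", rules ? "no" : "yes");
	return 0;
}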
- */ -int blkcg_init_queue(struct request_queue *q) +static void +blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) { - might_sleep(); - - return blk_throtl_init(q); + switch(pn->plid) { + case BLKIO_POLICY_PROP: + if (pn->fileid == BLKIO_PROP_weight_device) + seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->val.weight); + break; + case BLKIO_POLICY_THROTL: + switch(pn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->val.bps); + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->val.iops); + break; + } + break; + default: + BUG(); + } } -/** - * blkcg_drain_queue - drain blkcg part of request_queue - * @q: request_queue to drain - * - * Called from blk_drain_queue(). Responsible for draining blkcg part. - */ -void blkcg_drain_queue(struct request_queue *q) +/* cgroup files which read their data from policy nodes end up here */ +static void blkio_read_policy_node_files(struct cftype *cft, + struct blkio_cgroup *blkcg, struct seq_file *m) { - lockdep_assert_held(q->queue_lock); - - blk_throtl_drain(q); + struct blkio_policy_node *pn; + + if (!list_empty(&blkcg->policy_list)) { + spin_lock_irq(&blkcg->lock); + list_for_each_entry(pn, &blkcg->policy_list, node) { + if (!pn_matches_cftype(cft, pn)) + continue; + blkio_print_policy_node(m, pn); + } + spin_unlock_irq(&blkcg->lock); + } } -/** - * blkcg_exit_queue - exit and release blkcg part of request_queue - * @q: request_queue being released - * - * Called from blk_release_queue(). Responsible for exiting blkcg part. - */ -void blkcg_exit_queue(struct request_queue *q) +static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *m) { - spin_lock_irq(q->queue_lock); - blkg_destroy_all(q); - spin_unlock_irq(q->queue_lock); + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_weight_device: + blkio_read_policy_node_files(cft, blkcg, m); + return 0; + default: + BUG(); + } + break; + case BLKIO_POLICY_THROTL: + switch(name){ + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + blkio_read_policy_node_files(cft, blkcg, m); + return 0; + default: + BUG(); + } + break; + default: + BUG(); + } - blk_throtl_exit(q); + return 0; } -/* - * We cannot support shared io contexts, as we have no mean to support - * two tasks with the same ioc in two different groups without major rework - * of the main cic data structures. For now we allow a task to change - * its cgroup only if it's the only owner of its ioc. 
- */ -static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, + struct cftype *cft, struct cgroup_map_cb *cb, + enum stat_type type, bool show_total, bool pcpu) { - struct task_struct *task; - struct io_context *ioc; - int ret = 0; + struct blkio_group *blkg; + struct hlist_node *n; + uint64_t cgroup_total = 0; - /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, cgrp, tset) { - task_lock(task); - ioc = task->io_context; - if (ioc && atomic_read(&ioc->nr_tasks) > 1) - ret = -EINVAL; - task_unlock(task); - if (ret) - break; + rcu_read_lock(); + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { + if (blkg->dev) { + if (!cftype_blkg_same_policy(cft, blkg)) + continue; + if (pcpu) + cgroup_total += blkio_get_stat_cpu(blkg, cb, + blkg->dev, type); + else { + spin_lock_irq(&blkg->stats_lock); + cgroup_total += blkio_get_stat(blkg, cb, + blkg->dev, type); + spin_unlock_irq(&blkg->stats_lock); + } + } } - return ret; + if (show_total) + cb->fill(cb, "Total", cgroup_total); + rcu_read_unlock(); + return 0; } -struct cgroup_subsys blkio_subsys = { - .name = "blkio", - .create = blkcg_create, - .can_attach = blkcg_can_attach, - .pre_destroy = blkcg_pre_destroy, - .destroy = blkcg_destroy, - .subsys_id = blkio_subsys_id, - .base_cftypes = blkcg_files, - .module = THIS_MODULE, -}; -EXPORT_SYMBOL_GPL(blkio_subsys); - -/** - * blkcg_activate_policy - activate a blkcg policy on a request_queue - * @q: request_queue of interest - * @pol: blkcg policy to activate - * - * Activate @pol on @q. Requires %GFP_KERNEL context. @q goes through - * bypass mode to populate its blkgs with policy_data for @pol. - * - * Activation happens with @q bypassed, so nobody would be accessing blkgs - * from IO path. Update of each blkg is protected by both queue and blkcg - * locks so that holding either lock and testing blkcg_policy_enabled() is - * always enough for dereferencing policy data. - * - * The caller is responsible for synchronizing [de]activations and policy - * [un]registerations. Returns 0 on success, -errno on failure. 
- */ -int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol) +/* All map kind of cgroup file get serviced by this function */ +static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) { - LIST_HEAD(pds); - struct blkcg_gq *blkg; - struct blkg_policy_data *pd, *n; - int cnt = 0, ret; + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_TIME, 0, 0); + case BLKIO_PROP_sectors: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_CPU_SECTORS, 0, 1); + case BLKIO_PROP_io_service_bytes: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); + case BLKIO_PROP_io_serviced: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_CPU_SERVICED, 1, 1); + case BLKIO_PROP_io_service_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICE_TIME, 1, 0); + case BLKIO_PROP_io_wait_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_WAIT_TIME, 1, 0); + case BLKIO_PROP_io_merged: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_CPU_MERGED, 1, 1); + case BLKIO_PROP_io_queued: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_QUEUED, 1, 0); +#ifdef CONFIG_DEBUG_BLK_CGROUP + case BLKIO_PROP_unaccounted_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_UNACCOUNTED_TIME, 0, 0); + case BLKIO_PROP_dequeue: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_DEQUEUE, 0, 0); + case BLKIO_PROP_avg_queue_size: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0); + case BLKIO_PROP_group_wait_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_GROUP_WAIT_TIME, 0, 0); + case BLKIO_PROP_idle_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_IDLE_TIME, 0, 0); + case BLKIO_PROP_empty_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_EMPTY_TIME, 0, 0); +#endif + default: + BUG(); + } + break; + case BLKIO_POLICY_THROTL: + switch(name){ + case BLKIO_THROTL_io_service_bytes: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); + case BLKIO_THROTL_io_serviced: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_CPU_SERVICED, 1, 1); + default: + BUG(); + } + break; + default: + BUG(); + } - if (blkcg_policy_enabled(q, pol)) - return 0; + return 0; +} - blk_queue_bypass_start(q); +static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val) +{ + struct blkio_group *blkg; + struct hlist_node *n; + struct blkio_policy_node *pn; - /* make sure the root blkg exists and count the existing blkgs */ - spin_lock_irq(q->queue_lock); + if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) + return -EINVAL; - rcu_read_lock(); - blkg = __blkg_lookup_create(&blkcg_root, q); - rcu_read_unlock(); + spin_lock(&blkio_list_lock); + spin_lock_irq(&blkcg->lock); + blkcg->weight = (unsigned int)val; - if (IS_ERR(blkg)) { - ret = PTR_ERR(blkg); - goto out_unlock; + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + pn = blkio_policy_search_node(blkcg, blkg->dev, + BLKIO_POLICY_PROP, BLKIO_PROP_weight_device); + if (pn) + continue; + + blkio_update_group_weight(blkg, blkcg->weight); } - q->root_blkg = blkg; + spin_unlock_irq(&blkcg->lock); + 
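blkcg_activate_policy() above is a textbook two-phase allocation: count the blkgs under the queue lock, drop the lock to kzalloc() with GFP_KERNEL (sleeping allocations are forbidden under a spinlock), then retake the lock and install the preallocated policy data, relying on queue bypass to keep the set from growing in between. The same shape in a userspace pthread sketch (assumed names, error handling trimmed):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int nr_items = 3;	/* stand-in for the length of q->blkg_list */

struct pd { struct pd *next; };

static int activate(void)
{
	struct pd *pds = NULL, *pd;
	int cnt, i;

	/* Pass 1: count under the lock, no allocation here. */
	pthread_mutex_lock(&lock);
	cnt = nr_items;
	pthread_mutex_unlock(&lock);

	/* Pass 2: allocate outside the lock, where sleeping or failing is fine. */
	for (i = 0; i < cnt; i++) {
		pd = calloc(1, sizeof(*pd));
		if (!pd)
			goto fail;
		pd->next = pds;
		pds = pd;
	}

	/* Pass 3: retake the lock and install; the set was prevented from
	 * growing in between (queue bypass plays that role in the patch). */
	pthread_mutex_lock(&lock);
	/* ... attach one pd per item here ... */
	pthread_mutex_unlock(&lock);
	return 0;
fail:
	while ((pd = pds) != NULL) {
		pds = pd->next;
		free(pd);
	}
	return -1;
}

int main(void)
{
	printf("activate: %d\n", activate());
	return 0;
}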
spin_unlock(&blkio_list_lock); + return 0; +} - list_for_each_entry(blkg, &q->blkg_list, q_node) - cnt++; +static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) { + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); - spin_unlock_irq(q->queue_lock); + blkcg = cgroup_to_blkio_cgroup(cgrp); - /* allocate policy_data for all existing blkgs */ - while (cnt--) { - pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); - if (!pd) { - ret = -ENOMEM; - goto out_free; + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_weight: + return (u64)blkcg->weight; } - list_add_tail(&pd->alloc_node, &pds); + break; + default: + BUG(); } + return 0; +} - /* - * Install the allocated pds. With @q bypassing, no new blkg - * should have been created while the queue lock was dropped. - */ - spin_lock_irq(q->queue_lock); +static int +blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) +{ + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); - list_for_each_entry(blkg, &q->blkg_list, q_node) { - if (WARN_ON(list_empty(&pds))) { - /* umm... this shouldn't happen, just abort */ - ret = -ENOMEM; - goto out_unlock; + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_weight: + return blkio_weight_write(blkcg, val); } - pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node); - list_del_init(&pd->alloc_node); + break; + default: + BUG(); + } + + return 0; +} + +struct cftype blkio_files[] = { + { + .name = "weight_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_weight_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + { + .name = "weight", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_weight), + .read_u64 = blkiocg_file_read_u64, + .write_u64 = blkiocg_file_write_u64, + }, + { + .name = "time", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_time), + .read_map = blkiocg_file_read_map, + }, + { + .name = "sectors", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_sectors), + .read_map = blkiocg_file_read_map, + }, + { + .name = "io_service_bytes", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_service_bytes), + .read_map = blkiocg_file_read_map, + }, + { + .name = "io_serviced", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_serviced), + .read_map = blkiocg_file_read_map, + }, + { + .name = "io_service_time", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_service_time), + .read_map = blkiocg_file_read_map, + }, + { + .name = "io_wait_time", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_wait_time), + .read_map = blkiocg_file_read_map, + }, + { + .name = "io_merged", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_merged), + .read_map = blkiocg_file_read_map, + }, + { + .name = "io_queued", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_queued), + .read_map = blkiocg_file_read_map, + }, + { + .name = "reset_stats", + .write_u64 = blkiocg_reset_stats, + }, +#ifdef CONFIG_BLK_DEV_THROTTLING + { + .name = "throttle.read_bps_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_bps_device), + .read_seq_string = blkiocg_file_read, + .write_string = 
blkiocg_file_write, + .max_write_len = 256, + }, - /* grab blkcg lock too while installing @pd on @blkg */ - spin_lock(&blkg->blkcg->lock); + { + .name = "throttle.write_bps_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_bps_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, - blkg->pd[pol->plid] = pd; - pd->blkg = blkg; - pol->pd_init_fn(blkg); + { + .name = "throttle.read_iops_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_iops_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, - spin_unlock(&blkg->blkcg->lock); - } + { + .name = "throttle.write_iops_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_iops_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + { + .name = "throttle.io_service_bytes", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_io_service_bytes), + .read_map = blkiocg_file_read_map, + }, + { + .name = "throttle.io_serviced", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_io_serviced), + .read_map = blkiocg_file_read_map, + }, +#endif /* CONFIG_BLK_DEV_THROTTLING */ - __set_bit(pol->plid, q->blkcg_pols); - ret = 0; -out_unlock: - spin_unlock_irq(q->queue_lock); -out_free: - blk_queue_bypass_end(q); - list_for_each_entry_safe(pd, n, &pds, alloc_node) - kfree(pd); - return ret; -} -EXPORT_SYMBOL_GPL(blkcg_activate_policy); +#ifdef CONFIG_DEBUG_BLK_CGROUP + { + .name = "avg_queue_size", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_avg_queue_size), + .read_map = blkiocg_file_read_map, + }, + { + .name = "group_wait_time", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_group_wait_time), + .read_map = blkiocg_file_read_map, + }, + { + .name = "idle_time", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_idle_time), + .read_map = blkiocg_file_read_map, + }, + { + .name = "empty_time", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_empty_time), + .read_map = blkiocg_file_read_map, + }, + { + .name = "dequeue", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_dequeue), + .read_map = blkiocg_file_read_map, + }, + { + .name = "unaccounted_time", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_unaccounted_time), + .read_map = blkiocg_file_read_map, + }, +#endif + { } /* terminate */ +}; -/** - * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue - * @q: request_queue of interest - * @pol: blkcg policy to deactivate - * - * Deactivate @pol on @q. Follows the same synchronization rules as - * blkcg_activate_policy(). 
- */ -void blkcg_deactivate_policy(struct request_queue *q, - const struct blkcg_policy *pol) +static void blkiocg_destroy(struct cgroup *cgroup) { - struct blkcg_gq *blkg; + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); + unsigned long flags; + struct blkio_group *blkg; + void *key; + struct blkio_policy_type *blkiop; + struct blkio_policy_node *pn, *pntmp; - if (!blkcg_policy_enabled(q, pol)) - return; + rcu_read_lock(); + do { + spin_lock_irqsave(&blkcg->lock, flags); + + if (hlist_empty(&blkcg->blkg_list)) { + spin_unlock_irqrestore(&blkcg->lock, flags); + break; + } - blk_queue_bypass_start(q); - spin_lock_irq(q->queue_lock); + blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, + blkcg_node); + key = rcu_dereference(blkg->key); + __blkiocg_del_blkio_group(blkg); - __clear_bit(pol->plid, q->blkcg_pols); + spin_unlock_irqrestore(&blkcg->lock, flags); - /* if no policy is left, no need for blkgs - shoot them down */ - if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS)) - blkg_destroy_all(q); + /* + * This blkio_group is being unlinked as associated cgroup is + * going away. Let all the IO controlling policies know about + * this event. + */ + spin_lock(&blkio_list_lock); + list_for_each_entry(blkiop, &blkio_list, list) { + if (blkiop->plid != blkg->plid) + continue; + blkiop->ops.blkio_unlink_group_fn(key, blkg); + } + spin_unlock(&blkio_list_lock); + } while (1); - list_for_each_entry(blkg, &q->blkg_list, q_node) { - /* grab blkcg lock too while removing @pd from @blkg */ - spin_lock(&blkg->blkcg->lock); + list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) { + blkio_policy_delete_node(pn); + kfree(pn); + } - if (pol->pd_exit_fn) - pol->pd_exit_fn(blkg); + free_css_id(&blkio_subsys, &blkcg->css); + rcu_read_unlock(); + if (blkcg != &blkio_root_cgroup) + kfree(blkcg); +} - kfree(blkg->pd[pol->plid]); - blkg->pd[pol->plid] = NULL; +static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup) +{ + struct blkio_cgroup *blkcg; + struct cgroup *parent = cgroup->parent; - spin_unlock(&blkg->blkcg->lock); + if (!parent) { + blkcg = &blkio_root_cgroup; + goto done; } - spin_unlock_irq(q->queue_lock); - blk_queue_bypass_end(q); + blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); + if (!blkcg) + return ERR_PTR(-ENOMEM); + + blkcg->weight = BLKIO_WEIGHT_DEFAULT; +done: + spin_lock_init(&blkcg->lock); + INIT_HLIST_HEAD(&blkcg->blkg_list); + + INIT_LIST_HEAD(&blkcg->policy_list); + return &blkcg->css; } -EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); -/** - * blkcg_policy_register - register a blkcg policy - * @pol: blkcg policy to register - * - * Register @pol with blkcg core. Might sleep and @pol may be modified on - * successful registration. Returns 0 on success and -errno on failure. +/* + * We cannot support shared io contexts, as we have no means to support + * two tasks with the same ioc in two different groups without major rework + * of the main cic data structures. For now we allow a task to change + * its cgroup only if it's the only owner of its ioc.
*/ -int blkcg_policy_register(struct blkcg_policy *pol) +static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { - int i, ret; - - if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data))) - return -EINVAL; - - mutex_lock(&blkcg_pol_mutex); + struct task_struct *task; + struct io_context *ioc; + int ret = 0; - /* find an empty slot */ - ret = -ENOSPC; - for (i = 0; i < BLKCG_MAX_POLS; i++) - if (!blkcg_policy[i]) + /* task_lock() is needed to avoid races with exit_io_context() */ + cgroup_taskset_for_each(task, cgrp, tset) { + task_lock(task); + ioc = task->io_context; + if (ioc && atomic_read(&ioc->nr_tasks) > 1) + ret = -EINVAL; + task_unlock(task); + if (ret) break; - if (i >= BLKCG_MAX_POLS) - goto out_unlock; + } + return ret; +} - /* register and update blkgs */ - pol->plid = i; - blkcg_policy[i] = pol; +static void blkiocg_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct io_context *ioc; - /* everything is in place, add intf files for the new policy */ - if (pol->cftypes) - WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes)); - ret = 0; -out_unlock: - mutex_unlock(&blkcg_pol_mutex); - return ret; + cgroup_taskset_for_each(task, cgrp, tset) { + /* we don't lose anything even if ioc allocation fails */ + ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); + if (ioc) { + ioc_cgroup_changed(ioc); + put_io_context(ioc); + } + } } -EXPORT_SYMBOL_GPL(blkcg_policy_register); -/** - * blkcg_policy_unregister - unregister a blkcg policy - * @pol: blkcg policy to unregister - * - * Undo blkcg_policy_register(@pol). Might sleep. - */ -void blkcg_policy_unregister(struct blkcg_policy *pol) +struct cgroup_subsys blkio_subsys = { + .name = "blkio", + .create = blkiocg_create, + .can_attach = blkiocg_can_attach, + .attach = blkiocg_attach, + .destroy = blkiocg_destroy, +#ifdef CONFIG_BLK_CGROUP + /* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */ + .subsys_id = blkio_subsys_id, +#endif + .base_cftypes = blkio_files, + .use_id = 1, + .module = THIS_MODULE, +}; +EXPORT_SYMBOL_GPL(blkio_subsys); + +void blkio_policy_register(struct blkio_policy_type *blkiop) { - mutex_lock(&blkcg_pol_mutex); + spin_lock(&blkio_list_lock); + list_add_tail(&blkiop->list, &blkio_list); + spin_unlock(&blkio_list_lock); +} +EXPORT_SYMBOL_GPL(blkio_policy_register); - if (WARN_ON(blkcg_policy[pol->plid] != pol)) - goto out_unlock; +void blkio_policy_unregister(struct blkio_policy_type *blkiop) +{ + spin_lock(&blkio_list_lock); + list_del_init(&blkiop->list); + spin_unlock(&blkio_list_lock); +} +EXPORT_SYMBOL_GPL(blkio_policy_unregister); - /* kill the intf files first */ - if (pol->cftypes) - cgroup_rm_cftypes(&blkio_subsys, pol->cftypes); +static int __init init_cgroup_blkio(void) +{ + return cgroup_load_subsys(&blkio_subsys); +} - /* unregister and update blkgs */ - blkcg_policy[pol->plid] = NULL; -out_unlock: - mutex_unlock(&blkcg_pol_mutex); +static void __exit exit_cgroup_blkio(void) +{ + cgroup_unload_subsys(&blkio_subsys); } -EXPORT_SYMBOL_GPL(blkcg_policy_unregister); + +module_init(init_cgroup_blkio); +module_exit(exit_cgroup_blkio); +MODULE_LICENSE("GPL"); diff --git a/trunk/block/blk-cgroup.h b/trunk/block/blk-cgroup.h index 8ac457ce7783..6f3ace7e792f 100644 --- a/trunk/block/blk-cgroup.h +++ b/trunk/block/blk-cgroup.h @@ -15,371 +15,350 @@ #include #include -#include -#include + +enum blkio_policy_id { + BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */ + BLKIO_POLICY_THROTL, /* Throttling */ +}; /* 
Max limits for throttle policy */ #define THROTL_IOPS_MAX UINT_MAX -/* CFQ specific, out here for blkcg->cfq_weight */ -#define CFQ_WEIGHT_MIN 10 -#define CFQ_WEIGHT_MAX 1000 -#define CFQ_WEIGHT_DEFAULT 500 - -#ifdef CONFIG_BLK_CGROUP - -enum blkg_rwstat_type { - BLKG_RWSTAT_READ, - BLKG_RWSTAT_WRITE, - BLKG_RWSTAT_SYNC, - BLKG_RWSTAT_ASYNC, - - BLKG_RWSTAT_NR, - BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, +#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) + +#ifndef CONFIG_BLK_CGROUP +/* When blk-cgroup is a module, its subsys_id isn't a compile-time constant */ +extern struct cgroup_subsys blkio_subsys; +#define blkio_subsys_id blkio_subsys.subsys_id +#endif + +enum stat_type { + /* Total time spent (in ns) between request dispatch to the driver and + * request completion for IOs done by this cgroup. This may not be + * accurate when NCQ is turned on. */ + BLKIO_STAT_SERVICE_TIME = 0, + /* Total time spent waiting in scheduler queue in ns */ + BLKIO_STAT_WAIT_TIME, + /* Number of IOs queued up */ + BLKIO_STAT_QUEUED, + /* All the single valued stats go below this */ + BLKIO_STAT_TIME, +#ifdef CONFIG_DEBUG_BLK_CGROUP + /* Time not charged to this cgroup */ + BLKIO_STAT_UNACCOUNTED_TIME, + BLKIO_STAT_AVG_QUEUE_SIZE, + BLKIO_STAT_IDLE_TIME, + BLKIO_STAT_EMPTY_TIME, + BLKIO_STAT_GROUP_WAIT_TIME, + BLKIO_STAT_DEQUEUE +#endif }; -struct blkcg_gq; - -struct blkcg { - struct cgroup_subsys_state css; - spinlock_t lock; - - struct radix_tree_root blkg_tree; - struct blkcg_gq *blkg_hint; - struct hlist_head blkg_list; - - /* for policies to test whether associated blkcg has changed */ - uint64_t id; - - /* TODO: per-policy storage in blkcg */ - unsigned int cfq_weight; /* belongs to cfq */ +/* Per cpu stats */ +enum stat_type_cpu { + BLKIO_STAT_CPU_SECTORS, + /* Total bytes transferred */ + BLKIO_STAT_CPU_SERVICE_BYTES, + /* Total IOs serviced, post merge */ + BLKIO_STAT_CPU_SERVICED, + /* Number of IOs merged */ + BLKIO_STAT_CPU_MERGED, + BLKIO_STAT_CPU_NR }; -struct blkg_stat { - struct u64_stats_sync syncp; - uint64_t cnt; +enum stat_sub_type { + BLKIO_STAT_READ = 0, + BLKIO_STAT_WRITE, + BLKIO_STAT_SYNC, + BLKIO_STAT_ASYNC, + BLKIO_STAT_TOTAL }; -struct blkg_rwstat { - struct u64_stats_sync syncp; - uint64_t cnt[BLKG_RWSTAT_NR]; +/* blkg state flags */ +enum blkg_state_flags { + BLKG_waiting = 0, + BLKG_idling, + BLKG_empty, }; -/* - * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a - * request_queue (q). This is used by blkcg policies which need to track - * information per blkcg - q pair. - * - * There can be multiple active blkcg policies and each has its private - * data on each blkg, the size of which is determined by - * blkcg_policy->pd_size. blkcg core allocates and frees such areas - * together with blkg and invokes pd_init/exit_fn() methods. - * - * Such private data must embed struct blkg_policy_data (pd) at the - * beginning and pd_size can't be smaller than pd.
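For illustration, a minimal sketch of the embedding convention the removed comment above describes. The "foo" struct and helper names are invented for the example; only the leading pd member, .pd_size, and the container_of() recovery come from the patch (blk-throttle's pd_to_tg() later in this series uses the same shape):

struct foo_grp {
	/* must be first so blkcg core can allocate pd_size bytes as one unit */
	struct blkg_policy_data pd;
	/* policy-private state follows */
	unsigned int foo_weight;
};

static inline struct foo_grp *pd_to_foo(struct blkg_policy_data *pd)
{
	/* recover the containing policy struct from the embedded pd */
	return pd ? container_of(pd, struct foo_grp, pd) : NULL;
}

With .pd_size = sizeof(struct foo_grp) in struct blkcg_policy, the core can allocate and free the whole object while the policy keeps type-safe access to its own fields.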
- */ -struct blkg_policy_data { - /* the blkg this per-policy data belongs to */ - struct blkcg_gq *blkg; - - /* used during policy activation */ - struct list_head alloc_node; +/* cgroup files owned by proportional weight policy */ +enum blkcg_file_name_prop { + BLKIO_PROP_weight = 1, + BLKIO_PROP_weight_device, + BLKIO_PROP_io_service_bytes, + BLKIO_PROP_io_serviced, + BLKIO_PROP_time, + BLKIO_PROP_sectors, + BLKIO_PROP_unaccounted_time, + BLKIO_PROP_io_service_time, + BLKIO_PROP_io_wait_time, + BLKIO_PROP_io_merged, + BLKIO_PROP_io_queued, + BLKIO_PROP_avg_queue_size, + BLKIO_PROP_group_wait_time, + BLKIO_PROP_idle_time, + BLKIO_PROP_empty_time, + BLKIO_PROP_dequeue, }; -/* association between a blk cgroup and a request queue */ -struct blkcg_gq { - /* Pointer to the associated request_queue */ - struct request_queue *q; - struct list_head q_node; - struct hlist_node blkcg_node; - struct blkcg *blkcg; - /* reference count */ - int refcnt; - - struct blkg_policy_data *pd[BLKCG_MAX_POLS]; - - struct rcu_head rcu_head; +/* cgroup files owned by throttle policy */ +enum blkcg_file_name_throtl { + BLKIO_THROTL_read_bps_device, + BLKIO_THROTL_write_bps_device, + BLKIO_THROTL_read_iops_device, + BLKIO_THROTL_write_iops_device, + BLKIO_THROTL_io_service_bytes, + BLKIO_THROTL_io_serviced, }; -typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); - -struct blkcg_policy { - int plid; - /* policy specific private data size */ - size_t pd_size; - /* cgroup files for the policy */ - struct cftype *cftypes; - - /* operations */ - blkcg_pol_init_pd_fn *pd_init_fn; - blkcg_pol_exit_pd_fn *pd_exit_fn; - blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; +struct blkio_cgroup { + struct cgroup_subsys_state css; + unsigned int weight; + spinlock_t lock; + struct hlist_head blkg_list; + struct list_head policy_list; /* list of blkio_policy_node */ }; -extern struct blkcg blkcg_root; - -struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup); -struct blkcg *bio_blkcg(struct bio *bio); -struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); -struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q); -int blkcg_init_queue(struct request_queue *q); -void blkcg_drain_queue(struct request_queue *q); -void blkcg_exit_queue(struct request_queue *q); - -/* Blkio controller policy registration */ -int blkcg_policy_register(struct blkcg_policy *pol); -void blkcg_policy_unregister(struct blkcg_policy *pol); -int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol); -void blkcg_deactivate_policy(struct request_queue *q, - const struct blkcg_policy *pol); - -void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, - u64 (*prfill)(struct seq_file *, - struct blkg_policy_data *, int), - const struct blkcg_policy *pol, int data, - bool show_total); -u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); -u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - const struct blkg_rwstat *rwstat); -u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); -u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - int off); - -struct blkg_conf_ctx { - struct gendisk *disk; - struct blkcg_gq *blkg; - u64 v; +struct blkio_group_stats { + /* total disk time and nr sectors dispatched by this group */ + uint64_t time; + uint64_t 
stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; +#ifdef CONFIG_DEBUG_BLK_CGROUP + /* Time not charged to this cgroup */ + uint64_t unaccounted_time; + + /* Sum of number of IOs queued across all samples */ + uint64_t avg_queue_size_sum; + /* Count of samples taken for average */ + uint64_t avg_queue_size_samples; + /* How many times this group has been removed from service tree */ + unsigned long dequeue; + + /* Total time spent waiting for it to be assigned a timeslice. */ + uint64_t group_wait_time; + uint64_t start_group_wait_time; + + /* Time spent idling for this blkio_group */ + uint64_t idle_time; + uint64_t start_idle_time; + /* + * Total time when we have requests queued and do not contain the + * current active queue. + */ + uint64_t empty_time; + uint64_t start_empty_time; + uint16_t flags; +#endif }; -int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - const char *input, struct blkg_conf_ctx *ctx); -void blkg_conf_finish(struct blkg_conf_ctx *ctx); - - -/** - * blkg_to_pdata - get policy private data - * @blkg: blkg of interest - * @pol: policy of interest - * - * Return pointer to private data associated with the @blkg-@pol pair. - */ -static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, - struct blkcg_policy *pol) -{ - return blkg ? blkg->pd[pol->plid] : NULL; -} - -/** - * pdata_to_blkg - get blkg associated with policy private data - * @pd: policy private data of interest - * - * @pd is policy private data. Determine the blkg it's associated with. - */ -static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) -{ - return pd ? pd->blkg : NULL; -} - -/** - * blkg_path - format cgroup path of blkg - * @blkg: blkg of interest - * @buf: target buffer - * @buflen: target buffer length - * - * Format the path of the cgroup of @blkg into @buf. - */ -static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) -{ - int ret; - - rcu_read_lock(); - ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); - rcu_read_unlock(); - if (ret) - strncpy(buf, "", buflen); - return ret; -} - -/** - * blkg_get - get a blkg reference - * @blkg: blkg to get - * - * The caller should be holding queue_lock and an existing reference. - */ -static inline void blkg_get(struct blkcg_gq *blkg) -{ - lockdep_assert_held(blkg->q->queue_lock); - WARN_ON_ONCE(!blkg->refcnt); - blkg->refcnt++; -} - -void __blkg_release(struct blkcg_gq *blkg); - -/** - * blkg_put - put a blkg reference - * @blkg: blkg to put - * - * The caller should be holding queue_lock. - */ -static inline void blkg_put(struct blkcg_gq *blkg) -{ - lockdep_assert_held(blkg->q->queue_lock); - WARN_ON_ONCE(blkg->refcnt <= 0); - if (!--blkg->refcnt) - __blkg_release(blkg); -} - -/** - * blkg_stat_add - add a value to a blkg_stat - * @stat: target blkg_stat - * @val: value to add - * - * Add @val to @stat. The caller is responsible for synchronizing calls to - * this function. - */ -static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) -{ - u64_stats_update_begin(&stat->syncp); - stat->cnt += val; - u64_stats_update_end(&stat->syncp); -} - -/** - * blkg_stat_read - read the current value of a blkg_stat - * @stat: blkg_stat to read - * - * Read the current value of @stat. This function can be called without - * synchroniztion and takes care of u64 atomicity. 
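As an aside, the blkg_stat/blkg_rwstat helpers above and below rely on the u64_stats_sync pattern; a self-contained sketch of it follows (the demo_* names are illustrative, not from the patch):

#include <linux/u64_stats_sync.h>

struct demo_counter {
	struct u64_stats_sync syncp;
	u64 cnt;
};

static void demo_add(struct demo_counter *c, u64 val)
{
	u64_stats_update_begin(&c->syncp);	/* writer; caller serializes updaters */
	c->cnt += val;
	u64_stats_update_end(&c->syncp);
}

static u64 demo_read(struct demo_counter *c)
{
	unsigned int start;
	u64 v;

	do {
		start = u64_stats_fetch_begin(&c->syncp);
		v = c->cnt;		/* a 64-bit load may tear on 32-bit... */
	} while (u64_stats_fetch_retry(&c->syncp, start));	/* ...so retry */
	return v;
}

On 64-bit kernels the begin/fetch/retry calls compile away to plain loads and stores; on 32-bit SMP they expand to a seqcount, so the reader loops until it observes a consistent value without ever taking a lock.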
- */ -static inline uint64_t blkg_stat_read(struct blkg_stat *stat) -{ - unsigned int start; - uint64_t v; - - do { - start = u64_stats_fetch_begin(&stat->syncp); - v = stat->cnt; - } while (u64_stats_fetch_retry(&stat->syncp, start)); - - return v; -} - -/** - * blkg_stat_reset - reset a blkg_stat - * @stat: blkg_stat to reset - */ -static inline void blkg_stat_reset(struct blkg_stat *stat) -{ - stat->cnt = 0; -} - -/** - * blkg_rwstat_add - add a value to a blkg_rwstat - * @rwstat: target blkg_rwstat - * @rw: mask of REQ_{WRITE|SYNC} - * @val: value to add - * - * Add @val to @rwstat. The counters are chosen according to @rw. The - * caller is responsible for synchronizing calls to this function. - */ -static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, - int rw, uint64_t val) -{ - u64_stats_update_begin(&rwstat->syncp); - - if (rw & REQ_WRITE) - rwstat->cnt[BLKG_RWSTAT_WRITE] += val; - else - rwstat->cnt[BLKG_RWSTAT_READ] += val; - if (rw & REQ_SYNC) - rwstat->cnt[BLKG_RWSTAT_SYNC] += val; - else - rwstat->cnt[BLKG_RWSTAT_ASYNC] += val; - - u64_stats_update_end(&rwstat->syncp); -} +/* Per cpu blkio group stats */ +struct blkio_group_stats_cpu { + uint64_t sectors; + uint64_t stat_arr_cpu[BLKIO_STAT_CPU_NR][BLKIO_STAT_TOTAL]; + struct u64_stats_sync syncp; +}; -/** - * blkg_rwstat_read - read the current values of a blkg_rwstat - * @rwstat: blkg_rwstat to read - * - * Read the current snapshot of @rwstat and return it as the return value. - * This function can be called without synchronization and takes care of - * u64 atomicity. - */ -static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) -{ - unsigned int start; - struct blkg_rwstat tmp; +struct blkio_group { + /* An rcu protected unique identifier for the group */ + void *key; + struct hlist_node blkcg_node; + unsigned short blkcg_id; + /* Store cgroup path */ + char path[128]; + /* The device MKDEV(major, minor), this group has been created for */ + dev_t dev; + /* policy which owns this blk group */ + enum blkio_policy_id plid; + + /* Need to serialize the stats in the case of reset/update */ + spinlock_t stats_lock; + struct blkio_group_stats stats; + /* Per cpu stats pointer */ + struct blkio_group_stats_cpu __percpu *stats_cpu; +}; - do { - start = u64_stats_fetch_begin(&rwstat->syncp); - tmp = *rwstat; - } while (u64_stats_fetch_retry(&rwstat->syncp, start)); +struct blkio_policy_node { + struct list_head node; + dev_t dev; + /* This node belongs to max bw policy or proportional weight policy */ + enum blkio_policy_id plid; + /* cgroup file to which this rule belongs */ + int fileid; + + union { + unsigned int weight; + /* + * Rate read/write in terms of bytes per second + * Whether this rate represents read or write is determined + * by file type "fileid".
+ */ + u64 bps; + unsigned int iops; + } val; +}; - return tmp; -} +extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, + dev_t dev); +extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, + dev_t dev); +extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, + dev_t dev); +extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, + dev_t dev); +extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, + dev_t dev); + +typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); + +typedef void (blkio_update_group_weight_fn) (void *key, + struct blkio_group *blkg, unsigned int weight); +typedef void (blkio_update_group_read_bps_fn) (void * key, + struct blkio_group *blkg, u64 read_bps); +typedef void (blkio_update_group_write_bps_fn) (void *key, + struct blkio_group *blkg, u64 write_bps); +typedef void (blkio_update_group_read_iops_fn) (void *key, + struct blkio_group *blkg, unsigned int read_iops); +typedef void (blkio_update_group_write_iops_fn) (void *key, + struct blkio_group *blkg, unsigned int write_iops); + +struct blkio_policy_ops { + blkio_unlink_group_fn *blkio_unlink_group_fn; + blkio_update_group_weight_fn *blkio_update_group_weight_fn; + blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn; + blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn; + blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn; + blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn; +}; -/** - * blkg_rwstat_sum - read the total count of a blkg_rwstat - * @rwstat: blkg_rwstat to read - * - * Return the total count of @rwstat regardless of the IO direction. This - * function can be called without synchronization and takes care of u64 - * atomicity. - */ -static inline uint64_t blkg_rwstat_sum(struct blkg_rwstat *rwstat) -{ - struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); +struct blkio_policy_type { + struct list_head list; + struct blkio_policy_ops ops; + enum blkio_policy_id plid; +}; - return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; -} +/* Blkio controller policy registration */ +extern void blkio_policy_register(struct blkio_policy_type *); +extern void blkio_policy_unregister(struct blkio_policy_type *); -/** - * blkg_rwstat_reset - reset a blkg_rwstat - * @rwstat: blkg_rwstat to reset - */ -static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) +static inline char *blkg_path(struct blkio_group *blkg) { - memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); + return blkg->path; } -#else /* CONFIG_BLK_CGROUP */ +#else -struct cgroup; - -struct blkg_policy_data { +struct blkio_group { }; -struct blkcg_gq { +struct blkio_policy_type { }; -struct blkcg_policy { -}; - -static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; } -static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } -static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } -static inline int blkcg_init_queue(struct request_queue *q) { return 0; } -static inline void blkcg_drain_queue(struct request_queue *q) { } -static inline void blkcg_exit_queue(struct request_queue *q) { } -static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } -static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } -static inline int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol) { return 0; } -static inline void blkcg_deactivate_policy(struct request_queue *q, - const struct 
blkcg_policy *pol) { } - -static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, - struct blkcg_policy *pol) { return NULL; } -static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } -static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } -static inline void blkg_get(struct blkcg_gq *blkg) { } -static inline void blkg_put(struct blkcg_gq *blkg) { } - -#endif /* CONFIG_BLK_CGROUP */ -#endif /* _BLK_CGROUP_H */ +static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { } +static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } + +static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } + +#endif + +#define BLKIO_WEIGHT_MIN 10 +#define BLKIO_WEIGHT_MAX 1000 +#define BLKIO_WEIGHT_DEFAULT 500 + +#ifdef CONFIG_DEBUG_BLK_CGROUP +void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg); +void blkiocg_update_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue); +void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg); +void blkiocg_update_idle_time_stats(struct blkio_group *blkg); +void blkiocg_set_start_empty_time(struct blkio_group *blkg); + +#define BLKG_FLAG_FNS(name) \ +static inline void blkio_mark_blkg_##name( \ + struct blkio_group_stats *stats) \ +{ \ + stats->flags |= (1 << BLKG_##name); \ +} \ +static inline void blkio_clear_blkg_##name( \ + struct blkio_group_stats *stats) \ +{ \ + stats->flags &= ~(1 << BLKG_##name); \ +} \ +static inline int blkio_blkg_##name(struct blkio_group_stats *stats) \ +{ \ + return (stats->flags & (1 << BLKG_##name)) != 0; \ +} \ + +BLKG_FLAG_FNS(waiting) +BLKG_FLAG_FNS(idling) +BLKG_FLAG_FNS(empty) +#undef BLKG_FLAG_FNS +#else +static inline void blkiocg_update_avg_queue_size_stats( + struct blkio_group *blkg) {} +static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) {} +static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) +{} +static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {} +static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {} +#endif + +#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) +extern struct blkio_cgroup blkio_root_cgroup; +extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); +extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk); +extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key, dev_t dev, + enum blkio_policy_id plid); +extern int blkio_alloc_blkg_stats(struct blkio_group *blkg); +extern int blkiocg_del_blkio_group(struct blkio_group *blkg); +extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, + void *key); +void blkiocg_update_timeslice_used(struct blkio_group *blkg, + unsigned long time, + unsigned long unaccounted_time); +void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, + bool direction, bool sync); +void blkiocg_update_completion_stats(struct blkio_group *blkg, + uint64_t start_time, uint64_t io_start_time, bool direction, bool sync); +void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, + bool sync); +void blkiocg_update_io_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, bool sync); +void blkiocg_update_io_remove_stats(struct blkio_group *blkg, + bool direction, bool sync); +#else +struct cgroup; +static inline struct blkio_cgroup * 
+cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } +static inline struct blkio_cgroup * +task_blkio_cgroup(struct task_struct *tsk) { return NULL; } + +static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key, dev_t dev, + enum blkio_policy_id plid) {} + +static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; } + +static inline int +blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } + +static inline struct blkio_group * +blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } +static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg, + unsigned long time, + unsigned long unaccounted_time) +{} +static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg, + uint64_t bytes, bool direction, bool sync) {} +static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, + uint64_t start_time, uint64_t io_start_time, bool direction, + bool sync) {} +static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg, + bool direction, bool sync) {} +static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, bool sync) {} +static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg, + bool direction, bool sync) {} +#endif +#endif /* _BLK_CGROUP_H */ diff --git a/trunk/block/blk-core.c b/trunk/block/blk-core.c index 3c923a7aeb56..1f61b74867e4 100644 --- a/trunk/block/blk-core.c +++ b/trunk/block/blk-core.c @@ -29,13 +29,11 @@ #include #include #include -#include #define CREATE_TRACE_POINTS #include #include "blk.h" -#include "blk-cgroup.h" EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); @@ -282,7 +280,7 @@ EXPORT_SYMBOL(blk_stop_queue); * * This function does not cancel any asynchronous activity arising * out of elevator or throttling code. That would require elevaotor_exit() - * and blkcg_exit_queue() to be called with queue lock initialized. + * and blk_throtl_exit() to be called with queue lock initialized. * */ void blk_sync_queue(struct request_queue *q) @@ -367,23 +365,17 @@ void blk_drain_queue(struct request_queue *q, bool drain_all) spin_lock_irq(q->queue_lock); - /* - * The caller might be trying to drain @q before its - * elevator is initialized. - */ - if (q->elevator) - elv_drain_elevator(q); - - blkcg_drain_queue(q); + elv_drain_elevator(q); + if (drain_all) + blk_throtl_drain(q); /* * This function might be called on a queue which failed - * driver init after queue creation or is not yet fully - * active yet. Some drivers (e.g. fd and loop) get unhappy - * in such cases. Kick queue iff dispatch queue has - * something on it and @q has request_fn set. + * driver init after queue creation. Some drivers + * (e.g. fd) get unhappy in such cases. Kick queue iff + * dispatch queue has something on it. */ - if (!list_empty(&q->queue_head) && q->request_fn) + if (!list_empty(&q->queue_head)) __blk_run_queue(q); drain |= q->rq.elvpriv; @@ -410,49 +402,6 @@ void blk_drain_queue(struct request_queue *q, bool drain_all) } } -/** - * blk_queue_bypass_start - enter queue bypass mode - * @q: queue of interest - * - * In bypass mode, only the dispatch FIFO queue of @q is used. This - * function makes @q enter bypass mode and drains all requests which were - * throttled or issued before. 
On return, it's guaranteed that no request - * is being throttled or has ELVPRIV set and blk_queue_bypass() %true - * inside queue or RCU read lock. - */ -void blk_queue_bypass_start(struct request_queue *q) -{ - bool drain; - - spin_lock_irq(q->queue_lock); - drain = !q->bypass_depth++; - queue_flag_set(QUEUE_FLAG_BYPASS, q); - spin_unlock_irq(q->queue_lock); - - if (drain) { - blk_drain_queue(q, false); - /* ensure blk_queue_bypass() is %true inside RCU read lock */ - synchronize_rcu(); - } -} -EXPORT_SYMBOL_GPL(blk_queue_bypass_start); - -/** - * blk_queue_bypass_end - leave queue bypass mode - * @q: queue of interest - * - * Leave bypass mode and restore the normal queueing behavior. - */ -void blk_queue_bypass_end(struct request_queue *q) -{ - spin_lock_irq(q->queue_lock); - if (!--q->bypass_depth) - queue_flag_clear(QUEUE_FLAG_BYPASS, q); - WARN_ON_ONCE(q->bypass_depth < 0); - spin_unlock_irq(q->queue_lock); -} -EXPORT_SYMBOL_GPL(blk_queue_bypass_end); - /** * blk_cleanup_queue - shutdown a request queue * @q: request queue to shutdown @@ -469,19 +418,6 @@ void blk_cleanup_queue(struct request_queue *q) queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); spin_lock_irq(lock); - - /* - * Dead queue is permanently in bypass mode till released. Note - * that, unlike blk_queue_bypass_start(), we aren't performing - * synchronize_rcu() after entering bypass mode to avoid the delay - * as some drivers create and destroy a lot of queues while - * probing. This is still safe because blk_release_queue() will be - * called only after the queue refcnt drops to zero and nothing, - * RCU or not, would be traversing the queue by then. - */ - q->bypass_depth++; - queue_flag_set(QUEUE_FLAG_BYPASS, q); - queue_flag_set(QUEUE_FLAG_NOMERGES, q); queue_flag_set(QUEUE_FLAG_NOXMERGES, q); queue_flag_set(QUEUE_FLAG_DEAD, q); @@ -492,8 +428,13 @@ void blk_cleanup_queue(struct request_queue *q) spin_unlock_irq(lock); mutex_unlock(&q->sysfs_lock); - /* drain all requests queued before DEAD marking */ - blk_drain_queue(q, true); + /* + * Drain all requests queued before DEAD marking. The caller might + * be trying to tear down @q before its elevator is initialized, in + * which case we don't want to call into draining. + */ + if (q->elevator) + blk_drain_queue(q, true); /* @q won't process any more request, flush async actions */ del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); @@ -557,15 +498,14 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (err) goto fail_id; + if (blk_throtl_init(q)) + goto fail_id; + setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); - INIT_LIST_HEAD(&q->queue_head); INIT_LIST_HEAD(&q->timeout_list); INIT_LIST_HEAD(&q->icq_list); -#ifdef CONFIG_BLK_CGROUP - INIT_LIST_HEAD(&q->blkg_list); -#endif INIT_LIST_HEAD(&q->flush_queue[0]); INIT_LIST_HEAD(&q->flush_queue[1]); INIT_LIST_HEAD(&q->flush_data_in_flight); @@ -582,18 +522,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) */ q->queue_lock = &q->__queue_lock; - /* - * A queue starts its life with bypass turned on to avoid - * unnecessary bypass on/off overhead and nasty surprises during - * init. The initial bypass will be finished at the end of - * blk_init_allocated_queue(). 
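The bypass helpers removed above follow a generic nestable enter/leave pattern: a depth counter under the lock, with the expensive drain done only on the outermost 0->1 transition and normal behavior restored only on the final 1->0 transition. A standalone sketch of that pattern, with hypothetical demo_* names:

struct demo_queue {
	spinlock_t	lock;
	int		bypass_depth;
	bool		bypass;
};

static void demo_drain(struct demo_queue *q);	/* stands in for blk_drain_queue() */

static void demo_bypass_start(struct demo_queue *q)
{
	bool drain;

	spin_lock_irq(&q->lock);
	drain = !q->bypass_depth++;	/* true only for the outermost caller */
	q->bypass = true;
	spin_unlock_irq(&q->lock);

	if (drain)
		demo_drain(q);		/* heavy work happens once, unlocked */
}

static void demo_bypass_end(struct demo_queue *q)
{
	spin_lock_irq(&q->lock);
	if (!--q->bypass_depth)		/* outermost exit restores normal mode */
		q->bypass = false;
	WARN_ON_ONCE(q->bypass_depth < 0);
	spin_unlock_irq(&q->lock);
}

Nesting is what lets policy activation, elevator switching and queue teardown each bracket their critical work with start/end calls without coordinating with one another.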
- */ - q->bypass_depth = 1; - __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags); - - if (blkcg_init_queue(q)) - goto fail_id; - return q; fail_id: @@ -686,15 +614,15 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, q->sg_reserved_size = INT_MAX; - /* init elevator */ - if (elevator_init(q, NULL)) - return NULL; - - blk_queue_congestion_threshold(q); + /* + * all done + */ + if (!elevator_init(q, NULL)) { + blk_queue_congestion_threshold(q); + return q; + } - /* all done, end the initial bypass */ - blk_queue_bypass_end(q); - return q; + return NULL; } EXPORT_SYMBOL(blk_init_allocated_queue); @@ -720,6 +648,33 @@ static inline void blk_free_request(struct request_queue *q, struct request *rq) mempool_free(rq, q->rq.rq_pool); } +static struct request * +blk_alloc_request(struct request_queue *q, struct io_cq *icq, + unsigned int flags, gfp_t gfp_mask) +{ + struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); + + if (!rq) + return NULL; + + blk_rq_init(q, rq); + + rq->cmd_flags = flags | REQ_ALLOCED; + + if (flags & REQ_ELVPRIV) { + rq->elv.icq = icq; + if (unlikely(elv_set_request(q, rq, gfp_mask))) { + mempool_free(rq, q->rq.rq_pool); + return NULL; + } + /* @rq->elv.icq holds on to io_context until @rq is freed */ + if (icq) + get_io_context(icq->ioc); + } + + return rq; +} + /* * ioc_batching returns true if the ioc is a valid batching request and * should be given priority access to a request. @@ -807,22 +762,6 @@ static bool blk_rq_should_init_elevator(struct bio *bio) return true; } -/** - * rq_ioc - determine io_context for request allocation - * @bio: request being allocated is for this bio (can be %NULL) - * - * Determine io_context to use for request allocation for @bio. May return - * %NULL if %current->io_context doesn't exist. - */ -static struct io_context *rq_ioc(struct bio *bio) -{ -#ifdef CONFIG_BLK_CGROUP - if (bio && bio->bi_ioc) - return bio->bi_ioc; -#endif - return current->io_context; -} - /** * get_request - get a free request * @q: request_queue to allocate request from @@ -840,7 +779,7 @@ static struct io_context *rq_ioc(struct bio *bio) static struct request *get_request(struct request_queue *q, int rw_flags, struct bio *bio, gfp_t gfp_mask) { - struct request *rq; + struct request *rq = NULL; struct request_list *rl = &q->rq; struct elevator_type *et; struct io_context *ioc; @@ -850,7 +789,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, int may_queue; retry: et = q->elevator->type; - ioc = rq_ioc(bio); + ioc = current->io_context; if (unlikely(blk_queue_dead(q))) return NULL; @@ -869,7 +808,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, */ if (!ioc && !retried) { spin_unlock_irq(q->queue_lock); - create_io_context(gfp_mask, q->node); + create_io_context(current, gfp_mask, q->node); spin_lock_irq(q->queue_lock); retried = true; goto retry; @@ -892,7 +831,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, * process is not a "batcher", and not * exempted by the IO scheduler */ - return NULL; + goto out; } } } @@ -905,7 +844,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, * allocated with any setting of ->nr_requests */ if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) - return NULL; + goto out; rl->count[is_sync]++; rl->starved[is_sync] = 0; @@ -920,7 +859,8 @@ static struct request *get_request(struct request_queue *q, int rw_flags, * Also, lookup icq while holding queue_lock. 
If it doesn't exist, * it will be created after releasing queue_lock. */ - if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) { + if (blk_rq_should_init_elevator(bio) && + !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) { rw_flags |= REQ_ELVPRIV; rl->elvpriv++; if (et->icq_cache && ioc) @@ -931,36 +871,41 @@ static struct request *get_request(struct request_queue *q, int rw_flags, rw_flags |= REQ_IO_STAT; spin_unlock_irq(q->queue_lock); - /* allocate and init request */ - rq = mempool_alloc(q->rq.rq_pool, gfp_mask); - if (!rq) - goto fail_alloc; + /* create icq if missing */ + if ((rw_flags & REQ_ELVPRIV) && unlikely(et->icq_cache && !icq)) { + icq = ioc_create_icq(q, gfp_mask); + if (!icq) + goto fail_icq; + } - blk_rq_init(q, rq); - rq->cmd_flags = rw_flags | REQ_ALLOCED; - - /* init elvpriv */ - if (rw_flags & REQ_ELVPRIV) { - if (unlikely(et->icq_cache && !icq)) { - create_io_context(gfp_mask, q->node); - ioc = rq_ioc(bio); - if (!ioc) - goto fail_elvpriv; - - icq = ioc_create_icq(ioc, q, gfp_mask); - if (!icq) - goto fail_elvpriv; - } + rq = blk_alloc_request(q, icq, rw_flags, gfp_mask); - rq->elv.icq = icq; - if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) - goto fail_elvpriv; +fail_icq: + if (unlikely(!rq)) { + /* + * Allocation failed presumably due to memory. Undo anything + * we might have messed up. + * + * Allocating task should really be put onto the front of the + * wait queue, but this is pretty rare. + */ + spin_lock_irq(q->queue_lock); + freed_request(q, rw_flags); - /* @rq->elv.icq holds io_context until @rq is freed */ - if (icq) - get_io_context(icq->ioc); + /* + * in the very unlikely event that allocation failed and no + * requests for this direction was pending, mark us starved + * so that freeing of a request in the other direction will + * notice us. another possible fix would be to split the + * rq mempool into READ and WRITE + */ +rq_starved: + if (unlikely(rl->count[is_sync] == 0)) + rl->starved[is_sync] = 1; + + goto out; } -out: + /* * ioc may be NULL here, and ioc_batching will be false. That's * OK, if the queue is under the request limit then requests need @@ -971,48 +916,8 @@ static struct request *get_request(struct request_queue *q, int rw_flags, ioc->nr_batch_requests--; trace_block_getrq(q, bio, rw_flags & 1); +out: return rq; - -fail_elvpriv: - /* - * elvpriv init failed. ioc, icq and elvpriv aren't mempool backed - * and may fail indefinitely under memory pressure and thus - * shouldn't stall IO. Treat this request as !elvpriv. This will - * disturb iosched and blkcg but weird is bettern than dead. - */ - printk_ratelimited(KERN_WARNING "%s: request aux data allocation failed, iosched may be disturbed\n", - dev_name(q->backing_dev_info.dev)); - - rq->cmd_flags &= ~REQ_ELVPRIV; - rq->elv.icq = NULL; - - spin_lock_irq(q->queue_lock); - rl->elvpriv--; - spin_unlock_irq(q->queue_lock); - goto out; - -fail_alloc: - /* - * Allocation failed presumably due to memory. Undo anything we - * might have messed up. - * - * Allocating task should really be put onto the front of the wait - * queue, but this is pretty rare. - */ - spin_lock_irq(q->queue_lock); - freed_request(q, rw_flags); - - /* - * in the very unlikely event that allocation failed and no - * requests for this direction was pending, mark us starved so that - * freeing of a request in the other direction will notice - * us. 
another possible fix would be to split the rq mempool into - * READ and WRITE - */ -rq_starved: - if (unlikely(rl->count[is_sync] == 0)) - rl->starved[is_sync] = 1; - return NULL; } /** @@ -1056,7 +961,7 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, * up to a big batch of them for a small period time. * See ioc_batching, ioc_set_batching */ - create_io_context(GFP_NOIO, q->node); + create_io_context(current, GFP_NOIO, q->node); ioc_set_batching(q, current->io_context); spin_lock_irq(q->queue_lock); diff --git a/trunk/block/blk-ioc.c b/trunk/block/blk-ioc.c index 1e2d53b04858..fb95dd2f889a 100644 --- a/trunk/block/blk-ioc.c +++ b/trunk/block/blk-ioc.c @@ -155,20 +155,20 @@ void put_io_context(struct io_context *ioc) } EXPORT_SYMBOL(put_io_context); -/** - * put_io_context_active - put active reference on ioc - * @ioc: ioc of interest - * - * Undo get_io_context_active(). If active reference reaches zero after - * put, @ioc can never issue further IOs and ioscheds are notified. - */ -void put_io_context_active(struct io_context *ioc) +/* Called by the exiting task */ +void exit_io_context(struct task_struct *task) { + struct io_context *ioc; + struct io_cq *icq; struct hlist_node *n; unsigned long flags; - struct io_cq *icq; - if (!atomic_dec_and_test(&ioc->active_ref)) { + task_lock(task); + ioc = task->io_context; + task->io_context = NULL; + task_unlock(task); + + if (!atomic_dec_and_test(&ioc->nr_tasks)) { put_io_context(ioc); return; } @@ -197,20 +197,6 @@ void put_io_context_active(struct io_context *ioc) put_io_context(ioc); } -/* Called by the exiting task */ -void exit_io_context(struct task_struct *task) -{ - struct io_context *ioc; - - task_lock(task); - ioc = task->io_context; - task->io_context = NULL; - task_unlock(task); - - atomic_dec(&ioc->nr_tasks); - put_io_context_active(ioc); -} - /** * ioc_clear_queue - break any ioc association with the specified queue * @q: request_queue being cleared @@ -232,18 +218,19 @@ void ioc_clear_queue(struct request_queue *q) } } -int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) +void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags, + int node) { struct io_context *ioc; ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO, node); if (unlikely(!ioc)) - return -ENOMEM; + return; /* initialize */ atomic_long_set(&ioc->refcount, 1); - atomic_set(&ioc->active_ref, 1); + atomic_set(&ioc->nr_tasks, 1); spin_lock_init(&ioc->lock); INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ioc->icq_list); @@ -263,8 +250,6 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) else kmem_cache_free(iocontext_cachep, ioc); task_unlock(task); - - return 0; } /** @@ -296,7 +281,7 @@ struct io_context *get_task_io_context(struct task_struct *task, return ioc; } task_unlock(task); - } while (!create_task_io_context(task, gfp_flags, node)); + } while (create_io_context(task, gfp_flags, node)); return NULL; } @@ -340,23 +325,26 @@ EXPORT_SYMBOL(ioc_lookup_icq); /** * ioc_create_icq - create and link io_cq - * @ioc: io_context of interest * @q: request_queue of interest * @gfp_mask: allocation mask * - * Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they - * will be created using @gfp_mask. + * Make sure io_cq linking %current->io_context and @q exists. If either + * io_context and/or icq don't exist, they will be created using @gfp_mask. 
* * The caller is responsible for ensuring @ioc won't go away and @q is * alive and will stay alive until this function returns. */ -struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, - gfp_t gfp_mask) +struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask) { struct elevator_type *et = q->elevator->type; + struct io_context *ioc; struct io_cq *icq; /* allocate stuff */ + ioc = create_io_context(current, gfp_mask, q->node); + if (!ioc) + return NULL; + icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO, q->node); if (!icq) @@ -394,6 +382,74 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, return icq; } +void ioc_set_icq_flags(struct io_context *ioc, unsigned int flags) +{ + struct io_cq *icq; + struct hlist_node *n; + + hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node) + icq->flags |= flags; +} + +/** + * ioc_ioprio_changed - notify ioprio change + * @ioc: io_context of interest + * @ioprio: new ioprio + * + * @ioc's ioprio has changed to @ioprio. Set %ICQ_IOPRIO_CHANGED for all + * icq's. iosched is responsible for checking the bit and applying it on + * request issue path. + */ +void ioc_ioprio_changed(struct io_context *ioc, int ioprio) +{ + unsigned long flags; + + spin_lock_irqsave(&ioc->lock, flags); + ioc->ioprio = ioprio; + ioc_set_icq_flags(ioc, ICQ_IOPRIO_CHANGED); + spin_unlock_irqrestore(&ioc->lock, flags); +} + +/** + * ioc_cgroup_changed - notify cgroup change + * @ioc: io_context of interest + * + * @ioc's cgroup has changed. Set %ICQ_CGROUP_CHANGED for all icq's. + * iosched is responsible for checking the bit and applying it on request + * issue path. + */ +void ioc_cgroup_changed(struct io_context *ioc) +{ + unsigned long flags; + + spin_lock_irqsave(&ioc->lock, flags); + ioc_set_icq_flags(ioc, ICQ_CGROUP_CHANGED); + spin_unlock_irqrestore(&ioc->lock, flags); +} +EXPORT_SYMBOL(ioc_cgroup_changed); + +/** + * icq_get_changed - fetch and clear icq changed mask + * @icq: icq of interest + * + * Fetch and clear ICQ_*_CHANGED bits from @icq. Grabs and releases + * @icq->ioc->lock. 
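icq_get_changed() just below pairs a racy unlocked test with a locked fetch-and-clear, so the common no-change case pays no locking cost; a setter that races past the unlocked peek is simply observed on a later call. A generic sketch of the pattern, with illustrative demo_* names:

struct demo_obj {
	spinlock_t	lock;
	unsigned int	flags;
};
#define DEMO_CHANGED_MASK	0x3

static unsigned int demo_get_changed(struct demo_obj *obj)
{
	unsigned int changed = 0;
	unsigned long flags;

	if (unlikely(obj->flags & DEMO_CHANGED_MASK)) {	/* cheap, racy hint */
		spin_lock_irqsave(&obj->lock, flags);
		changed = obj->flags & DEMO_CHANGED_MASK;	/* authoritative read */
		obj->flags &= ~DEMO_CHANGED_MASK;		/* consume the events */
		spin_unlock_irqrestore(&obj->lock, flags);
	}
	return changed;
}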
+ */ +unsigned icq_get_changed(struct io_cq *icq) +{ + unsigned int changed = 0; + unsigned long flags; + + if (unlikely(icq->flags & ICQ_CHANGED_MASK)) { + spin_lock_irqsave(&icq->ioc->lock, flags); + changed = icq->flags & ICQ_CHANGED_MASK; + icq->flags &= ~ICQ_CHANGED_MASK; + spin_unlock_irqrestore(&icq->ioc->lock, flags); + } + return changed; +} +EXPORT_SYMBOL(icq_get_changed); + static int __init blk_ioc_init(void) { iocontext_cachep = kmem_cache_create("blkdev_ioc", diff --git a/trunk/block/blk-sysfs.c b/trunk/block/blk-sysfs.c index aa41b47c22d2..cf150011d808 100644 --- a/trunk/block/blk-sysfs.c +++ b/trunk/block/blk-sysfs.c @@ -9,7 +9,6 @@ #include #include "blk.h" -#include "blk-cgroup.h" struct queue_sysfs_entry { struct attribute attr; @@ -480,8 +479,6 @@ static void blk_release_queue(struct kobject *kobj) blk_sync_queue(q); - blkcg_exit_queue(q); - if (q->elevator) { spin_lock_irq(q->queue_lock); ioc_clear_queue(q); @@ -489,12 +486,15 @@ static void blk_release_queue(struct kobject *kobj) elevator_exit(q->elevator); } + blk_throtl_exit(q); + if (rl->rq_pool) mempool_destroy(rl->rq_pool); if (q->queue_tags) __blk_queue_free_tags(q); + blk_throtl_release(q); blk_trace_shutdown(q); bdi_destroy(&q->backing_dev_info); diff --git a/trunk/block/blk-throttle.c b/trunk/block/blk-throttle.c index 5b0659512047..f2ddb94626bd 100644 --- a/trunk/block/blk-throttle.c +++ b/trunk/block/blk-throttle.c @@ -21,8 +21,6 @@ static int throtl_quantum = 32; /* Throttling is performed over 100ms slice and after that slice is renewed */ static unsigned long throtl_slice = HZ/10; /* 100 ms */ -static struct blkcg_policy blkcg_policy_throtl; - /* A workqueue to queue throttle related work */ static struct workqueue_struct *kthrotld_workqueue; static void throtl_schedule_delayed_work(struct throtl_data *td, @@ -40,17 +38,9 @@ struct throtl_rb_root { #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) -/* Per-cpu group stats */ -struct tg_stats_cpu { - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; -}; - struct throtl_grp { - /* must be the first member */ - struct blkg_policy_data pd; + /* List of throtl groups on the request queue*/ + struct hlist_node tg_node; /* active throtl group service_tree member */ struct rb_node rb_node; @@ -62,6 +52,8 @@ struct throtl_grp { */ unsigned long disptime; + struct blkio_group blkg; + atomic_t ref; unsigned int flags; /* Two lists for READ and WRITE */ @@ -88,18 +80,18 @@ struct throtl_grp { /* Some throttle limits got updated for the group */ int limits_changed; - /* Per cpu stats pointer */ - struct tg_stats_cpu __percpu *stats_cpu; - - /* List of tgs waiting for per cpu stats memory to be allocated */ - struct list_head stats_alloc_node; + struct rcu_head rcu_head; }; struct throtl_data { + /* List of throtl groups */ + struct hlist_head tg_list; + /* service tree for active throtl groups */ struct throtl_rb_root tg_service_tree; + struct throtl_grp *root_tg; struct request_queue *queue; /* Total Number of queued bios on READ and WRITE lists */ @@ -116,33 +108,6 @@ struct throtl_data int limits_changed; }; -/* list and work item to allocate percpu group stats */ -static DEFINE_SPINLOCK(tg_stats_alloc_lock); -static LIST_HEAD(tg_stats_alloc_list); - -static void tg_stats_alloc_fn(struct work_struct *); -static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); - -static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) -{ - return pd 
? container_of(pd, struct throtl_grp, pd) : NULL; -} - -static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) -{ - return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl)); -} - -static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) -{ - return pd_to_blkg(&tg->pd); -} - -static inline struct throtl_grp *td_root_tg(struct throtl_data *td) -{ - return blkg_to_tg(td->queue->root_blkg); -} - enum tg_state_flags { THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ }; @@ -163,150 +128,244 @@ static inline int throtl_tg_##name(const struct throtl_grp *tg) \ THROTL_TG_FNS(on_rr); -#define throtl_log_tg(td, tg, fmt, args...) do { \ - char __pbuf[128]; \ - \ - blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \ -} while (0) +#define throtl_log_tg(td, tg, fmt, args...) \ + blk_add_trace_msg((td)->queue, "throtl %s " fmt, \ + blkg_path(&(tg)->blkg), ##args); \ #define throtl_log(td, fmt, args...) \ blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) +static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg) +{ + if (blkg) + return container_of(blkg, struct throtl_grp, blkg); + + return NULL; +} + static inline unsigned int total_nr_queued(struct throtl_data *td) { return td->nr_queued[0] + td->nr_queued[1]; } -/* - * Worker for allocating per cpu stat for tgs. This is scheduled on the - * system_nrt_wq once there are some groups on the alloc_list waiting for - * allocation. - */ -static void tg_stats_alloc_fn(struct work_struct *work) +static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) { - static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */ - struct delayed_work *dwork = to_delayed_work(work); - bool empty = false; - -alloc_stats: - if (!stats_cpu) { - stats_cpu = alloc_percpu(struct tg_stats_cpu); - if (!stats_cpu) { - /* allocation failed, try again after some time */ - queue_delayed_work(system_nrt_wq, dwork, - msecs_to_jiffies(10)); - return; - } - } - - spin_lock_irq(&tg_stats_alloc_lock); + atomic_inc(&tg->ref); + return tg; +} - if (!list_empty(&tg_stats_alloc_list)) { - struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list, - struct throtl_grp, - stats_alloc_node); - swap(tg->stats_cpu, stats_cpu); - list_del_init(&tg->stats_alloc_node); - } +static void throtl_free_tg(struct rcu_head *head) +{ + struct throtl_grp *tg; - empty = list_empty(&tg_stats_alloc_list); - spin_unlock_irq(&tg_stats_alloc_lock); - if (!empty) - goto alloc_stats; + tg = container_of(head, struct throtl_grp, rcu_head); + free_percpu(tg->blkg.stats_cpu); + kfree(tg); } -static void throtl_pd_init(struct blkcg_gq *blkg) +static void throtl_put_tg(struct throtl_grp *tg) { - struct throtl_grp *tg = blkg_to_tg(blkg); - unsigned long flags; + BUG_ON(atomic_read(&tg->ref) <= 0); + if (!atomic_dec_and_test(&tg->ref)) + return; + + /* + * A group is freed in rcu manner. But having an rcu lock does not + * mean that one can access all the fields of blkg and assume these + * are valid. For example, don't try to follow throtl_data and + * request queue links. 
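throtl_put_tg() above is the classic refcount-plus-call_rcu teardown: the final put does not free synchronously but defers through an rcu_head embedded in the object, so lockless readers that looked the group up under rcu_read_lock() can finish before the memory goes away. The general shape, sketched with hypothetical demo_* names:

struct demo_obj {
	atomic_t	ref;
	struct rcu_head	rcu_head;
};

static void demo_free_rcu(struct rcu_head *head)
{
	/* runs after a grace period; recover the object from its rcu_head */
	kfree(container_of(head, struct demo_obj, rcu_head));
}

static void demo_put(struct demo_obj *obj)
{
	if (atomic_dec_and_test(&obj->ref))
		call_rcu(&obj->rcu_head, demo_free_rcu);
}

As the comment continues below, the grace period only keeps the object's own memory valid; pointers leading out of it (to the throtl_data or the request queue) may already be stale.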
+ * + * Having a reference to blkg under an rcu allows access to only + * values local to groups like group stats and group rate limits + */ + call_rcu(&tg->rcu_head, throtl_free_tg); +} +static void throtl_init_group(struct throtl_grp *tg) +{ + INIT_HLIST_NODE(&tg->tg_node); RB_CLEAR_NODE(&tg->rb_node); bio_list_init(&tg->bio_lists[0]); bio_list_init(&tg->bio_lists[1]); tg->limits_changed = false; - tg->bps[READ] = -1; - tg->bps[WRITE] = -1; - tg->iops[READ] = -1; - tg->iops[WRITE] = -1; + /* Practically unlimited BW */ + tg->bps[0] = tg->bps[1] = -1; + tg->iops[0] = tg->iops[1] = -1; /* - * Ugh... We need to perform per-cpu allocation for tg->stats_cpu - * but percpu allocator can't be called from IO path. Queue tg on - * tg_stats_alloc_list and allocate from work item. + * Take the initial reference that will be released on destroy. + * This can be thought of as a joint reference by cgroup and + * request queue which will be dropped by either request queue + * exit or cgroup deletion path depending on who is exiting first. */ - spin_lock_irqsave(&tg_stats_alloc_lock, flags); - list_add(&tg->stats_alloc_node, &tg_stats_alloc_list); - queue_delayed_work(system_nrt_wq, &tg_stats_alloc_work, 0); - spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); + atomic_set(&tg->ref, 1); }
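The init/put pair above is the stock refcount-plus-RCU lifetime idiom: the group starts with one joint reference shared by the cgroup and the request queue, rcu-protected lookups may take extra references, and the final put defers the actual free past a grace period so lockless readers never touch freed memory. A minimal standalone sketch of the same idiom (hypothetical foo_* names, not code from this patch):

#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	atomic_t ref;			/* joint reference, starts at 1 */
	struct rcu_head rcu_head;
};

static void foo_free_rcu(struct rcu_head *head)
{
	/* Runs after a grace period; no rcu reader still sees the object. */
	kfree(container_of(head, struct foo, rcu_head));
}

static struct foo *foo_get(struct foo *f)
{
	/* Caller must already hold a reference or the rcu read lock. */
	atomic_inc(&f->ref);
	return f;
}

static void foo_put(struct foo *f)
{
	BUG_ON(atomic_read(&f->ref) <= 0);
	if (atomic_dec_and_test(&f->ref))
		call_rcu(&f->rcu_head, foo_free_rcu);
}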
-static void throtl_pd_exit(struct blkcg_gq *blkg) +/* Should be called with rcu read lock held (needed for blkcg) */ +static void +throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg) { - struct throtl_grp *tg = blkg_to_tg(blkg); - unsigned long flags; + hlist_add_head(&tg->tg_node, &td->tg_list); + td->nr_undestroyed_grps++; +} - spin_lock_irqsave(&tg_stats_alloc_lock, flags); - list_del_init(&tg->stats_alloc_node); - spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); +static void +__throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) +{ + struct backing_dev_info *bdi = &td->queue->backing_dev_info; + unsigned int major, minor; - free_percpu(tg->stats_cpu); + if (!tg || tg->blkg.dev) + return; + + /* + * Fill in device details for a group which might not have been + * filled at group creation time as queue was being instantiated + * and driver had not attached a device yet + */ + if (bdi->dev && dev_name(bdi->dev)) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + tg->blkg.dev = MKDEV(major, minor); + } } -static void throtl_pd_reset_stats(struct blkcg_gq *blkg) +/* + * Should be called without queue lock held. Here queue lock will be + * taken rarely. It will be taken only once during the lifetime of a group + * if need be + */ +static void +throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) { - struct throtl_grp *tg = blkg_to_tg(blkg); - int cpu; - - if (tg->stats_cpu == NULL) + if (!tg || tg->blkg.dev) return; - for_each_possible_cpu(cpu) { - struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); + spin_lock_irq(td->queue->queue_lock); + __throtl_tg_fill_dev_details(td, tg); + spin_unlock_irq(td->queue->queue_lock); +} + +static void throtl_init_add_tg_lists(struct throtl_data *td, + struct throtl_grp *tg, struct blkio_cgroup *blkcg) +{ + __throtl_tg_fill_dev_details(td, tg); + + /* Add group onto cgroup list */ + blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, + tg->blkg.dev, BLKIO_POLICY_THROTL); + + tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); + tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); + tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); + tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); + + throtl_add_group_to_td_list(td, tg); +} + +/* Should be called without queue lock and outside of rcu period */ +static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td) +{ + struct throtl_grp *tg = NULL; + int ret; + + tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); + if (!tg) + return NULL; + + ret = blkio_alloc_blkg_stats(&tg->blkg); - blkg_rwstat_reset(&sc->service_bytes); - blkg_rwstat_reset(&sc->serviced); + if (ret) { + kfree(tg); + return NULL; } + + throtl_init_group(tg); + return tg; } -static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, - struct blkcg *blkcg) +static struct +throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) { + struct throtl_grp *tg = NULL; + void *key = td; + /* - * This is the common case when there are no blkcgs. Avoid lookup - * in this case - */ - if (blkcg == &blkcg_root) - return td_root_tg(td); + * This is the common case when there are no blkio cgroups. + * Avoid lookup in this case + */ + if (blkcg == &blkio_root_cgroup) + tg = td->root_tg; + else + tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); - return blkg_to_tg(blkg_lookup(blkcg, td->queue)); + __throtl_tg_fill_dev_details(td, tg); + return tg; } -static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, - struct blkcg *blkcg) +static struct throtl_grp * throtl_get_tg(struct throtl_data *td) { + struct throtl_grp *tg = NULL, *__tg = NULL; + struct blkio_cgroup *blkcg; struct request_queue *q = td->queue; - struct throtl_grp *tg = NULL; + + /* no throttling for dead queue */ + if (unlikely(blk_queue_dead(q))) + return NULL; + + rcu_read_lock(); + blkcg = task_blkio_cgroup(current); + tg = throtl_find_tg(td, blkcg); + if (tg) { + rcu_read_unlock(); + return tg; + } + + /* + * Need to allocate a group. Allocation of group also needs allocation + * of per cpu stats which in turn takes a mutex() and can block. Hence + * we need to drop rcu lock and queue_lock before we call alloc. + */ + rcu_read_unlock(); + spin_unlock_irq(q->queue_lock); + + tg = throtl_alloc_tg(td); + + /* Group allocated and queue is still alive. Take the lock */ + spin_lock_irq(q->queue_lock); + + /* Make sure @q is still alive */ + if (unlikely(blk_queue_dead(q))) { + kfree(tg); + return NULL; + } /* - * This is the common case when there are no blkcgs. Avoid lookup - * in this case + * Initialize the new group. After sleeping, read the blkcg again.
*/ - if (blkcg == &blkcg_root) { - tg = td_root_tg(td); - } else { - struct blkcg_gq *blkg; - - blkg = blkg_lookup_create(blkcg, q); - - /* if %NULL and @q is alive, fall back to root_tg */ - if (!IS_ERR(blkg)) - tg = blkg_to_tg(blkg); - else if (!blk_queue_dead(q)) - tg = td_root_tg(td); + rcu_read_lock(); + blkcg = task_blkio_cgroup(current); + + /* + * If some other thread already allocated the group while we were + * not holding queue lock, free up the group + */ + __tg = throtl_find_tg(td, blkcg); + + if (__tg) { + kfree(tg); + rcu_read_unlock(); + return __tg; + } + + /* Group allocation failed. Account the IO to root group */ + if (!tg) { + tg = td->root_tg; + return tg; } + throtl_init_add_tg_lists(td, tg, blkcg); + rcu_read_unlock(); return tg; } @@ -675,41 +734,16 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, return 0; } -static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, - int rw) -{ - struct throtl_grp *tg = blkg_to_tg(blkg); - struct tg_stats_cpu *stats_cpu; - unsigned long flags; - - /* If per cpu stats are not allocated yet, don't do any accounting. */ - if (tg->stats_cpu == NULL) - return; - - /* - * Disabling interrupts to provide mutual exclusion between two - * writes on same cpu. It probably is not needed for 64bit. Not - * optimizing that case yet. - */ - local_irq_save(flags); - - stats_cpu = this_cpu_ptr(tg->stats_cpu); - - blkg_rwstat_add(&stats_cpu->serviced, rw, 1); - blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes); - - local_irq_restore(flags); -} - static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); + bool sync = rw_is_sync(bio->bi_rw); /* Charge the bio to the group */ tg->bytes_disp[rw] += bio->bi_size; tg->io_disp[rw]++; - throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw); + blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); } static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, @@ -719,7 +753,7 @@ static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, bio_list_add(&tg->bio_lists[rw], bio); /* Take a bio reference on tg */ - blkg_get(tg_to_blkg(tg)); + throtl_ref_get_tg(tg); tg->nr_queued[rw]++; td->nr_queued[rw]++; throtl_enqueue_tg(td, tg); @@ -752,8 +786,8 @@ static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg, bio = bio_list_pop(&tg->bio_lists[rw]); tg->nr_queued[rw]--; - /* Drop bio reference on blkg */ - blkg_put(tg_to_blkg(tg)); + /* Drop bio reference on tg */ + throtl_put_tg(tg); BUG_ON(td->nr_queued[rw] <= 0); td->nr_queued[rw]--; @@ -831,8 +865,8 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) static void throtl_process_limit_change(struct throtl_data *td) { - struct request_queue *q = td->queue; - struct blkcg_gq *blkg, *n; + struct throtl_grp *tg; + struct hlist_node *pos, *n; if (!td->limits_changed) return; @@ -841,9 +875,7 @@ static void throtl_process_limit_change(struct throtl_data *td) throtl_log(td, "limits changed"); - list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { - struct throtl_grp *tg = blkg_to_tg(blkg); - + hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { if (!tg->limits_changed) continue; @@ -941,158 +973,119 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay) } } -static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, - struct blkg_policy_data *pd, int off) +static void +throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg) 
{ - struct throtl_grp *tg = pd_to_tg(pd); - struct blkg_rwstat rwstat = { }, tmp; - int i, cpu; + /* Something wrong if we are trying to remove the same group twice */ + BUG_ON(hlist_unhashed(&tg->tg_node)); - for_each_possible_cpu(cpu) { - struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); - - tmp = blkg_rwstat_read((void *)sc + off); - for (i = 0; i < BLKG_RWSTAT_NR; i++) - rwstat.cnt[i] += tmp.cnt[i]; - } + hlist_del_init(&tg->tg_node); - return __blkg_prfill_rwstat(sf, pd, &rwstat); + /* + * Put the reference taken at the time of creation so that when all + * queues are gone, group can be destroyed. + */ + throtl_put_tg(tg); + td->nr_undestroyed_grps--; } -static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static void throtl_release_tgs(struct throtl_data *td) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct hlist_node *pos, *n; + struct throtl_grp *tg; - blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl, - cft->private, true); - return 0; + hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { + /* + * If cgroup removal path got to the blkio_group first and removed + * it from the cgroup list, then it will take care of destroying + * the tg as well. + */ + if (!blkiocg_del_blkio_group(&tg->blkg)) + throtl_destroy_tg(td, tg); + } } -static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, - int off) +/* + * Blk cgroup controller notification saying that blkio_group object is being + * delinked as associated cgroup object is going away. That also means that + * no new IO will come into this group. So get rid of this group as soon as + * any pending IO in the group is finished. + * + * This function is called under rcu_read_lock(). key is the rcu protected + * pointer. That means "key" is a valid throtl_data pointer as long as we are + * under rcu read lock. + * + * "key" was fetched from blkio_group under blkio_cgroup->lock. That means + * it should not be NULL as even if queue was going away, cgroup deletion + * path got to it first. + */ +void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg) { - struct throtl_grp *tg = pd_to_tg(pd); - u64 v = *(u64 *)((void *)tg + off); + unsigned long flags; + struct throtl_data *td = key; - if (v == -1) - return 0; - return __blkg_prfill_u64(sf, pd, v); + spin_lock_irqsave(td->queue->queue_lock, flags); + throtl_destroy_tg(td, tg_of_blkg(blkg)); + spin_unlock_irqrestore(td->queue->queue_lock, flags); } -static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, - int off) +static void throtl_update_blkio_group_common(struct throtl_data *td, + struct throtl_grp *tg) { - struct throtl_grp *tg = pd_to_tg(pd); - unsigned int v = *(unsigned int *)((void *)tg + off); - - if (v == -1) - return 0; - return __blkg_prfill_u64(sf, pd, v); + xchg(&tg->limits_changed, true); + xchg(&td->limits_changed, true); + /* Schedule a work now to process the limit change */ + throtl_schedule_delayed_work(td, 0); } -static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +/* + * For all update functions, key should be a valid pointer because these + * update functions are called under blkcg_lock; that means blkg is + * valid and in turn key is valid. Queue exit path cannot race because + * of blkcg_lock. + * + * Cannot take queue lock in update functions as queue lock under blkcg_lock + * is not allowed. Under other paths we take blkcg_lock under queue_lock.
+ */ +static void throtl_update_blkio_group_read_bps(void *key, + struct blkio_group *blkg, u64 read_bps) { - blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64, - &blkcg_policy_throtl, cft->private, false); - return 0; -} + struct throtl_data *td = key; + struct throtl_grp *tg = tg_of_blkg(blkg); -static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) -{ - blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint, - &blkcg_policy_throtl, cft->private, false); - return 0; + tg->bps[READ] = read_bps; + throtl_update_blkio_group_common(td, tg); } -static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, - bool is_u64) +static void throtl_update_blkio_group_write_bps(void *key, + struct blkio_group *blkg, u64 write_bps) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); - struct blkg_conf_ctx ctx; - struct throtl_grp *tg; - struct throtl_data *td; - int ret; - - ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); - if (ret) - return ret; - - tg = blkg_to_tg(ctx.blkg); - td = ctx.blkg->q->td; - - if (!ctx.v) - ctx.v = -1; - - if (is_u64) - *(u64 *)((void *)tg + cft->private) = ctx.v; - else - *(unsigned int *)((void *)tg + cft->private) = ctx.v; - - /* XXX: we don't need the following deferred processing */ - xchg(&tg->limits_changed, true); - xchg(&td->limits_changed, true); - throtl_schedule_delayed_work(td, 0); + struct throtl_data *td = key; + struct throtl_grp *tg = tg_of_blkg(blkg); - blkg_conf_finish(&ctx); - return 0; + tg->bps[WRITE] = write_bps; + throtl_update_blkio_group_common(td, tg); } -static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft, - const char *buf) +static void throtl_update_blkio_group_read_iops(void *key, + struct blkio_group *blkg, unsigned int read_iops) { - return tg_set_conf(cgrp, cft, buf, true); + struct throtl_data *td = key; + struct throtl_grp *tg = tg_of_blkg(blkg); + + tg->iops[READ] = read_iops; + throtl_update_blkio_group_common(td, tg); } -static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft, - const char *buf) +static void throtl_update_blkio_group_write_iops(void *key, + struct blkio_group *blkg, unsigned int write_iops) { - return tg_set_conf(cgrp, cft, buf, false); -} + struct throtl_data *td = key; + struct throtl_grp *tg = tg_of_blkg(blkg); -static struct cftype throtl_files[] = { - { - .name = "throttle.read_bps_device", - .private = offsetof(struct throtl_grp, bps[READ]), - .read_seq_string = tg_print_conf_u64, - .write_string = tg_set_conf_u64, - .max_write_len = 256, - }, - { - .name = "throttle.write_bps_device", - .private = offsetof(struct throtl_grp, bps[WRITE]), - .read_seq_string = tg_print_conf_u64, - .write_string = tg_set_conf_u64, - .max_write_len = 256, - }, - { - .name = "throttle.read_iops_device", - .private = offsetof(struct throtl_grp, iops[READ]), - .read_seq_string = tg_print_conf_uint, - .write_string = tg_set_conf_uint, - .max_write_len = 256, - }, - { - .name = "throttle.write_iops_device", - .private = offsetof(struct throtl_grp, iops[WRITE]), - .read_seq_string = tg_print_conf_uint, - .write_string = tg_set_conf_uint, - .max_write_len = 256, - }, - { - .name = "throttle.io_service_bytes", - .private = offsetof(struct tg_stats_cpu, service_bytes), - .read_seq_string = tg_print_cpu_rwstat, - }, - { - .name = "throttle.io_serviced", - .private = offsetof(struct tg_stats_cpu, serviced), - .read_seq_string = tg_print_cpu_rwstat, - }, - { } /* terminate */ -}; + tg->iops[WRITE] = write_iops; + 
throtl_update_blkio_group_common(td, tg); +} static void throtl_shutdown_wq(struct request_queue *q) { @@ -1101,13 +1094,19 @@ static void throtl_shutdown_wq(struct request_queue *q) cancel_delayed_work_sync(&td->throtl_work); } -static struct blkcg_policy blkcg_policy_throtl = { - .pd_size = sizeof(struct throtl_grp), - .cftypes = throtl_files, - - .pd_init_fn = throtl_pd_init, - .pd_exit_fn = throtl_pd_exit, - .pd_reset_stats_fn = throtl_pd_reset_stats, +static struct blkio_policy_type blkio_policy_throtl = { + .ops = { + .blkio_unlink_group_fn = throtl_unlink_blkio_group, + .blkio_update_group_read_bps_fn = + throtl_update_blkio_group_read_bps, + .blkio_update_group_write_bps_fn = + throtl_update_blkio_group_write_bps, + .blkio_update_group_read_iops_fn = + throtl_update_blkio_group_read_iops, + .blkio_update_group_write_iops_fn = + throtl_update_blkio_group_write_iops, + }, + .plid = BLKIO_POLICY_THROTL, }; bool blk_throtl_bio(struct request_queue *q, struct bio *bio) @@ -1115,7 +1114,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) struct throtl_data *td = q->td; struct throtl_grp *tg; bool rw = bio_data_dir(bio), update_disptime = true; - struct blkcg *blkcg; + struct blkio_cgroup *blkcg; bool throttled = false; if (bio->bi_rw & REQ_THROTTLED) { @@ -1123,31 +1122,33 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) goto out; } - /* bio_associate_current() needs ioc, try creating */ - create_io_context(GFP_ATOMIC, q->node); - /* * A throtl_grp pointer retrieved under rcu can be used to access * basic fields like stats and io rates. If a group has no rules, * just update the dispatch stats in a lockless manner and return. */ + rcu_read_lock(); - blkcg = bio_blkcg(bio); - tg = throtl_lookup_tg(td, blkcg); + blkcg = task_blkio_cgroup(current); + tg = throtl_find_tg(td, blkcg); if (tg) { + throtl_tg_fill_dev_details(td, tg); + if (tg_no_rule_group(tg, rw)) { - throtl_update_dispatch_stats(tg_to_blkg(tg), - bio->bi_size, bio->bi_rw); - goto out_unlock_rcu; + blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, + rw, rw_is_sync(bio->bi_rw)); + rcu_read_unlock(); + goto out; } } + rcu_read_unlock(); /* * Either group has not been allocated yet or it is not an unlimited * IO group */ spin_lock_irq(q->queue_lock); - tg = throtl_lookup_create_tg(td, blkcg); + tg = throtl_get_tg(td); if (unlikely(!tg)) goto out_unlock; @@ -1188,7 +1189,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) tg->io_disp[rw], tg->iops[rw], tg->nr_queued[READ], tg->nr_queued[WRITE]); - bio_associate_current(bio); throtl_add_bio_tg(q->td, tg, bio); throttled = true; @@ -1199,8 +1199,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) out_unlock: spin_unlock_irq(q->queue_lock); -out_unlock_rcu: - rcu_read_unlock(); out: return throttled; }
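When blk_throtl_bio() keeps a bio it returns true and later resubmits it from the worker with REQ_THROTTLED set, which is why the flag is stripped and the bio passed straight through at the top of the function. A true return is therefore a transfer of ownership to the throttler. A minimal caller-side sketch of that contract (hypothetical wrapper name; blk_throtl_bio() itself is the function this patch declares in blk.h):

#include <linux/blkdev.h>
#include <linux/bio.h>
#include "blk.h"	/* declares blk_throtl_bio() */

/*
 * Shaped like the check in blk-core's submission path: a true return
 * from blk_throtl_bio() means the throttler owns the bio and will
 * resubmit it later, so the caller must not touch it again.
 */
static bool bio_passes_throttle(struct request_queue *q, struct bio *bio)
{
	if (blk_throtl_bio(q, bio))
		return false;	/* queued on a throtl group for later */

	return true;		/* caller proceeds to map it to a request */
}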
@@ -1243,31 +1241,79 @@ void blk_throtl_drain(struct request_queue *q) int blk_throtl_init(struct request_queue *q) { struct throtl_data *td; - int ret; + struct throtl_grp *tg; td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); if (!td) return -ENOMEM; + INIT_HLIST_HEAD(&td->tg_list); td->tg_service_tree = THROTL_RB_ROOT; td->limits_changed = false; INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); - q->td = td; + /* Allocate and init root group. */ td->queue = q; + tg = throtl_alloc_tg(td); - /* activate policy */ - ret = blkcg_activate_policy(q, &blkcg_policy_throtl); - if (ret) + if (!tg) { kfree(td); - return ret; + return -ENOMEM; + } + + td->root_tg = tg; + + rcu_read_lock(); + throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup); + rcu_read_unlock(); + + /* Attach throtl data to request queue */ + q->td = td; + return 0; } void blk_throtl_exit(struct request_queue *q) { - BUG_ON(!q->td); + struct throtl_data *td = q->td; + bool wait = false; + + BUG_ON(!td); + + throtl_shutdown_wq(q); + + spin_lock_irq(q->queue_lock); + throtl_release_tgs(td); + + /* If there are other groups */ + if (td->nr_undestroyed_grps > 0) + wait = true; + + spin_unlock_irq(q->queue_lock); + + /* + * Wait for tg->blkg->key accessors to exit their grace periods. + * Do this wait only if there are other undestroyed groups out + * there (other than root group). This can happen if cgroup deletion + * path claimed the responsibility of cleaning up a group before + * queue cleanup code gets to the group. + * + * Do not call synchronize_rcu() unconditionally as there are drivers + * which create/delete request queue hundreds of times during scan/boot + * and synchronize_rcu() can take significant time and slow down boot. + */ + if (wait) + synchronize_rcu(); + + /* + * Just to be safe: if somebody updated limits through cgroup after + * the previous flush and another work got queued, cancel it. + */ throtl_shutdown_wq(q); - blkcg_deactivate_policy(q, &blkcg_policy_throtl); +} + +void blk_throtl_release(struct request_queue *q) +{ kfree(q->td); } @@ -1277,7 +1323,8 @@ static int __init throtl_init(void) { if (!kthrotld_workqueue) panic("Failed to create kthrotld\n"); - return blkcg_policy_register(&blkcg_policy_throtl); + blkio_policy_register(&blkio_policy_throtl); + return 0; } module_init(throtl_init); diff --git a/trunk/block/blk.h b/trunk/block/blk.h index 85f6ae42f7d3..d45be871329e 100644 --- a/trunk/block/blk.h +++ b/trunk/block/blk.h @@ -23,8 +23,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); int blk_rq_append_bio(struct request_queue *q, struct request *rq, struct bio *bio); -void blk_queue_bypass_start(struct request_queue *q); -void blk_queue_bypass_end(struct request_queue *q); +void blk_drain_queue(struct request_queue *q, bool drain_all); void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); bool __blk_end_bidi_request(struct request *rq, int error, @@ -145,6 +144,9 @@ void blk_queue_congestion_threshold(struct request_queue *q); int blk_dev_init(void); +void elv_quiesce_start(struct request_queue *q); +void elv_quiesce_end(struct request_queue *q); + /* * Return the threshold (number of used requests) at which the queue is @@ -184,30 +186,32 @@ static inline int blk_do_io_stat(struct request *rq) */ void get_io_context(struct io_context *ioc); struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q); -struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, - gfp_t gfp_mask); +struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask); void ioc_clear_queue(struct request_queue *q); -int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); +void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_mask, + int node); /** * create_io_context - try to create task->io_context + * @task: target task * @gfp_mask: allocation mask * @node: allocation node * - * If
%current->io_context is %NULL, allocate a new io_context and install - * it. Returns the current %current->io_context which may be %NULL if - * allocation failed. + * If @task->io_context is %NULL, allocate a new io_context and install it. + * Returns the current @task->io_context which may be %NULL if allocation + * failed. * * Note that this function can't be called with IRQ disabled because - * task_lock which protects %current->io_context is IRQ-unsafe. + * task_lock which protects @task->io_context is IRQ-unsafe. */ -static inline struct io_context *create_io_context(gfp_t gfp_mask, int node) +static inline struct io_context *create_io_context(struct task_struct *task, + gfp_t gfp_mask, int node) { WARN_ON_ONCE(irqs_disabled()); - if (unlikely(!current->io_context)) - create_task_io_context(current, gfp_mask, node); - return current->io_context; + if (unlikely(!task->io_context)) + create_io_context_slowpath(task, gfp_mask, node); + return task->io_context; } /* @@ -218,6 +222,7 @@ extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio); extern void blk_throtl_drain(struct request_queue *q); extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); +extern void blk_throtl_release(struct request_queue *q); #else /* CONFIG_BLK_DEV_THROTTLING */ static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) { @@ -226,6 +231,7 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) static inline void blk_throtl_drain(struct request_queue *q) { } static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } +static inline void blk_throtl_release(struct request_queue *q) { } #endif /* CONFIG_BLK_DEV_THROTTLING */ #endif /* BLK_INTERNAL_H */ diff --git a/trunk/block/cfq-iosched.c b/trunk/block/cfq-iosched.c index 673c977cc2bf..3c38536bd52c 100644 --- a/trunk/block/cfq-iosched.c +++ b/trunk/block/cfq-iosched.c @@ -15,9 +15,7 @@ #include #include #include "blk.h" -#include "blk-cgroup.h" - -static struct blkcg_policy blkcg_policy_cfq __maybe_unused; +#include "cfq.h" /* * tunables @@ -173,53 +171,8 @@ enum wl_type_t { SYNC_WORKLOAD = 2 }; -struct cfqg_stats { -#ifdef CONFIG_CFQ_GROUP_IOSCHED - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; - /* number of ios merged */ - struct blkg_rwstat merged; - /* total time spent on device in ns, may not be accurate w/ queueing */ - struct blkg_rwstat service_time; - /* total time spent waiting in scheduler queue in ns */ - struct blkg_rwstat wait_time; - /* number of IOs queued up */ - struct blkg_rwstat queued; - /* total sectors transferred */ - struct blkg_stat sectors; - /* total disk time and nr sectors dispatched by this group */ - struct blkg_stat time; -#ifdef CONFIG_DEBUG_BLK_CGROUP - /* time not charged to this cgroup */ - struct blkg_stat unaccounted_time; - /* sum of number of ios queued across all samples */ - struct blkg_stat avg_queue_size_sum; - /* count of samples taken for average */ - struct blkg_stat avg_queue_size_samples; - /* how many times this group has been removed from service tree */ - struct blkg_stat dequeue; - /* total time spent waiting for it to be assigned a timeslice. 
*/ - struct blkg_stat group_wait_time; - /* time spent idling for this blkcg_gq */ - struct blkg_stat idle_time; - /* total time with empty current active q with other requests queued */ - struct blkg_stat empty_time; - /* fields after this shouldn't be cleared on stat reset */ - uint64_t start_group_wait_time; - uint64_t start_idle_time; - uint64_t start_empty_time; - uint16_t flags; -#endif /* CONFIG_DEBUG_BLK_CGROUP */ -#endif /* CONFIG_CFQ_GROUP_IOSCHED */ -}; - /* This is per cgroup per device grouping structure */ struct cfq_group { - /* must be the first member */ - struct blkg_policy_data pd; - /* group service_tree member */ struct rb_node rb_node; @@ -227,7 +180,7 @@ struct cfq_group { u64 vdisktime; unsigned int weight; unsigned int new_weight; - unsigned int dev_weight; + bool needs_update; /* number of cfqq currently on this group */ int nr_cfqq; @@ -253,21 +206,20 @@ struct cfq_group { unsigned long saved_workload_slice; enum wl_type_t saved_workload; enum wl_prio_t saved_serving_prio; - + struct blkio_group blkg; +#ifdef CONFIG_CFQ_GROUP_IOSCHED + struct hlist_node cfqd_node; + int ref; +#endif /* number of requests that are on the dispatch list or inside driver */ int dispatched; struct cfq_ttime ttime; - struct cfqg_stats stats; }; struct cfq_io_cq { struct io_cq icq; /* must be the first member */ struct cfq_queue *cfqq[2]; struct cfq_ttime ttime; - int ioprio; /* the current ioprio */ -#ifdef CONFIG_CFQ_GROUP_IOSCHED - uint64_t blkcg_id; /* the current blkcg ID */ -#endif }; /* @@ -277,7 +229,7 @@ struct cfq_data { struct request_queue *queue; /* Root service tree for cfq_groups */ struct cfq_rb_root grp_service_tree; - struct cfq_group *root_group; + struct cfq_group root_group; /* * The priority currently being served @@ -351,6 +303,12 @@ struct cfq_data { struct cfq_queue oom_cfqq; unsigned long last_delayed_sync; + + /* List of cfq groups being managed on this device*/ + struct hlist_head cfqg_list; + + /* Number of groups which are on blkcg->blkg_list */ + unsigned int nr_blkcg_linked_grps; }; static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); @@ -413,284 +371,21 @@ CFQ_CFQQ_FNS(deep); CFQ_CFQQ_FNS(wait_busy); #undef CFQ_CFQQ_FNS -static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) -{ - return pd ? container_of(pd, struct cfq_group, pd) : NULL; -} - -static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) -{ - return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); -} - -static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) -{ - return pd_to_blkg(&cfqg->pd); -} - -#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) - -/* cfqg stats flags */ -enum cfqg_stats_flags { - CFQG_stats_waiting = 0, - CFQG_stats_idling, - CFQG_stats_empty, -}; - -#define CFQG_FLAG_FNS(name) \ -static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats) \ -{ \ - stats->flags |= (1 << CFQG_stats_##name); \ -} \ -static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats) \ -{ \ - stats->flags &= ~(1 << CFQG_stats_##name); \ -} \ -static inline int cfqg_stats_##name(struct cfqg_stats *stats) \ -{ \ - return (stats->flags & (1 << CFQG_stats_##name)) != 0; \ -} \ - -CFQG_FLAG_FNS(waiting) -CFQG_FLAG_FNS(idling) -CFQG_FLAG_FNS(empty) -#undef CFQG_FLAG_FNS - -/* This should be called with the queue_lock held. 
*/ -static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats) -{ - unsigned long long now; - - if (!cfqg_stats_waiting(stats)) - return; - - now = sched_clock(); - if (time_after64(now, stats->start_group_wait_time)) - blkg_stat_add(&stats->group_wait_time, - now - stats->start_group_wait_time); - cfqg_stats_clear_waiting(stats); -} - -/* This should be called with the queue_lock held. */ -static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, - struct cfq_group *curr_cfqg) -{ - struct cfqg_stats *stats = &cfqg->stats; - - if (cfqg_stats_waiting(stats)) - return; - if (cfqg == curr_cfqg) - return; - stats->start_group_wait_time = sched_clock(); - cfqg_stats_mark_waiting(stats); -} - -/* This should be called with the queue_lock held. */ -static void cfqg_stats_end_empty_time(struct cfqg_stats *stats) -{ - unsigned long long now; - - if (!cfqg_stats_empty(stats)) - return; - - now = sched_clock(); - if (time_after64(now, stats->start_empty_time)) - blkg_stat_add(&stats->empty_time, - now - stats->start_empty_time); - cfqg_stats_clear_empty(stats); -} - -static void cfqg_stats_update_dequeue(struct cfq_group *cfqg) -{ - blkg_stat_add(&cfqg->stats.dequeue, 1); -} - -static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) -{ - struct cfqg_stats *stats = &cfqg->stats; - - if (blkg_rwstat_sum(&stats->queued)) - return; - - /* - * group is already marked empty. This can happen if cfqq got new - * request in parent group and moved to this group while being added - * to service tree. Just ignore the event and move on. - */ - if (cfqg_stats_empty(stats)) - return; - - stats->start_empty_time = sched_clock(); - cfqg_stats_mark_empty(stats); -} - -static void cfqg_stats_update_idle_time(struct cfq_group *cfqg) -{ - struct cfqg_stats *stats = &cfqg->stats; - - if (cfqg_stats_idling(stats)) { - unsigned long long now = sched_clock(); - - if (time_after64(now, stats->start_idle_time)) - blkg_stat_add(&stats->idle_time, - now - stats->start_idle_time); - cfqg_stats_clear_idling(stats); - } -} - -static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) -{ - struct cfqg_stats *stats = &cfqg->stats; - - BUG_ON(cfqg_stats_idling(stats)); - - stats->start_idle_time = sched_clock(); - cfqg_stats_mark_idling(stats); -} - -static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) -{ - struct cfqg_stats *stats = &cfqg->stats; - - blkg_stat_add(&stats->avg_queue_size_sum, - blkg_rwstat_sum(&stats->queued)); - blkg_stat_add(&stats->avg_queue_size_samples, 1); - cfqg_stats_update_group_wait_time(stats); -} - -#else /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ - -static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { } -static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { } -static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { } -static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { } -static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { } -static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { } -static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { } - -#endif /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ - #ifdef CONFIG_CFQ_GROUP_IOSCHED - -static inline void cfqg_get(struct cfq_group *cfqg) -{ - return blkg_get(cfqg_to_blkg(cfqg)); -} - -static inline void cfqg_put(struct cfq_group *cfqg) -{ - return blkg_put(cfqg_to_blkg(cfqg)); -} - -#define 
cfq_log_cfqq(cfqd, cfqq, fmt, args...) do { \ - char __pbuf[128]; \ - \ - blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf)); \ +#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ - cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ - __pbuf, ##args); \ -} while (0) + cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ + blkg_path(&(cfqq)->cfqg->blkg), ##args) -#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do { \ - char __pbuf[128]; \ - \ - blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args); \ -} while (0) - -static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, - struct cfq_group *curr_cfqg, int rw) -{ - blkg_rwstat_add(&cfqg->stats.queued, rw, 1); - cfqg_stats_end_empty_time(&cfqg->stats); - cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg); -} - -static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, - unsigned long time, unsigned long unaccounted_time) -{ - blkg_stat_add(&cfqg->stats.time, time); -#ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time); -#endif -} - -static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) -{ - blkg_rwstat_add(&cfqg->stats.queued, rw, -1); -} - -static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) -{ - blkg_rwstat_add(&cfqg->stats.merged, rw, 1); -} - -static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, - uint64_t bytes, int rw) -{ - blkg_stat_add(&cfqg->stats.sectors, bytes >> 9); - blkg_rwstat_add(&cfqg->stats.serviced, rw, 1); - blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes); -} - -static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, - uint64_t start_time, uint64_t io_start_time, int rw) -{ - struct cfqg_stats *stats = &cfqg->stats; - unsigned long long now = sched_clock(); - - if (time_after64(now, io_start_time)) - blkg_rwstat_add(&stats->service_time, rw, now - io_start_time); - if (time_after64(io_start_time, start_time)) - blkg_rwstat_add(&stats->wait_time, rw, - io_start_time - start_time); -} - -static void cfq_pd_reset_stats(struct blkcg_gq *blkg) -{ - struct cfq_group *cfqg = blkg_to_cfqg(blkg); - struct cfqg_stats *stats = &cfqg->stats; - - /* queued stats shouldn't be cleared */ - blkg_rwstat_reset(&stats->service_bytes); - blkg_rwstat_reset(&stats->serviced); - blkg_rwstat_reset(&stats->merged); - blkg_rwstat_reset(&stats->service_time); - blkg_rwstat_reset(&stats->wait_time); - blkg_stat_reset(&stats->time); -#ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_reset(&stats->unaccounted_time); - blkg_stat_reset(&stats->avg_queue_size_sum); - blkg_stat_reset(&stats->avg_queue_size_samples); - blkg_stat_reset(&stats->dequeue); - blkg_stat_reset(&stats->group_wait_time); - blkg_stat_reset(&stats->idle_time); - blkg_stat_reset(&stats->empty_time); -#endif -} - -#else /* CONFIG_CFQ_GROUP_IOSCHED */ - -static inline void cfqg_get(struct cfq_group *cfqg) { } -static inline void cfqg_put(struct cfq_group *cfqg) { } +#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ + blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ + blkg_path(&(cfqg)->blkg), ##args) \ +#else #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) 
do {} while (0) - -static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, - struct cfq_group *curr_cfqg, int rw) { } -static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, - unsigned long time, unsigned long unaccounted_time) { } -static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { } -static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { } -static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, - uint64_t bytes, int rw) { } -static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, - uint64_t start_time, uint64_t io_start_time, int rw) { } - -#endif /* CONFIG_CFQ_GROUP_IOSCHED */ - +#endif #define cfq_log(cfqd, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) @@ -771,9 +466,8 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, } static void cfq_dispatch_insert(struct request_queue *, struct request *); -static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync, - struct cfq_io_cq *cic, struct bio *bio, - gfp_t gfp_mask); +static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, + struct io_context *, gfp_t); static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) { @@ -851,7 +545,7 @@ static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) { u64 d = delta << CFQ_SERVICE_SHIFT; - d = d * CFQ_WEIGHT_DEFAULT; + d = d * BLKIO_WEIGHT_DEFAULT; do_div(d, cfqg->weight); return d; } @@ -1178,9 +872,9 @@ static void cfq_update_group_weight(struct cfq_group *cfqg) { BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); - if (cfqg->new_weight) { + if (cfqg->needs_update) { cfqg->weight = cfqg->new_weight; - cfqg->new_weight = 0; + cfqg->needs_update = false; } } @@ -1242,7 +936,7 @@ cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg) cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); cfq_group_service_tree_del(st, cfqg); cfqg->saved_workload_slice = 0; - cfqg_stats_update_dequeue(cfqg); + cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); } static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq, @@ -1314,59 +1008,178 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, "sl_used=%u disp=%u charge=%u iops=%u sect=%lu", used_sl, cfqq->slice_dispatch, charge, iops_mode(cfqd), cfqq->nr_sectors); - cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl); - cfqg_stats_set_start_empty_time(cfqg); + cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl, + unaccounted_sl); + cfq_blkiocg_set_start_empty_time(&cfqg->blkg); } -/** - * cfq_init_cfqg_base - initialize base part of a cfq_group - * @cfqg: cfq_group to initialize - * - * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED - * is enabled or not. +#ifdef CONFIG_CFQ_GROUP_IOSCHED +static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) +{ + if (blkg) + return container_of(blkg, struct cfq_group, blkg); + return NULL; +} + +static void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, + unsigned int weight) +{ + struct cfq_group *cfqg = cfqg_of_blkg(blkg); + cfqg->new_weight = weight; + cfqg->needs_update = true; +} + +static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd, + struct cfq_group *cfqg, struct blkio_cgroup *blkcg) +{ + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; + unsigned int major, minor; + + /* + * Add group onto cgroup list. It might happen that bdi->dev is + * not initialized yet. 
Initialize this new group without major + * and minor info and this info will be filled in once a new thread + * comes for IO. + */ + if (bdi->dev) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, + (void *)cfqd, MKDEV(major, minor)); + } else + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, + (void *)cfqd, 0); + + cfqd->nr_blkcg_linked_grps++; + cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); + + /* Add group on cfqd list */ + hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); +} + +/* + * Should be called from sleepable context. No request queue lock as per + * cpu stats are allocated dynamically and alloc_percpu needs to be called + * from sleepable context. */ -static void cfq_init_cfqg_base(struct cfq_group *cfqg) +static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd) { + struct cfq_group *cfqg = NULL; + int i, j, ret; struct cfq_rb_root *st; - int i, j; + + cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); + if (!cfqg) + return NULL; for_each_cfqg_st(cfqg, i, j, st) *st = CFQ_RB_ROOT; RB_CLEAR_NODE(&cfqg->rb_node); cfqg->ttime.last_end_request = jiffies; + + /* + * Take the initial reference that will be released on destroy. + * This can be thought of as a joint reference by cgroup and + * elevator which will be dropped by either elevator exit + * or cgroup deletion path depending on who is exiting first. + */ + cfqg->ref = 1; + + ret = blkio_alloc_blkg_stats(&cfqg->blkg); + if (ret) { + kfree(cfqg); + return NULL; + } + + return cfqg; } -#ifdef CONFIG_CFQ_GROUP_IOSCHED -static void cfq_pd_init(struct blkcg_gq *blkg) +static struct cfq_group * +cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); + struct cfq_group *cfqg = NULL; + void *key = cfqd; + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; + unsigned int major, minor; + + /* + * This is the common case when there are no blkio cgroups. + * Avoid lookup in this case + */ + if (blkcg == &blkio_root_cgroup) + cfqg = &cfqd->root_group; + else + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); + + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + cfqg->blkg.dev = MKDEV(major, minor); + } - cfq_init_cfqg_base(cfqg); - cfqg->weight = blkg->blkcg->cfq_weight; + return cfqg; } /* * Search for the cfq group current task belongs to. request_queue lock must * be held. */ -static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, - struct blkcg *blkcg) +static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) { + struct blkio_cgroup *blkcg; + struct cfq_group *cfqg = NULL, *__cfqg = NULL; struct request_queue *q = cfqd->queue; - struct cfq_group *cfqg = NULL; - /* avoid lookup for the common case where there's no blkcg */ - if (blkcg == &blkcg_root) { - cfqg = cfqd->root_group; - } else { - struct blkcg_gq *blkg; + rcu_read_lock(); + blkcg = task_blkio_cgroup(current); + cfqg = cfq_find_cfqg(cfqd, blkcg); + if (cfqg) { + rcu_read_unlock(); + return cfqg; + } + + /* + * Need to allocate a group. Allocation of group also needs allocation + * of per cpu stats which in turn takes a mutex() and can block. Hence + * we need to drop rcu lock and queue_lock before we call alloc. + * + * Not taking any queue reference here and assuming that queue is + * around by the time we return. CFQ queue allocation code does + * the same. It might be racy though. + */ + + rcu_read_unlock(); + spin_unlock_irq(q->queue_lock); - blkg = blkg_lookup_create(blkcg, q); - if (!IS_ERR(blkg)) - cfqg = blkg_to_cfqg(blkg); + cfqg = cfq_alloc_cfqg(cfqd); + + spin_lock_irq(q->queue_lock); + + rcu_read_lock(); + blkcg = task_blkio_cgroup(current); + + /* + * If some other thread already allocated the group while we were + * not holding the queue lock, free up the group + */ + __cfqg = cfq_find_cfqg(cfqd, blkcg); + + if (__cfqg) { + kfree(cfqg); + rcu_read_unlock(); + return __cfqg; } + if (!cfqg) + cfqg = &cfqd->root_group; + + cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg); + rcu_read_unlock(); + return cfqg; +} + +static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) +{ + cfqg->ref++; return cfqg; }
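cfq_get_cfqg() above, like throtl_get_tg() earlier in this patch, is the unlock/allocate/relock/recheck idiom: the per-cpu stats allocation can sleep, so the rcu lock and queue_lock are dropped around it, and after relocking the lookup is repeated because another task may have installed a group in the meantime, in which case the loser frees its own copy. A generic standalone sketch of the idiom (hypothetical obj/table names, a plain spinlock standing in for queue_lock):

#include <linux/slab.h>
#include <linux/spinlock.h>

struct obj {
	int key;
	struct obj *next;
};

static struct obj *table;		/* singly linked, protected by 'lock' */
static DEFINE_SPINLOCK(lock);

static struct obj *find(int key)
{
	struct obj *o;

	for (o = table; o; o = o->next)
		if (o->key == key)
			return o;
	return NULL;
}

/* Enter and leave with 'lock' held; may drop it to allocate. */
static struct obj *find_or_create(int key)
{
	struct obj *o, *winner;

	o = find(key);
	if (o)
		return o;

	/* Allocation may sleep, so drop the lock around it. */
	spin_unlock_irq(&lock);
	o = kmalloc(sizeof(*o), GFP_KERNEL);
	spin_lock_irq(&lock);

	/* Recheck: somebody may have installed the object meanwhile. */
	winner = find(key);
	if (winner) {
		kfree(o);		/* we lost the race; kfree(NULL) is ok */
		return winner;
	}
	if (!o)
		return NULL;		/* allocation failed */

	o->key = key;
	o->next = table;
	table = o;
	return o;
}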
@@ -1374,224 +1187,94 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { /* Currently, all async queues are mapped to root group */ if (!cfq_cfqq_sync(cfqq)) - cfqg = cfqq->cfqd->root_group; + cfqg = &cfqq->cfqd->root_group; cfqq->cfqg = cfqg; /* cfqq reference on cfqg */ - cfqg_get(cfqg); + cfqq->cfqg->ref++; } -static u64 cfqg_prfill_weight_device(struct seq_file *sf, - struct blkg_policy_data *pd, int off) +static void cfq_put_cfqg(struct cfq_group *cfqg) { - struct cfq_group *cfqg = pd_to_cfqg(pd); - - if (!cfqg->dev_weight) - return 0; - return __blkg_prfill_u64(sf, pd, cfqg->dev_weight); -} + struct cfq_rb_root *st; + int i, j; -static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) -{ - blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), - cfqg_prfill_weight_device, &blkcg_policy_cfq, 0, - false); - return 0; + BUG_ON(cfqg->ref <= 0); + cfqg->ref--; + if (cfqg->ref) + return; + for_each_cfqg_st(cfqg, i, j, st) + BUG_ON(!RB_EMPTY_ROOT(&st->rb)); + free_percpu(cfqg->blkg.stats_cpu); + kfree(cfqg); } -static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) { - seq_printf(sf, "%u\n", cgroup_to_blkcg(cgrp)->cfq_weight); - return 0; -} + /* Something wrong if we are trying to remove the same group twice */ + BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); -static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, - const char *buf) -{ - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); - struct blkg_conf_ctx ctx; - struct cfq_group *cfqg; - int ret; + hlist_del_init(&cfqg->cfqd_node); - ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); - if (ret) - return ret; + BUG_ON(cfqd->nr_blkcg_linked_grps <= 0); + cfqd->nr_blkcg_linked_grps--; - ret = -EINVAL; - cfqg = blkg_to_cfqg(ctx.blkg); - if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { - cfqg->dev_weight = ctx.v; - cfqg->new_weight = cfqg->dev_weight ?: blkcg->cfq_weight; - ret = 0; - } - - blkg_conf_finish(&ctx); - return ret; + /* + * Put the reference taken at the time of creation so that when all + * queues are gone, group can be destroyed.
+ */ + cfq_put_cfqg(cfqg); +} -static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) +static void cfq_release_cfq_groups(struct cfq_data *cfqd) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); - struct blkcg_gq *blkg; - struct hlist_node *n; - - if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) - return -EINVAL; - - spin_lock_irq(&blkcg->lock); - blkcg->cfq_weight = (unsigned int)val; - - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); + struct hlist_node *pos, *n; + struct cfq_group *cfqg; - if (cfqg && !cfqg->dev_weight) - cfqg->new_weight = blkcg->cfq_weight; + hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) { + /* + * If cgroup removal path got to the blkio_group first and removed + * it from the cgroup list, then it will take care of destroying + * the cfqg as well. + */ + if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg)) + cfq_destroy_cfqg(cfqd, cfqg); } - - spin_unlock_irq(&blkcg->lock); - return 0; -} - -static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) -{ - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); - - blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq, - cft->private, false); - return 0; } -static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +/* + * Blk cgroup controller notification saying that blkio_group object is being + * delinked as associated cgroup object is going away. That also means that + * no new IO will come into this group. So get rid of this group as soon as + * any pending IO in the group is finished. + * + * This function is called under rcu_read_lock(). key is the rcu protected + * pointer. That means "key" is a valid cfq_data pointer as long as we are + * under rcu read lock. + * + * "key" was fetched from blkio_group under blkio_cgroup->lock. That means + * it should not be NULL as even if elevator was exiting, cgroup deletion + * path got to it first.
+ */ +static void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + unsigned long flags; + struct cfq_data *cfqd = key; - blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq, - cft->private, true); - return 0; + spin_lock_irqsave(cfqd->queue->queue_lock, flags); + cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg)); + spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); } -#ifdef CONFIG_DEBUG_BLK_CGROUP -static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, - struct blkg_policy_data *pd, int off) +#else /* GROUP_IOSCHED */ +static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) { - struct cfq_group *cfqg = pd_to_cfqg(pd); - u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples); - u64 v = 0; - - if (samples) { - v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum); - do_div(v, samples); - } - __blkg_prfill_u64(sf, pd, v); - return 0; + return &cfqd->root_group; } -/* print avg_queue_size */ -static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); - - blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size, - &blkcg_policy_cfq, 0, false); - return 0; -} -#endif /* CONFIG_DEBUG_BLK_CGROUP */ - -static struct cftype cfq_blkcg_files[] = { - { - .name = "weight_device", - .read_seq_string = cfqg_print_weight_device, - .write_string = cfqg_set_weight_device, - .max_write_len = 256, - }, - { - .name = "weight", - .read_seq_string = cfq_print_weight, - .write_u64 = cfq_set_weight, - }, - { - .name = "time", - .private = offsetof(struct cfq_group, stats.time), - .read_seq_string = cfqg_print_stat, - }, - { - .name = "sectors", - .private = offsetof(struct cfq_group, stats.sectors), - .read_seq_string = cfqg_print_stat, - }, - { - .name = "io_service_bytes", - .private = offsetof(struct cfq_group, stats.service_bytes), - .read_seq_string = cfqg_print_rwstat, - }, - { - .name = "io_serviced", - .private = offsetof(struct cfq_group, stats.serviced), - .read_seq_string = cfqg_print_rwstat, - }, - { - .name = "io_service_time", - .private = offsetof(struct cfq_group, stats.service_time), - .read_seq_string = cfqg_print_rwstat, - }, - { - .name = "io_wait_time", - .private = offsetof(struct cfq_group, stats.wait_time), - .read_seq_string = cfqg_print_rwstat, - }, - { - .name = "io_merged", - .private = offsetof(struct cfq_group, stats.merged), - .read_seq_string = cfqg_print_rwstat, - }, - { - .name = "io_queued", - .private = offsetof(struct cfq_group, stats.queued), - .read_seq_string = cfqg_print_rwstat, - }, -#ifdef CONFIG_DEBUG_BLK_CGROUP - { - .name = "avg_queue_size", - .read_seq_string = cfqg_print_avg_queue_size, - }, - { - .name = "group_wait_time", - .private = offsetof(struct cfq_group, stats.group_wait_time), - .read_seq_string = cfqg_print_stat, - }, - { - .name = "idle_time", - .private = offsetof(struct cfq_group, stats.idle_time), - .read_seq_string = cfqg_print_stat, - }, - { - .name = "empty_time", - .private = offsetof(struct cfq_group, stats.empty_time), - .read_seq_string = cfqg_print_stat, - }, - { - .name = "dequeue", - .private = offsetof(struct cfq_group, stats.dequeue), - .read_seq_string = cfqg_print_stat, - }, - { - .name = "unaccounted_time", - .private = offsetof(struct cfq_group, stats.unaccounted_time), - .read_seq_string = cfqg_print_stat, - }, -#endif /* CONFIG_DEBUG_BLK_CGROUP */ - { } /* terminate */ -}; -#else /* 
GROUP_IOSCHED */ -static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, - struct blkcg *blkcg) -{ - return cfqd->root_group; + return cfqg; } static inline void @@ -1599,6 +1282,9 @@ cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { cfqq->cfqg = cfqg; } +static void cfq_release_cfq_groups(struct cfq_data *cfqd) {} +static inline void cfq_put_cfqg(struct cfq_group *cfqg) {} + #endif /* GROUP_IOSCHED */ /* @@ -1865,10 +1551,12 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) { elv_rb_del(&cfqq->sort_list, rq); cfqq->queued[rq_is_sync(rq)]--; - cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); + cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, + rq_data_dir(rq), rq_is_sync(rq)); cfq_add_rq_rb(rq); - cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group, - rq->cmd_flags); + cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, + &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq), + rq_is_sync(rq)); } static struct request * @@ -1924,7 +1612,8 @@ static void cfq_remove_request(struct request *rq) cfq_del_rq_rb(rq); cfqq->cfqd->rq_queued--; - cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); + cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, + rq_data_dir(rq), rq_is_sync(rq)); if (rq->cmd_flags & REQ_PRIO) { WARN_ON(!cfqq->prio_pending); cfqq->prio_pending--; @@ -1959,7 +1648,8 @@ static void cfq_merged_request(struct request_queue *q, struct request *req, static void cfq_bio_merged(struct request_queue *q, struct request *req, struct bio *bio) { - cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_rw); + cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, + bio_data_dir(bio), cfq_bio_sync(bio)); } static void @@ -1981,7 +1671,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, if (cfqq->next_rq == next) cfqq->next_rq = rq; cfq_remove_request(next); - cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags); + cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, + rq_data_dir(next), rq_is_sync(next)); cfqq = RQ_CFQQ(next); /* @@ -2022,7 +1713,7 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) { del_timer(&cfqd->idle_slice_timer); - cfqg_stats_update_idle_time(cfqq->cfqg); + cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg); } static void __cfq_set_active_queue(struct cfq_data *cfqd, @@ -2031,7 +1722,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, if (cfqq) { cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", cfqd->serving_prio, cfqd->serving_type); - cfqg_stats_update_avg_queue_size(cfqq->cfqg); + cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg); cfqq->slice_start = 0; cfqq->dispatch_start = jiffies; cfqq->allocated_slice = 0; @@ -2352,7 +2043,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) * task has exited, don't wait */ cic = cfqd->active_cic; - if (!cic || !atomic_read(&cic->icq.ioc->active_ref)) + if (!cic || !atomic_read(&cic->icq.ioc->nr_tasks)) return; /* @@ -2379,7 +2070,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) sl = cfqd->cfq_slice_idle; mod_timer(&cfqd->idle_slice_timer, jiffies + sl); - cfqg_stats_set_start_idle_time(cfqq->cfqg); + cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg); cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl, group_idle ? 
1 : 0); } @@ -2402,7 +2093,8 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; cfqq->nr_sectors += blk_rq_sectors(rq); - cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags); + cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq), + rq_data_dir(rq), rq_is_sync(rq)); } /* @@ -2985,7 +2677,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); kmem_cache_free(cfq_pool, cfqq); - cfqg_put(cfqg); + cfq_put_cfqg(cfqg); } static void cfq_put_cooperator(struct cfq_queue *cfqq) @@ -3044,7 +2736,7 @@ static void cfq_exit_icq(struct io_cq *icq) } } -static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic) +static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) { struct task_struct *tsk = current; int ioprio_class; @@ -3052,7 +2744,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic) if (!cfq_cfqq_prio_changed(cfqq)) return; - ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); + ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); switch (ioprio_class) { default: printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); @@ -3064,11 +2756,11 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic) cfqq->ioprio_class = task_nice_ioclass(tsk); break; case IOPRIO_CLASS_RT: - cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); + cfqq->ioprio = task_ioprio(ioc); cfqq->ioprio_class = IOPRIO_CLASS_RT; break; case IOPRIO_CLASS_BE: - cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); + cfqq->ioprio = task_ioprio(ioc); cfqq->ioprio_class = IOPRIO_CLASS_BE; break; case IOPRIO_CLASS_IDLE: @@ -3086,24 +2778,19 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic) cfq_clear_cfqq_prio_changed(cfqq); } -static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) +static void changed_ioprio(struct cfq_io_cq *cic) { - int ioprio = cic->icq.ioc->ioprio; struct cfq_data *cfqd = cic_to_cfqd(cic); struct cfq_queue *cfqq; - /* - * Check whether ioprio has changed. The condition may trigger - * spuriously on a newly created cic but there's no harm. - */ - if (unlikely(!cfqd) || likely(cic->ioprio == ioprio)) + if (unlikely(!cfqd)) return; cfqq = cic->cfqq[BLK_RW_ASYNC]; if (cfqq) { struct cfq_queue *new_cfqq; - new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, - GFP_ATOMIC); + new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->icq.ioc, + GFP_ATOMIC); if (new_cfqq) { cic->cfqq[BLK_RW_ASYNC] = new_cfqq; cfq_put_queue(cfqq); @@ -3113,8 +2800,6 @@ static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) cfqq = cic->cfqq[BLK_RW_SYNC]; if (cfqq) cfq_mark_cfqq_prio_changed(cfqq); - - cic->ioprio = ioprio; } static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, @@ -3138,24 +2823,17 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, } #ifdef CONFIG_CFQ_GROUP_IOSCHED -static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) +static void changed_cgroup(struct cfq_io_cq *cic) { + struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); struct cfq_data *cfqd = cic_to_cfqd(cic); - struct cfq_queue *sync_cfqq; - uint64_t id; + struct request_queue *q; - rcu_read_lock(); - id = bio_blkcg(bio)->id; - rcu_read_unlock(); - - /* - * Check whether blkcg has changed. The condition may trigger - * spuriously on a newly created cic but there's no harm. 
- */ - if (unlikely(!cfqd) || likely(cic->blkcg_id == id)) + if (unlikely(!cfqd)) return; - sync_cfqq = cic_to_cfqq(cic, 1); + q = cfqd->queue; + if (sync_cfqq) { /* * Drop reference to sync queue. A new sync queue will be @@ -3165,26 +2843,21 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) cic_set_cfqq(cic, NULL, 1); cfq_put_queue(sync_cfqq); } - - cic->blkcg_id = id; } -#else -static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { } #endif /* CONFIG_CFQ_GROUP_IOSCHED */ static struct cfq_queue * -cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, - struct bio *bio, gfp_t gfp_mask) +cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, + struct io_context *ioc, gfp_t gfp_mask) { - struct blkcg *blkcg; struct cfq_queue *cfqq, *new_cfqq = NULL; + struct cfq_io_cq *cic; struct cfq_group *cfqg; retry: - rcu_read_lock(); - - blkcg = bio_blkcg(bio); - cfqg = cfq_lookup_create_cfqg(cfqd, blkcg); + cfqg = cfq_get_cfqg(cfqd); + cic = cfq_cic_lookup(cfqd, ioc); + /* cic always exists here */ cfqq = cic_to_cfqq(cic, is_sync); /* @@ -3197,7 +2870,6 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, cfqq = new_cfqq; new_cfqq = NULL; } else if (gfp_mask & __GFP_WAIT) { - rcu_read_unlock(); spin_unlock_irq(cfqd->queue->queue_lock); new_cfqq = kmem_cache_alloc_node(cfq_pool, gfp_mask | __GFP_ZERO, @@ -3213,7 +2885,7 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, if (cfqq) { cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); - cfq_init_prio_data(cfqq, cic); + cfq_init_prio_data(cfqq, ioc); cfq_link_cfqq_cfqg(cfqq, cfqg); cfq_log_cfqq(cfqd, cfqq, "alloced"); } else @@ -3223,7 +2895,6 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, if (new_cfqq) kmem_cache_free(cfq_pool, new_cfqq); - rcu_read_unlock(); return cfqq; } @@ -3233,9 +2904,6 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) switch (ioprio_class) { case IOPRIO_CLASS_RT: return &cfqd->async_cfqq[0][ioprio]; - case IOPRIO_CLASS_NONE: - ioprio = IOPRIO_NORM; - /* fall through */ case IOPRIO_CLASS_BE: return &cfqd->async_cfqq[1][ioprio]; case IOPRIO_CLASS_IDLE: @@ -3246,11 +2914,11 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) } static struct cfq_queue * -cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, - struct bio *bio, gfp_t gfp_mask) +cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, + gfp_t gfp_mask) { - const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); - const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); + const int ioprio = task_ioprio(ioc); + const int ioprio_class = task_ioprio_class(ioc); struct cfq_queue **async_cfqq = NULL; struct cfq_queue *cfqq = NULL; @@ -3260,7 +2928,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, } if (!cfqq) - cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask); + cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask); /* * pin the queue now that it's allocated, scheduler exit will prune it @@ -3342,7 +3010,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) enable_idle = 0; - else if (!atomic_read(&cic->icq.ioc->active_ref) || + else if (!atomic_read(&cic->icq.ioc->nr_tasks) || !cfqd->cfq_slice_idle || (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) enable_idle = 0; @@ -3506,7 +3174,8 @@ 
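The recurring stats hunks in this file all make the same swap: the cmd_flags-based helpers (cfqg_stats_update_*) are replaced by the older wrappers that take an explicit blkio_group plus direction/sync booleans. A minimal sketch of the restored calling convention as it would sit inside cfq-iosched.c, using only helpers that appear in this patch (rq_data_dir(), rq_is_sync(), and cfq_blkiocg_update_io_add_stats() from the block/cfq.h added further down); the function name account_io_add is invented for illustration:

/* Sketch: derive the (direction, sync) pair the restored helpers expect.
 * rq_data_dir(rq) is 0 for reads and 1 for writes; rq_is_sync(rq) is a
 * bool. The code being reverted packed both into rq->cmd_flags instead. */
static void account_io_add(struct cfq_queue *cfqq, struct request *rq)
{
	cfq_blkiocg_update_io_add_stats(&cfqq->cfqg->blkg,
					&cfqq->cfqd->serving_group->blkg,
					rq_data_dir(rq), rq_is_sync(rq));
}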
cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_clear_cfqq_wait_request(cfqq); __blk_run_queue(cfqd->queue); } else { - cfqg_stats_update_idle_time(cfqq->cfqg); + cfq_blkiocg_update_idle_time_stats( + &cfqq->cfqg->blkg); cfq_mark_cfqq_must_dispatch(cfqq); } } @@ -3528,13 +3197,14 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) struct cfq_queue *cfqq = RQ_CFQQ(rq); cfq_log_cfqq(cfqd, cfqq, "insert_request"); - cfq_init_prio_data(cfqq, RQ_CIC(rq)); + cfq_init_prio_data(cfqq, RQ_CIC(rq)->icq.ioc); rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); list_add_tail(&rq->queuelist, &cfqq->fifo); cfq_add_rq_rb(rq); - cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, - rq->cmd_flags); + cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, + &cfqd->serving_group->blkg, rq_data_dir(rq), + rq_is_sync(rq)); cfq_rq_enqueued(cfqd, cfqq, rq); } @@ -3630,8 +3300,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfqd->rq_in_driver--; cfqq->dispatched--; (RQ_CFQG(rq))->dispatched--; - cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq), - rq_io_start_time_ns(rq), rq->cmd_flags); + cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg, + rq_start_time_ns(rq), rq_io_start_time_ns(rq), + rq_data_dir(rq), rq_is_sync(rq)); cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; @@ -3728,7 +3399,7 @@ static int cfq_may_queue(struct request_queue *q, int rw) cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); if (cfqq) { - cfq_init_prio_data(cfqq, cic); + cfq_init_prio_data(cfqq, cic->icq.ioc); return __cfq_may_queue(cfqq); } @@ -3750,7 +3421,7 @@ static void cfq_put_request(struct request *rq) cfqq->allocated[rw]--; /* Put down rq reference on cfqg */ - cfqg_put(RQ_CFQG(rq)); + cfq_put_cfqg(RQ_CFQG(rq)); rq->elv.priv[0] = NULL; rq->elv.priv[1] = NULL; @@ -3794,25 +3465,32 @@ split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq) * Allocate cfq data structures associated with this request. 
*/ static int -cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, - gfp_t gfp_mask) +cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) { struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq); const int rw = rq_data_dir(rq); const bool is_sync = rq_is_sync(rq); struct cfq_queue *cfqq; + unsigned int changed; might_sleep_if(gfp_mask & __GFP_WAIT); spin_lock_irq(q->queue_lock); - check_ioprio_changed(cic, bio); - check_blkcg_changed(cic, bio); + /* handle changed notifications */ + changed = icq_get_changed(&cic->icq); + if (unlikely(changed & ICQ_IOPRIO_CHANGED)) + changed_ioprio(cic); +#ifdef CONFIG_CFQ_GROUP_IOSCHED + if (unlikely(changed & ICQ_CGROUP_CHANGED)) + changed_cgroup(cic); +#endif + new_queue: cfqq = cic_to_cfqq(cic, is_sync); if (!cfqq || cfqq == &cfqd->oom_cfqq) { - cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask); + cfqq = cfq_get_queue(cfqd, is_sync, cic->icq.ioc, gfp_mask); cic_set_cfqq(cic, cfqq, is_sync); } else { /* @@ -3838,9 +3516,8 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, cfqq->allocated[rw]++; cfqq->ref++; - cfqg_get(cfqq->cfqg); rq->elv.priv[0] = cfqq; - rq->elv.priv[1] = cfqq->cfqg; + rq->elv.priv[1] = cfq_ref_get_cfqg(cfqq->cfqg); spin_unlock_irq(q->queue_lock); return 0; } @@ -3937,6 +3614,7 @@ static void cfq_exit_queue(struct elevator_queue *e) { struct cfq_data *cfqd = e->elevator_data; struct request_queue *q = cfqd->queue; + bool wait = false; cfq_shutdown_timer_wq(cfqd); @@ -3946,52 +3624,89 @@ static void cfq_exit_queue(struct elevator_queue *e) __cfq_slice_expired(cfqd, cfqd->active_queue, 0); cfq_put_async_queues(cfqd); + cfq_release_cfq_groups(cfqd); + + /* + * If there are groups which we could not unlink from blkcg list, + * wait for a rcu period for them to be freed. + */ + if (cfqd->nr_blkcg_linked_grps) + wait = true; spin_unlock_irq(q->queue_lock); cfq_shutdown_timer_wq(cfqd); -#ifndef CONFIG_CFQ_GROUP_IOSCHED - kfree(cfqd->root_group); + /* + * Wait for cfqg->blkg->key accessors to exit their grace periods. + * Do this wait only if there are other unlinked groups out + * there. This can happen if cgroup deletion path claimed the + * responsibility of cleaning up a group before queue cleanup code + * get to the group. + * + * Do not call synchronize_rcu() unconditionally as there are drivers + * which create/delete request queue hundreds of times during scan/boot + * and synchronize_rcu() can take significant time and slow down boot. 
+ */ + if (wait) + synchronize_rcu(); + +#ifdef CONFIG_CFQ_GROUP_IOSCHED + /* Free up per cpu stats for root group */ + free_percpu(cfqd->root_group.blkg.stats_cpu); #endif - blkcg_deactivate_policy(q, &blkcg_policy_cfq); kfree(cfqd); } -static int cfq_init_queue(struct request_queue *q) +static void *cfq_init_queue(struct request_queue *q) { struct cfq_data *cfqd; - struct blkcg_gq *blkg __maybe_unused; - int i, ret; + int i, j; + struct cfq_group *cfqg; + struct cfq_rb_root *st; cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); if (!cfqd) - return -ENOMEM; - - cfqd->queue = q; - q->elevator->elevator_data = cfqd; + return NULL; /* Init root service tree */ cfqd->grp_service_tree = CFQ_RB_ROOT; - /* Init root group and prefer root group over other groups by default */ + /* Init root group */ + cfqg = &cfqd->root_group; + for_each_cfqg_st(cfqg, i, j, st) + *st = CFQ_RB_ROOT; + RB_CLEAR_NODE(&cfqg->rb_node); + + /* Give preference to root group over other groups */ + cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; + #ifdef CONFIG_CFQ_GROUP_IOSCHED - ret = blkcg_activate_policy(q, &blkcg_policy_cfq); - if (ret) - goto out_free; + /* + * Set root group reference to 2. One reference will be dropped when + * all groups on cfqd->cfqg_list are being deleted during queue exit. + * Other reference will remain there as we don't want to delete this + * group as it is statically allocated and gets destroyed when + * throtl_data goes away. + */ + cfqg->ref = 2; - cfqd->root_group = blkg_to_cfqg(q->root_blkg); -#else - ret = -ENOMEM; - cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group), - GFP_KERNEL, cfqd->queue->node); - if (!cfqd->root_group) - goto out_free; + if (blkio_alloc_blkg_stats(&cfqg->blkg)) { + kfree(cfqg); + kfree(cfqd); + return NULL; + } - cfq_init_cfqg_base(cfqd->root_group); -#endif - cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT; + rcu_read_lock(); + + cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, + (void *)cfqd, 0); + rcu_read_unlock(); + cfqd->nr_blkcg_linked_grps++; + /* Add group on cfqd->cfqg_list */ + hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); +#endif /* * Not strictly needed (since RB_ROOT just clears the node and we * zeroed cfqd on alloc), but better be safe in case someone decides @@ -4003,17 +3718,13 @@ static int cfq_init_queue(struct request_queue *q) /* * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. * Grab a permanent reference to it, so that the normal code flow - * will not attempt to free it. oom_cfqq is linked to root_group - * but shouldn't hold a reference as it'll never be unlinked. Lose - * the reference from linking right away. + * will not attempt to free it. */ cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); cfqd->oom_cfqq.ref++; + cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); - spin_lock_irq(q->queue_lock); - cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group); - cfqg_put(cfqd->root_group); - spin_unlock_irq(q->queue_lock); + cfqd->queue = q; init_timer(&cfqd->idle_slice_timer); cfqd->idle_slice_timer.function = cfq_idle_slice_timer; @@ -4039,11 +3750,7 @@ static int cfq_init_queue(struct request_queue *q) * second, in order to have larger depth for async operations. 
*/ cfqd->last_delayed_sync = jiffies - HZ; - return 0; - -out_free: - kfree(cfqd); - return ret; + return cfqd; } /* @@ -4170,13 +3877,15 @@ static struct elevator_type iosched_cfq = { }; #ifdef CONFIG_CFQ_GROUP_IOSCHED -static struct blkcg_policy blkcg_policy_cfq = { - .pd_size = sizeof(struct cfq_group), - .cftypes = cfq_blkcg_files, - - .pd_init_fn = cfq_pd_init, - .pd_reset_stats_fn = cfq_pd_reset_stats, +static struct blkio_policy_type blkio_policy_cfq = { + .ops = { + .blkio_unlink_group_fn = cfq_unlink_blkio_group, + .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, + }, + .plid = BLKIO_POLICY_PROP, }; +#else +static struct blkio_policy_type blkio_policy_cfq; #endif static int __init cfq_init(void) @@ -4197,31 +3906,24 @@ static int __init cfq_init(void) #else cfq_group_idle = 0; #endif - - ret = blkcg_policy_register(&blkcg_policy_cfq); - if (ret) - return ret; - cfq_pool = KMEM_CACHE(cfq_queue, 0); if (!cfq_pool) - goto err_pol_unreg; + return -ENOMEM; ret = elv_register(&iosched_cfq); - if (ret) - goto err_free_pool; + if (ret) { + kmem_cache_destroy(cfq_pool); + return ret; + } - return 0; + blkio_policy_register(&blkio_policy_cfq); -err_free_pool: - kmem_cache_destroy(cfq_pool); -err_pol_unreg: - blkcg_policy_unregister(&blkcg_policy_cfq); - return ret; + return 0; } static void __exit cfq_exit(void) { - blkcg_policy_unregister(&blkcg_policy_cfq); + blkio_policy_unregister(&blkio_policy_cfq); elv_unregister(&iosched_cfq); kmem_cache_destroy(cfq_pool); } diff --git a/trunk/block/cfq.h b/trunk/block/cfq.h new file mode 100644 index 000000000000..2a155927e37c --- /dev/null +++ b/trunk/block/cfq.h @@ -0,0 +1,115 @@ +#ifndef _CFQ_H +#define _CFQ_H +#include "blk-cgroup.h" + +#ifdef CONFIG_CFQ_GROUP_IOSCHED +static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, bool sync) +{ + blkiocg_update_io_add_stats(blkg, curr_blkg, direction, sync); +} + +static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) +{ + blkiocg_update_dequeue_stats(blkg, dequeue); +} + +static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, + unsigned long time, unsigned long unaccounted_time) +{ + blkiocg_update_timeslice_used(blkg, time, unaccounted_time); +} + +static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) +{ + blkiocg_set_start_empty_time(blkg); +} + +static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, + bool direction, bool sync) +{ + blkiocg_update_io_remove_stats(blkg, direction, sync); +} + +static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg, + bool direction, bool sync) +{ + blkiocg_update_io_merged_stats(blkg, direction, sync); +} + +static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg) +{ + blkiocg_update_idle_time_stats(blkg); +} + +static inline void +cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) +{ + blkiocg_update_avg_queue_size_stats(blkg); +} + +static inline void +cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) +{ + blkiocg_update_set_idle_time_stats(blkg); +} + +static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg, + uint64_t bytes, bool direction, bool sync) +{ + blkiocg_update_dispatch_stats(blkg, bytes, direction, sync); +} + +static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, 
bool sync) +{ + blkiocg_update_completion_stats(blkg, start_time, io_start_time, + direction, sync); +} + +static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key, dev_t dev) { + blkiocg_add_blkio_group(blkcg, blkg, key, dev, BLKIO_POLICY_PROP); +} + +static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) +{ + return blkiocg_del_blkio_group(blkg); +} + +#else /* CFQ_GROUP_IOSCHED */ +static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, bool sync) {} + +static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) {} + +static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, + unsigned long time, unsigned long unaccounted_time) {} +static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {} +static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, + bool direction, bool sync) {} +static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg, + bool direction, bool sync) {} +static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg) +{ +} +static inline void +cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) {} + +static inline void +cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) {} + +static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg, + uint64_t bytes, bool direction, bool sync) {} +static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) {} + +static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key, dev_t dev) {} +static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) +{ + return 0; +} + +#endif /* CFQ_GROUP_IOSCHED */ +#endif diff --git a/trunk/block/deadline-iosched.c b/trunk/block/deadline-iosched.c index 599b12e5380f..7bf12d793fcd 100644 --- a/trunk/block/deadline-iosched.c +++ b/trunk/block/deadline-iosched.c @@ -337,13 +337,13 @@ static void deadline_exit_queue(struct elevator_queue *e) /* * initialize elevator private data (deadline_data). 
*/ -static int deadline_init_queue(struct request_queue *q) +static void *deadline_init_queue(struct request_queue *q) { struct deadline_data *dd; dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node); if (!dd) - return -ENOMEM; + return NULL; INIT_LIST_HEAD(&dd->fifo_list[READ]); INIT_LIST_HEAD(&dd->fifo_list[WRITE]); @@ -354,9 +354,7 @@ static int deadline_init_queue(struct request_queue *q) dd->writes_starved = writes_starved; dd->front_merges = 1; dd->fifo_batch = fifo_batch; - - q->elevator->elevator_data = dd; - return 0; + return dd; } /* diff --git a/trunk/block/elevator.c b/trunk/block/elevator.c index 6a55d418896f..f016855a46b0 100644 --- a/trunk/block/elevator.c +++ b/trunk/block/elevator.c @@ -38,7 +38,6 @@ #include #include "blk.h" -#include "blk-cgroup.h" static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); @@ -122,6 +121,15 @@ static struct elevator_type *elevator_get(const char *name) return e; } +static int elevator_init_queue(struct request_queue *q, + struct elevator_queue *eq) +{ + eq->elevator_data = eq->type->ops.elevator_init_fn(q); + if (eq->elevator_data) + return 0; + return -ENOMEM; +} + static char chosen_elevator[ELV_NAME_MAX]; static int __init elevator_setup(char *str) @@ -180,6 +188,7 @@ static void elevator_release(struct kobject *kobj) int elevator_init(struct request_queue *q, char *name) { struct elevator_type *e = NULL; + struct elevator_queue *eq; int err; if (unlikely(q->elevator)) @@ -213,16 +222,17 @@ int elevator_init(struct request_queue *q, char *name) } } - q->elevator = elevator_alloc(q, e); - if (!q->elevator) + eq = elevator_alloc(q, e); + if (!eq) return -ENOMEM; - err = e->ops.elevator_init_fn(q); + err = elevator_init_queue(q, eq); if (err) { - kobject_put(&q->elevator->kobj); + kobject_put(&eq->kobj); return err; } + q->elevator = eq; return 0; } EXPORT_SYMBOL(elevator_init); @@ -554,6 +564,25 @@ void elv_drain_elevator(struct request_queue *q) } } +void elv_quiesce_start(struct request_queue *q) +{ + if (!q->elevator) + return; + + spin_lock_irq(q->queue_lock); + queue_flag_set(QUEUE_FLAG_ELVSWITCH, q); + spin_unlock_irq(q->queue_lock); + + blk_drain_queue(q, false); +} + +void elv_quiesce_end(struct request_queue *q) +{ + spin_lock_irq(q->queue_lock); + queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); + spin_unlock_irq(q->queue_lock); +} + void __elv_add_request(struct request_queue *q, struct request *rq, int where) { trace_block_rq_insert(q, rq); @@ -663,13 +692,12 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq) return NULL; } -int elv_set_request(struct request_queue *q, struct request *rq, - struct bio *bio, gfp_t gfp_mask) +int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) { struct elevator_queue *e = q->elevator; if (e->type->ops.elevator_set_req_fn) - return e->type->ops.elevator_set_req_fn(q, rq, bio, gfp_mask); + return e->type->ops.elevator_set_req_fn(q, rq, gfp_mask); return 0; } @@ -773,9 +801,8 @@ static struct kobj_type elv_ktype = { .release = elevator_release, }; -int elv_register_queue(struct request_queue *q) +int __elv_register_queue(struct request_queue *q, struct elevator_queue *e) { - struct elevator_queue *e = q->elevator; int error; error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); @@ -793,6 +820,11 @@ int elv_register_queue(struct request_queue *q) } return error; } + +int elv_register_queue(struct request_queue *q) +{ + return __elv_register_queue(q, q->elevator); +} EXPORT_SYMBOL(elv_register_queue); void 
elv_unregister_queue(struct request_queue *q) @@ -875,60 +907,53 @@ EXPORT_SYMBOL_GPL(elv_unregister); */ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { - struct elevator_queue *old = q->elevator; - bool registered = old->registered; + struct elevator_queue *old_elevator, *e; int err; - /* - * Turn on BYPASS and drain all requests w/ elevator private data. - * Block layer doesn't call into a quiesced elevator - all requests - * are directly put on the dispatch list without elevator data - * using INSERT_BACK. All requests have SOFTBARRIER set and no - * merge happens either. - */ - blk_queue_bypass_start(q); - - /* unregister and clear all auxiliary data of the old elevator */ - if (registered) - elv_unregister_queue(q); - - spin_lock_irq(q->queue_lock); - ioc_clear_queue(q); - spin_unlock_irq(q->queue_lock); - - /* allocate, init and register new elevator */ - err = -ENOMEM; - q->elevator = elevator_alloc(q, new_e); - if (!q->elevator) - goto fail_init; + /* allocate new elevator */ + e = elevator_alloc(q, new_e); + if (!e) + return -ENOMEM; - err = new_e->ops.elevator_init_fn(q); + err = elevator_init_queue(q, e); if (err) { - kobject_put(&q->elevator->kobj); - goto fail_init; + kobject_put(&e->kobj); + return err; } - if (registered) { - err = elv_register_queue(q); + /* turn on BYPASS and drain all requests w/ elevator private data */ + elv_quiesce_start(q); + + /* unregister old queue, register new one and kill old elevator */ + if (q->elevator->registered) { + elv_unregister_queue(q); + err = __elv_register_queue(q, e); if (err) goto fail_register; } - /* done, kill the old one and finish */ - elevator_exit(old); - blk_queue_bypass_end(q); + /* done, clear io_cq's, switch elevators and turn off BYPASS */ + spin_lock_irq(q->queue_lock); + ioc_clear_queue(q); + old_elevator = q->elevator; + q->elevator = e; + spin_unlock_irq(q->queue_lock); + + elevator_exit(old_elevator); + elv_quiesce_end(q); - blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); + blk_add_trace_msg(q, "elv switch: %s", e->type->elevator_name); return 0; fail_register: - elevator_exit(q->elevator); -fail_init: - /* switch failed, restore and re-register old elevator */ - q->elevator = old; + /* + * switch failed, exit the new io scheduler and reattach the old + * one again (along with re-adding the sysfs dir) + */ + elevator_exit(e); elv_register_queue(q); - blk_queue_bypass_end(q); + elv_quiesce_end(q); return err; } diff --git a/trunk/block/noop-iosched.c b/trunk/block/noop-iosched.c index 5d1bf70e33d5..413a0b1d788c 100644 --- a/trunk/block/noop-iosched.c +++ b/trunk/block/noop-iosched.c @@ -59,17 +59,15 @@ noop_latter_request(struct request_queue *q, struct request *rq) return list_entry(rq->queuelist.next, struct request, queuelist); } -static int noop_init_queue(struct request_queue *q) +static void *noop_init_queue(struct request_queue *q) { struct noop_data *nd; nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); if (!nd) - return -ENOMEM; - + return NULL; INIT_LIST_HEAD(&nd->queue); - q->elevator->elevator_data = nd; - return 0; + return nd; } static void noop_exit_queue(struct elevator_queue *e) diff --git a/trunk/drivers/block/drbd/drbd_actlog.c b/trunk/drivers/block/drbd/drbd_actlog.c index e54e31b02b88..cf0e63dd97da 100644 --- a/trunk/drivers/block/drbd/drbd_actlog.c +++ b/trunk/drivers/block/drbd/drbd_actlog.c @@ -65,80 +65,39 @@ struct drbd_atodb_wait { int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int); -void *drbd_md_get_buffer(struct 
drbd_conf *mdev) -{ - int r; - - wait_event(mdev->misc_wait, - (r = atomic_cmpxchg(&mdev->md_io_in_use, 0, 1)) == 0 || - mdev->state.disk <= D_FAILED); - - return r ? NULL : page_address(mdev->md_io_page); -} - -void drbd_md_put_buffer(struct drbd_conf *mdev) -{ - if (atomic_dec_and_test(&mdev->md_io_in_use)) - wake_up(&mdev->misc_wait); -} - -static bool md_io_allowed(struct drbd_conf *mdev) -{ - enum drbd_disk_state ds = mdev->state.disk; - return ds >= D_NEGOTIATING || ds == D_ATTACHING; -} - -void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, - unsigned int *done) -{ - long dt = bdev->dc.disk_timeout * HZ / 10; - if (dt == 0) - dt = MAX_SCHEDULE_TIMEOUT; - - dt = wait_event_timeout(mdev->misc_wait, *done || !md_io_allowed(mdev), dt); - if (dt == 0) - dev_err(DEV, "meta-data IO operation timed out\n"); -} - static int _drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, struct page *page, sector_t sector, int rw, int size) { struct bio *bio; + struct drbd_md_io md_io; int ok; - mdev->md_io.done = 0; - mdev->md_io.error = -ENODEV; + md_io.mdev = mdev; + init_completion(&md_io.event); + md_io.error = 0; if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags)) rw |= REQ_FUA | REQ_FLUSH; rw |= REQ_SYNC; - bio = bio_alloc_drbd(GFP_NOIO); + bio = bio_alloc(GFP_NOIO, 1); bio->bi_bdev = bdev->md_bdev; bio->bi_sector = sector; ok = (bio_add_page(bio, page, size, 0) == size); if (!ok) goto out; - bio->bi_private = &mdev->md_io; + bio->bi_private = &md_io; bio->bi_end_io = drbd_md_io_complete; bio->bi_rw = rw; - if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */ - dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); - ok = 0; - goto out; - } - - bio_get(bio); /* one bio_put() is in the completion handler */ - atomic_inc(&mdev->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */ if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) bio_endio(bio, -EIO); else submit_bio(rw, bio); - wait_until_done_or_disk_failure(mdev, bdev, &mdev->md_io.done); - ok = bio_flagged(bio, BIO_UPTODATE) && mdev->md_io.error == 0; + wait_for_completion(&md_io.event); + ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0; out: bio_put(bio); @@ -152,7 +111,7 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int offset = 0; struct page *iop = mdev->md_io_page; - D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1); + D_ASSERT(mutex_is_locked(&mdev->md_io_mutex)); BUG_ON(!bdev->md_bdev); @@ -369,13 +328,8 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) return 1; } - buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */ - if (!buffer) { - dev_err(DEV, "disk failed while waiting for md_io buffer\n"); - complete(&((struct update_al_work *)w)->event); - put_ldev(mdev); - return 1; - } + mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... 
*/ + buffer = (struct al_transaction *)page_address(mdev->md_io_page); buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC); buffer->tr_number = cpu_to_be32(mdev->al_tr_number); @@ -420,7 +374,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE); mdev->al_tr_number++; - drbd_md_put_buffer(mdev); + mutex_unlock(&mdev->md_io_mutex); complete(&((struct update_al_work *)w)->event); put_ldev(mdev); @@ -489,9 +443,8 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) /* lock out all other meta data io for now, * and make sure the page is mapped. */ - buffer = drbd_md_get_buffer(mdev); - if (!buffer) - return 0; + mutex_lock(&mdev->md_io_mutex); + buffer = page_address(mdev->md_io_page); /* Find the valid transaction in the log */ for (i = 0; i <= mx; i++) { @@ -499,7 +452,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) if (rv == 0) continue; if (rv == -1) { - drbd_md_put_buffer(mdev); + mutex_unlock(&mdev->md_io_mutex); return 0; } cnr = be32_to_cpu(buffer->tr_number); @@ -525,7 +478,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) if (!found_valid) { dev_warn(DEV, "No usable activity log found.\n"); - drbd_md_put_buffer(mdev); + mutex_unlock(&mdev->md_io_mutex); return 1; } @@ -540,7 +493,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) rv = drbd_al_read_tr(mdev, bdev, buffer, i); ERR_IF(rv == 0) goto cancel; if (rv == -1) { - drbd_md_put_buffer(mdev); + mutex_unlock(&mdev->md_io_mutex); return 0; } @@ -581,7 +534,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) mdev->al_tr_pos = 0; /* ok, we are done with it */ - drbd_md_put_buffer(mdev); + mutex_unlock(&mdev->md_io_mutex); dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n", transactions, active_extents); @@ -718,20 +671,16 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector, else ext->rs_failed += count; if (ext->rs_left < ext->rs_failed) { - dev_warn(DEV, "BAD! sector=%llus enr=%u rs_left=%d " - "rs_failed=%d count=%d cstate=%s\n", + dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d " + "rs_failed=%d count=%d\n", (unsigned long long)sector, ext->lce.lc_number, ext->rs_left, - ext->rs_failed, count, - drbd_conn_str(mdev->state.conn)); - - /* We don't expect to be able to clear more bits - * than have been set when we originally counted - * the set bits to cache that value in ext->rs_left. - * Whatever the reason (disconnect during resync, - * delayed local completion of an application write), - * try to fix it up by recounting here. 
*/ - ext->rs_left = drbd_bm_e_weight(mdev, enr); + ext->rs_failed, count); + dump_stack(); + + lc_put(mdev->resync, &ext->lce); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return; } } else { /* Normally this element should be in the cache, @@ -1243,7 +1192,6 @@ int drbd_rs_del_all(struct drbd_conf *mdev) put_ldev(mdev); } spin_unlock_irq(&mdev->al_lock); - wake_up(&mdev->al_wait); return 0; } diff --git a/trunk/drivers/block/drbd/drbd_bitmap.c b/trunk/drivers/block/drbd/drbd_bitmap.c index b5c5ff53cb57..3030201c69d8 100644 --- a/trunk/drivers/block/drbd/drbd_bitmap.c +++ b/trunk/drivers/block/drbd/drbd_bitmap.c @@ -205,7 +205,7 @@ void drbd_bm_unlock(struct drbd_conf *mdev) static void bm_store_page_idx(struct page *page, unsigned long idx) { BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK)); - set_page_private(page, idx); + page_private(page) |= idx; } static unsigned long bm_page_to_idx(struct page *page) @@ -886,21 +886,12 @@ void drbd_bm_clear_all(struct drbd_conf *mdev) struct bm_aio_ctx { struct drbd_conf *mdev; atomic_t in_flight; - unsigned int done; + struct completion done; unsigned flags; #define BM_AIO_COPY_PAGES 1 int error; - struct kref kref; }; -static void bm_aio_ctx_destroy(struct kref *kref) -{ - struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref); - - put_ldev(ctx->mdev); - kfree(ctx); -} - /* bv_page may be a copy, or may be the original */ static void bm_async_io_complete(struct bio *bio, int error) { @@ -939,21 +930,20 @@ static void bm_async_io_complete(struct bio *bio, int error) bm_page_unlock_io(mdev, idx); + /* FIXME give back to page pool */ if (ctx->flags & BM_AIO_COPY_PAGES) - mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool); + put_page(bio->bi_io_vec[0].bv_page); bio_put(bio); - if (atomic_dec_and_test(&ctx->in_flight)) { - ctx->done = 1; - wake_up(&mdev->misc_wait); - kref_put(&ctx->kref, &bm_aio_ctx_destroy); - } + if (atomic_dec_and_test(&ctx->in_flight)) + complete(&ctx->done); } static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local) { - struct bio *bio = bio_alloc_drbd(GFP_NOIO); + /* we are process context. we always get a bio */ + struct bio *bio = bio_alloc(GFP_KERNEL, 1); struct drbd_conf *mdev = ctx->mdev; struct drbd_bitmap *b = mdev->bitmap; struct page *page; @@ -976,8 +966,10 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must bm_set_page_unchanged(b->bm_pages[page_nr]); if (ctx->flags & BM_AIO_COPY_PAGES) { + /* FIXME alloc_page is good enough for now, but actually needs + * to use pre-allocated page pool */ void *src, *dest; - page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT); + page = alloc_page(__GFP_HIGHMEM|__GFP_WAIT); dest = kmap_atomic(page); src = kmap_atomic(b->bm_pages[page_nr]); memcpy(dest, src, PAGE_SIZE); @@ -989,8 +981,6 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must bio->bi_bdev = mdev->ldev->md_bdev; bio->bi_sector = on_disk_sector; - /* bio_add_page of a single page to an empty bio will always succeed, - * according to api. Do we want to assert that? */ bio_add_page(bio, page, len, 0); bio->bi_private = ctx; bio->bi_end_io = bm_async_io_complete; @@ -1009,9 +999,14 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must /* * bm_rw: read/write the whole bitmap from/to its on disk location. 
*/ -static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_writeout_upper_idx) __must_hold(local) +static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_idx) __must_hold(local) { - struct bm_aio_ctx *ctx; + struct bm_aio_ctx ctx = { + .mdev = mdev, + .in_flight = ATOMIC_INIT(1), + .done = COMPLETION_INITIALIZER_ONSTACK(ctx.done), + .flags = lazy_writeout_upper_idx ? BM_AIO_COPY_PAGES : 0, + }; struct drbd_bitmap *b = mdev->bitmap; int num_pages, i, count = 0; unsigned long now; @@ -1026,27 +1021,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w * For lazy writeout, we don't care for ongoing changes to the bitmap, * as we submit copies of pages anyways. */ - - ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO); - if (!ctx) - return -ENOMEM; - - *ctx = (struct bm_aio_ctx) { - .mdev = mdev, - .in_flight = ATOMIC_INIT(1), - .done = 0, - .flags = flags, - .error = 0, - .kref = { ATOMIC_INIT(2) }, - }; - - if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */ - dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n"); - kfree(ctx); - return -ENODEV; - } - - if (!ctx->flags) + if (!ctx.flags) WARN_ON(!(BM_LOCKED_MASK & b->bm_flags)); num_pages = b->bm_number_of_pages; @@ -1071,38 +1046,29 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w continue; } } - atomic_inc(&ctx->in_flight); - bm_page_io_async(ctx, i, rw); + atomic_inc(&ctx.in_flight); + bm_page_io_async(&ctx, i, rw); ++count; cond_resched(); } /* - * We initialize ctx->in_flight to one to make sure bm_async_io_complete - * will not set ctx->done early, and decrement / test it here. If there + * We initialize ctx.in_flight to one to make sure bm_async_io_complete + * will not complete() early, and decrement / test it here. If there * are still some bios in flight, we need to wait for them here. - * If all IO is done already (or nothing had been submitted), there is - * no need to wait. Still, we need to put the kref associated with the - * "in_flight reached zero, all done" event. */ - if (!atomic_dec_and_test(&ctx->in_flight)) - wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done); - else - kref_put(&ctx->kref, &bm_aio_ctx_destroy); - + if (!atomic_dec_and_test(&ctx.in_flight)) + wait_for_completion(&ctx.done); dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n", rw == WRITE ? "WRITE" : "READ", count, jiffies - now); - if (ctx->error) { + if (ctx.error) { dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n"); drbd_chk_io_error(mdev, 1, true); - err = -EIO; /* ctx->error ? */ + err = -EIO; /* ctx.error ? */ } - if (atomic_read(&ctx->in_flight)) - err = -EIO; /* Disk failed during IO... 
*/ - now = jiffies; if (rw == WRITE) { drbd_md_flush(mdev); @@ -1116,7 +1082,6 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); - kref_put(&ctx->kref, &bm_aio_ctx_destroy); return err; } @@ -1126,7 +1091,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w */ int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local) { - return bm_rw(mdev, READ, 0, 0); + return bm_rw(mdev, READ, 0); } /** @@ -1137,7 +1102,7 @@ int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local) */ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) { - return bm_rw(mdev, WRITE, 0, 0); + return bm_rw(mdev, WRITE, 0); } /** @@ -1147,23 +1112,7 @@ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) */ int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local) { - return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, upper_idx); -} - -/** - * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location. - * @mdev: DRBD device. - * - * Will only write pages that have changed since last IO. - * In contrast to drbd_bm_write(), this will copy the bitmap pages - * to temporary writeout pages. It is intended to trigger a full write-out - * while still allowing the bitmap to change, for example if a resync or online - * verify is aborted due to a failed peer disk, while local IO continues, or - * pending resync acks are still being processed. - */ -int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local) -{ - return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0); + return bm_rw(mdev, WRITE, upper_idx); } @@ -1181,45 +1130,28 @@ int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local) */ int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local) { - struct bm_aio_ctx *ctx; - int err; - - if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) { - dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx); - return 0; - } - - ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO); - if (!ctx) - return -ENOMEM; - - *ctx = (struct bm_aio_ctx) { + struct bm_aio_ctx ctx = { .mdev = mdev, .in_flight = ATOMIC_INIT(1), - .done = 0, + .done = COMPLETION_INITIALIZER_ONSTACK(ctx.done), .flags = BM_AIO_COPY_PAGES, - .error = 0, - .kref = { ATOMIC_INIT(2) }, }; - if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */ - dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in drbd_bm_write_page()\n"); - kfree(ctx); - return -ENODEV; + if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) { + dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx); + return 0; } - bm_page_io_async(ctx, idx, WRITE_SYNC); - wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done); + bm_page_io_async(&ctx, idx, WRITE_SYNC); + wait_for_completion(&ctx.done); - if (ctx->error) + if (ctx.error) drbd_chk_io_error(mdev, 1, true); /* that should force detach, so the in memory bitmap will be * gone in a moment as well. */ mdev->bm_writ_cnt++; - err = atomic_read(&ctx->in_flight) ? 
-EIO : ctx->error; - kref_put(&ctx->kref, &bm_aio_ctx_destroy); - return err; + return ctx.error; } /* NOTE diff --git a/trunk/drivers/block/drbd/drbd_int.h b/trunk/drivers/block/drbd/drbd_int.h index 02f013a073a7..8d680562ba73 100644 --- a/trunk/drivers/block/drbd/drbd_int.h +++ b/trunk/drivers/block/drbd/drbd_int.h @@ -712,6 +712,7 @@ struct drbd_request { struct list_head tl_requests; /* ring list in the transfer log */ struct bio *master_bio; /* master bio pointer */ unsigned long rq_state; /* see comments above _req_mod() */ + int seq_num; unsigned long start_time; }; @@ -850,7 +851,6 @@ enum { NEW_CUR_UUID, /* Create new current UUID when thawing IO */ AL_SUSPENDED, /* Activity logging is currently suspended. */ AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */ - STATE_SENT, /* Do not change state/UUIDs while this is set */ }; struct drbd_bitmap; /* opaque for drbd_conf */ @@ -862,30 +862,31 @@ enum bm_flag { BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */ /* currently locked for bulk operation */ - BM_LOCKED_MASK = 0xf, + BM_LOCKED_MASK = 0x7, /* in detail, that is: */ BM_DONT_CLEAR = 0x1, BM_DONT_SET = 0x2, BM_DONT_TEST = 0x4, - /* so we can mark it locked for bulk operation, - * and still allow all non-bulk operations */ - BM_IS_LOCKED = 0x8, - /* (test bit, count bit) allowed (common case) */ - BM_LOCKED_TEST_ALLOWED = BM_DONT_CLEAR | BM_DONT_SET | BM_IS_LOCKED, + BM_LOCKED_TEST_ALLOWED = 0x3, /* testing bits, as well as setting new bits allowed, but clearing bits * would be unexpected. Used during bitmap receive. Setting new bits * requires sending of "out-of-sync" information, though. */ - BM_LOCKED_SET_ALLOWED = BM_DONT_CLEAR | BM_IS_LOCKED, + BM_LOCKED_SET_ALLOWED = 0x1, - /* for drbd_bm_write_copy_pages, everything is allowed, - * only concurrent bulk operations are locked out. */ - BM_LOCKED_CHANGE_ALLOWED = BM_IS_LOCKED, + /* clear is not expected while bitmap is locked for bulk operation */ }; + +/* TODO sort members for performance + * MAYBE group them further */ + +/* THINK maybe we actually want to use the default "event/%s" worker threads + * or similar in linux 2.6, which uses per cpu data and threads. 
+ */ struct drbd_work_queue { struct list_head q; struct semaphore s; /* producers up it, worker down()s it */ @@ -937,7 +938,8 @@ struct drbd_backing_dev { }; struct drbd_md_io { - unsigned int done; + struct drbd_conf *mdev; + struct completion event; int error; }; @@ -1020,7 +1022,6 @@ struct drbd_conf { struct drbd_tl_epoch *newest_tle; struct drbd_tl_epoch *oldest_tle; struct list_head out_of_sequence_requests; - struct list_head barrier_acked_requests; struct hlist_head *tl_hash; unsigned int tl_hash_s; @@ -1055,8 +1056,6 @@ struct drbd_conf { struct crypto_hash *csums_tfm; struct crypto_hash *verify_tfm; - unsigned long last_reattach_jif; - unsigned long last_reconnect_jif; struct drbd_thread receiver; struct drbd_thread worker; struct drbd_thread asender; @@ -1095,8 +1094,7 @@ struct drbd_conf { wait_queue_head_t ee_wait; struct page *md_io_page; /* one page buffer for md_io */ struct page *md_io_tmpp; /* for logical_block_size != 512 */ - struct drbd_md_io md_io; - atomic_t md_io_in_use; /* protects the md_io, md_io_page and md_io_tmpp */ + struct mutex md_io_mutex; /* protects the md_io_buffer */ spinlock_t al_lock; wait_queue_head_t al_wait; struct lru_cache *act_log; /* activity log */ @@ -1230,8 +1228,8 @@ extern int drbd_send_uuids(struct drbd_conf *mdev); extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); extern int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev); extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags); -extern int drbd_send_state(struct drbd_conf *mdev, union drbd_state s); -extern int drbd_send_current_state(struct drbd_conf *mdev); +extern int _drbd_send_state(struct drbd_conf *mdev); +extern int drbd_send_state(struct drbd_conf *mdev); extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, enum drbd_packets cmd, struct p_header80 *h, size_t size, unsigned msg_flags); @@ -1463,7 +1461,6 @@ extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr); extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local); extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); -extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local); extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr); extern size_t drbd_bm_words(struct drbd_conf *mdev); @@ -1496,38 +1493,11 @@ extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ extern mempool_t *drbd_request_mempool; extern mempool_t *drbd_ee_mempool; -/* drbd's page pool, used to buffer data received from the peer, - * or data requested by the peer. - * - * This does not have an emergency reserve. - * - * When allocating from this pool, it first takes pages from the pool. - * Only if the pool is depleted will try to allocate from the system. - * - * The assumption is that pages taken from this pool will be processed, - * and given back, "quickly", and then can be recycled, so we can avoid - * frequent calls to alloc_page(), and still will be able to make progress even - * under memory pressure. - */ -extern struct page *drbd_pp_pool; +extern struct page *drbd_pp_pool; /* drbd's page pool */ extern spinlock_t drbd_pp_lock; extern int drbd_pp_vacant; extern wait_queue_head_t drbd_pp_wait; -/* We also need a standard (emergency-reserve backed) page pool - * for meta data IO (activity log, bitmap). 
- * We can keep it global, as long as it is used as "N pages at a time". - * 128 should be plenty, currently we probably can get away with as few as 1. - */ -#define DRBD_MIN_POOL_PAGES 128 -extern mempool_t *drbd_md_io_page_pool; - -/* We also need to make sure we get a bio - * when we need it for housekeeping purposes */ -extern struct bio_set *drbd_md_io_bio_set; -/* to allocate from that set */ -extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); - extern rwlock_t global_state_lock; extern struct drbd_conf *drbd_new_device(unsigned int minor); @@ -1566,12 +1536,8 @@ extern void resume_next_sg(struct drbd_conf *mdev); extern void suspend_other_sg(struct drbd_conf *mdev); extern int drbd_resync_finished(struct drbd_conf *mdev); /* maybe rather drbd_main.c ? */ -extern void *drbd_md_get_buffer(struct drbd_conf *mdev); -extern void drbd_md_put_buffer(struct drbd_conf *mdev); extern int drbd_md_sync_page_io(struct drbd_conf *mdev, - struct drbd_backing_dev *bdev, sector_t sector, int rw); -extern void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, - unsigned int *done); + struct drbd_backing_dev *bdev, sector_t sector, int rw); extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int); extern void drbd_rs_controller_reset(struct drbd_conf *mdev); @@ -1788,6 +1754,19 @@ static inline struct page *page_chain_next(struct page *page) #define page_chain_for_each_safe(page, n) \ for (; page && ({ n = page_chain_next(page); 1; }); page = n) +static inline int drbd_bio_has_active_page(struct bio *bio) +{ + struct bio_vec *bvec; + int i; + + __bio_for_each_segment(bvec, bio, i, 0) { + if (page_count(bvec->bv_page) > 1) + return 1; + } + + return 0; +} + static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e) { struct page *page = e->pages; @@ -1798,6 +1777,7 @@ static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e) return 0; } + static inline void drbd_state_lock(struct drbd_conf *mdev) { wait_event(mdev->misc_wait, @@ -2250,7 +2230,7 @@ static inline void drbd_get_syncer_progress(struct drbd_conf *mdev, * Note: currently we don't support such large bitmaps on 32bit * arch anyways, but no harm done to be prepared for it here. */ - unsigned int shift = mdev->rs_total > UINT_MAX ? 16 : 10; + unsigned int shift = mdev->rs_total >= (1ULL << 32) ? 16 : 10; unsigned long left = *bits_left >> shift; unsigned long total = 1UL + (mdev->rs_total >> shift); unsigned long tmp = 1000UL - left * 1000UL/total; @@ -2326,12 +2306,12 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev) case D_OUTDATED: case D_CONSISTENT: case D_UP_TO_DATE: - case D_FAILED: /* disk state is stable as well. */ break; /* no new io accepted during tansitional states */ case D_ATTACHING: + case D_FAILED: case D_NEGOTIATING: case D_UNKNOWN: case D_MASK: diff --git a/trunk/drivers/block/drbd/drbd_main.c b/trunk/drivers/block/drbd/drbd_main.c index 920ede2829d6..211fc44f84be 100644 --- a/trunk/drivers/block/drbd/drbd_main.c +++ b/trunk/drivers/block/drbd/drbd_main.c @@ -139,8 +139,6 @@ struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ mempool_t *drbd_request_mempool; mempool_t *drbd_ee_mempool; -mempool_t *drbd_md_io_page_pool; -struct bio_set *drbd_md_io_bio_set; /* I do not use a standard mempool, because: 1) I want to hand out the pre-allocated objects first. 
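The drbd_get_syncer_progress() hunk above restores the rs_total >= (1ULL << 32) spelling of the overflow guard. The pre-shift matters because the surrounding code computes tenths of a percent as left * 1000 / total in unsigned long arithmetic, which a 32-bit host would overflow for very large bitmaps; scaling both operands down by 10 (or 16) bits first keeps the product in range. A self-contained illustration of the same arithmetic (plain C; permille_done is an invented name, and as the kernel comment notes, bitmaps large enough to need shift 16 are not expected on 32-bit hosts anyway):

#include <stdint.h>
#include <stdio.h>

/* Scale both counters down before the permille division so that
 * left * 1000 cannot overflow 32-bit unsigned long arithmetic. */
static unsigned long permille_done(uint64_t bits_left, uint64_t rs_total)
{
	unsigned int shift = rs_total >= (1ULL << 32) ? 16 : 10;
	unsigned long left  = bits_left >> shift;
	unsigned long total = 1UL + (unsigned long)(rs_total >> shift);

	return 1000UL - left * 1000UL / total;
}

int main(void)
{
	/* half of an 8-gigabit bitmap still to go: prints roughly 500 */
	printf("%lu/1000 done\n", permille_done(1ULL << 32, 1ULL << 33));
	return 0;
}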
@@ -161,24 +159,7 @@ static const struct block_device_operations drbd_ops = { .release = drbd_release, }; -static void bio_destructor_drbd(struct bio *bio) -{ - bio_free(bio, drbd_md_io_bio_set); -} - -struct bio *bio_alloc_drbd(gfp_t gfp_mask) -{ - struct bio *bio; - - if (!drbd_md_io_bio_set) - return bio_alloc(gfp_mask, 1); - - bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set); - if (!bio) - return NULL; - bio->bi_destructor = bio_destructor_drbd; - return bio; -} +#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0])) #ifdef __CHECKER__ /* When checking with sparse, and this is an inline function, sparse will @@ -227,7 +208,6 @@ static int tl_init(struct drbd_conf *mdev) mdev->oldest_tle = b; mdev->newest_tle = b; INIT_LIST_HEAD(&mdev->out_of_sequence_requests); - INIT_LIST_HEAD(&mdev->barrier_acked_requests); mdev->tl_hash = NULL; mdev->tl_hash_s = 0; @@ -266,7 +246,9 @@ void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new) new->n_writes = 0; newest_before = mdev->newest_tle; - new->br_number = newest_before->br_number+1; + /* never send a barrier number == 0, because that is special-cased + * when using TCQ for our write ordering code */ + new->br_number = (newest_before->br_number+1) ?: 1; if (mdev->newest_tle != new) { mdev->newest_tle->next = new; mdev->newest_tle = new; @@ -329,7 +311,7 @@ void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, These have been list_move'd to the out_of_sequence_requests list in _req_mod(, barrier_acked) above. */ - list_splice_init(&b->requests, &mdev->barrier_acked_requests); + list_del_init(&b->requests); nob = b->next; if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { @@ -429,23 +411,6 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) b = tmp; list_splice(&carry_reads, &b->requests); } - - /* Actions operating on the disk state, also want to work on - requests that got barrier acked. */ - switch (what) { - case fail_frozen_disk_io: - case restart_frozen_disk_io: - list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { - req = list_entry(le, struct drbd_request, tl_requests); - _req_mod(req, what); - } - - case connection_lost_while_pending: - case resend: - break; - default: - dev_err(DEV, "what = %d in _tl_restart()\n", what); - } } @@ -492,38 +457,6 @@ void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) spin_unlock_irq(&mdev->req_lock); } -/** - * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL - * @mdev: DRBD device. - */ -void tl_abort_disk_io(struct drbd_conf *mdev) -{ - struct drbd_tl_epoch *b; - struct list_head *le, *tle; - struct drbd_request *req; - - spin_lock_irq(&mdev->req_lock); - b = mdev->oldest_tle; - while (b) { - list_for_each_safe(le, tle, &b->requests) { - req = list_entry(le, struct drbd_request, tl_requests); - if (!(req->rq_state & RQ_LOCAL_PENDING)) - continue; - _req_mod(req, abort_disk_io); - } - b = b->next; - } - - list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { - req = list_entry(le, struct drbd_request, tl_requests); - if (!(req->rq_state & RQ_LOCAL_PENDING)) - continue; - _req_mod(req, abort_disk_io); - } - - spin_unlock_irq(&mdev->req_lock); -} - /** * cl_wide_st_chg() - true if the state change is a cluster wide one * @mdev: DRBD device. 
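One restored line in _tl_add_barrier() above is easy to misread: new->br_number = (newest_before->br_number+1) ?: 1; uses the GNU ?: shorthand so that a wrapped 32-bit counter skips the reserved value 0, which the write-ordering code special-cases. Longhand equivalent (a sketch; next_barrier_number is an invented name):

/* a ?: b is GCC shorthand for a ? a : b, with a evaluated once.
 * After 0xffffffff the increment yields 0, which must never be sent
 * as a barrier number, so allocation skips straight to 1. */
static unsigned int next_barrier_number(unsigned int prev)
{
	unsigned int next = prev + 1;

	return next ? next : 1;		/* same effect as (prev + 1) ?: 1 */
}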
@@ -537,7 +470,7 @@ static int cl_wide_st_chg(struct drbd_conf *mdev, ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || - (os.disk != D_FAILED && ns.disk == D_FAILED))) || + (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) || (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S); } @@ -576,16 +509,8 @@ static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state); static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *, union drbd_state, union drbd_state); -enum sanitize_state_warnings { - NO_WARNING, - ABORTED_ONLINE_VERIFY, - ABORTED_RESYNC, - CONNECTION_LOST_NEGOTIATING, - IMPLICITLY_UPGRADED_DISK, - IMPLICITLY_UPGRADED_PDSK, -}; static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, enum sanitize_state_warnings *warn); + union drbd_state ns, const char **warn_sync_abort); int drbd_send_state_req(struct drbd_conf *, union drbd_state, union drbd_state); @@ -860,13 +785,6 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns, if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) rv = SS_IN_TRANSIENT_STATE; - /* While establishing a connection only allow cstate to change. - Delay/refuse role changes, detach attach etc... */ - if (test_bit(STATE_SENT, &mdev->flags) && - !(os.conn == C_WF_REPORT_PARAMS || - (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION))) - rv = SS_IN_TRANSIENT_STATE; - if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) rv = SS_NEED_CONNECTION; @@ -885,21 +803,6 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns, return rv; } -static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn) -{ - static const char *msg_table[] = { - [NO_WARNING] = "", - [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.", - [ABORTED_RESYNC] = "Resync aborted.", - [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!", - [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk", - [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk", - }; - - if (warn != NO_WARNING) - dev_warn(DEV, "%s\n", msg_table[warn]); -} - /** * sanitize_state() - Resolves implicitly necessary additional changes to a state transition * @mdev: DRBD device. @@ -911,14 +814,11 @@ static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_ * to D_UNKNOWN. This rule and many more along those lines are in this function. */ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, enum sanitize_state_warnings *warn) + union drbd_state ns, const char **warn_sync_abort) { enum drbd_fencing_p fp; enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; - if (warn) - *warn = NO_WARNING; - fp = FP_DONT_CARE; if (get_ldev(mdev)) { fp = mdev->ldev->dc.fencing; @@ -933,13 +833,18 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow. * If you try to go into some Sync* state, that shall fail (elsewhere). 
*/ if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN && - ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED) + ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN) ns.conn = os.conn; /* we cannot fail (again) if we already detached */ if (ns.disk == D_FAILED && os.disk == D_DISKLESS) ns.disk = D_DISKLESS; + /* if we are only D_ATTACHING yet, + * we can (and should) go directly to D_DISKLESS. */ + if (ns.disk == D_FAILED && os.disk == D_ATTACHING) + ns.disk = D_DISKLESS; + /* After C_DISCONNECTING only C_STANDALONE may follow */ if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) ns.conn = os.conn; @@ -958,9 +863,10 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state /* Abort resync if a disk fails/detaches */ if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { - if (warn) - *warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ? - ABORTED_ONLINE_VERIFY : ABORTED_RESYNC; + if (warn_sync_abort) + *warn_sync_abort = + os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ? + "Online-verify" : "Resync"; ns.conn = C_CONNECTED; } @@ -971,8 +877,7 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns.disk = mdev->new_state_tmp.disk; ns.pdsk = mdev->new_state_tmp.pdsk; } else { - if (warn) - *warn = CONNECTION_LOST_NEGOTIATING; + dev_alert(DEV, "Connection lost while negotiating, no data!\n"); ns.disk = D_DISKLESS; ns.pdsk = D_UNKNOWN; } @@ -1054,16 +959,16 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns.disk = disk_max; if (ns.disk < disk_min) { - if (warn) - *warn = IMPLICITLY_UPGRADED_DISK; + dev_warn(DEV, "Implicitly set disk from %s to %s\n", + drbd_disk_str(ns.disk), drbd_disk_str(disk_min)); ns.disk = disk_min; } if (ns.pdsk > pdsk_max) ns.pdsk = pdsk_max; if (ns.pdsk < pdsk_min) { - if (warn) - *warn = IMPLICITLY_UPGRADED_PDSK; + dev_warn(DEV, "Implicitly set pdsk from %s to %s\n", + drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min)); ns.pdsk = pdsk_min; } @@ -1140,12 +1045,12 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, { union drbd_state os; enum drbd_state_rv rv = SS_SUCCESS; - enum sanitize_state_warnings ssw; + const char *warn_sync_abort = NULL; struct after_state_chg_work *ascw; os = mdev->state; - ns = sanitize_state(mdev, os, ns, &ssw); + ns = sanitize_state(mdev, os, ns, &warn_sync_abort); if (ns.i == os.i) return SS_NOTHING_TO_DO; @@ -1171,7 +1076,8 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, return rv; } - print_sanitize_warnings(mdev, ssw); + if (warn_sync_abort) + dev_warn(DEV, "%s aborted.\n", warn_sync_abort); { char *pbp, pb[300]; @@ -1337,7 +1243,7 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, drbd_thread_stop_nowait(&mdev->receiver); /* Upon network failure, we need to restart the receiver. 
*/ - if (os.conn > C_WF_CONNECTION && + if (os.conn > C_TEAR_DOWN && ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) drbd_thread_restart_nowait(&mdev->receiver); @@ -1345,15 +1251,6 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) drbd_resume_al(mdev); - /* remember last connect and attach times so request_timer_fn() won't - * kill newly established sessions while we are still trying to thaw - * previously frozen IO */ - if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS) - mdev->last_reconnect_jif = jiffies; - if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && - ns.disk > D_NEGOTIATING) - mdev->last_reattach_jif = jiffies; - ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); if (ascw) { ascw->os = os; @@ -1457,16 +1354,12 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Here we have the actions that are performed after a state change. This function might sleep */ - if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING) - mod_timer(&mdev->request_timer, jiffies + HZ); - nsm.i = -1; if (ns.susp_nod) { if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) what = resend; - if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && - ns.disk > D_NEGOTIATING) + if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING) what = restart_frozen_disk_io; if (what != nothing) @@ -1515,7 +1408,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Do not change the order of the if above and the two below... */ if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ drbd_send_uuids(mdev); - drbd_send_state(mdev, ns); + drbd_send_state(mdev); } /* No point in queuing send_bitmap if we don't have a connection * anymore, so check also the _current_ state, not only the new state @@ -1548,11 +1441,11 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, } if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { - if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && - mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { + if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) { drbd_uuid_new_current(mdev); drbd_send_uuids(mdev); } + /* D_DISKLESS Peer becomes secondary */ if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) /* We may still be Primary ourselves. @@ -1580,14 +1473,14 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { drbd_send_sizes(mdev, 0, 0); /* to start sync... */ drbd_send_uuids(mdev); - drbd_send_state(mdev, ns); + drbd_send_state(mdev); } /* We want to pause/continue resync, tell peer. */ if (ns.conn >= C_CONNECTED && ((os.aftr_isp != ns.aftr_isp) || (os.user_isp != ns.user_isp))) - drbd_send_state(mdev, ns); + drbd_send_state(mdev); /* In case one of the isp bits got set, suspend other devices. */ if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && @@ -1597,10 +1490,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Make sure the peer gets informed about eventual state changes (ISP bits) while we were in WFReportParams. */ if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) - drbd_send_state(mdev, ns); + drbd_send_state(mdev); if (os.conn != C_AHEAD && ns.conn == C_AHEAD) - drbd_send_state(mdev, ns); + drbd_send_state(mdev); /* We are in the progress to start a full sync... 
*/ if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || @@ -1620,38 +1513,33 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* first half of local IO error, failure to attach, * or administrative detach */ if (os.disk != D_FAILED && ns.disk == D_FAILED) { - enum drbd_io_error_p eh = EP_PASS_ON; - int was_io_error = 0; + enum drbd_io_error_p eh; + int was_io_error; /* corresponding get_ldev was in __drbd_set_state, to serialize - * our cleanup here with the transition to D_DISKLESS. - * But is is still not save to dreference ldev here, since - * we might come from an failed Attach before ldev was set. */ - if (mdev->ldev) { - eh = mdev->ldev->dc.on_io_error; - was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); - - /* Immediately allow completion of all application IO, that waits - for completion from the local disk. */ - tl_abort_disk_io(mdev); - - /* current state still has to be D_FAILED, - * there is only one way out: to D_DISKLESS, - * and that may only happen after our put_ldev below. */ - if (mdev->state.disk != D_FAILED) - dev_err(DEV, - "ASSERT FAILED: disk is %s during detach\n", - drbd_disk_str(mdev->state.disk)); - - if (ns.conn >= C_CONNECTED) - drbd_send_state(mdev, ns); - - drbd_rs_cancel_all(mdev); - - /* In case we want to get something to stable storage still, - * this may be the last chance. - * Following put_ldev may transition to D_DISKLESS. */ - drbd_md_sync(mdev); - } + * our cleanup here with the transition to D_DISKLESS, + * so it is safe to dreference ldev here. */ + eh = mdev->ldev->dc.on_io_error; + was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); + + /* current state still has to be D_FAILED, + * there is only one way out: to D_DISKLESS, + * and that may only happen after our put_ldev below. */ + if (mdev->state.disk != D_FAILED) + dev_err(DEV, + "ASSERT FAILED: disk is %s during detach\n", + drbd_disk_str(mdev->state.disk)); + + if (drbd_send_state(mdev)) + dev_warn(DEV, "Notified peer that I am detaching my disk\n"); + else + dev_err(DEV, "Sending state for detaching disk failed\n"); + + drbd_rs_cancel_all(mdev); + + /* In case we want to get something to stable storage still, + * this may be the last chance. + * Following put_ldev may transition to D_DISKLESS. */ + drbd_md_sync(mdev); put_ldev(mdev); if (was_io_error && eh == EP_CALL_HELPER) @@ -1673,17 +1561,16 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, mdev->rs_failed = 0; atomic_set(&mdev->rs_pending_cnt, 0); - if (ns.conn >= C_CONNECTED) - drbd_send_state(mdev, ns); - + if (drbd_send_state(mdev)) + dev_warn(DEV, "Notified peer that I'm now diskless.\n"); /* corresponding get_ldev in __drbd_set_state * this may finally trigger drbd_ldev_destroy. */ put_ldev(mdev); } /* Notify peer that I had a local IO error, and did not detached.. */ - if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED) - drbd_send_state(mdev, ns); + if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT) + drbd_send_state(mdev); /* Disks got bigger while they were detached */ if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && @@ -1701,13 +1588,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* sync target done with resync. Explicitly notify peer, even though * it should (at least for non-empty resyncs) already know itself. 
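
The detach hunk above drops the `if (mdev->ldev)` guard; the reverted code assumes ldev is always valid in the D_FAILED handler, while the removed code allowed for a failed attach where ldev was never set. A sketch of the guarded form, with the types cut down to the essentials:

    #include <stdio.h>

    struct ldev_demo { int on_io_error; };
    struct dev_demo  { struct ldev_demo *ldev; };

    static void handle_disk_failed(struct dev_demo *d)
    {
        int eh = 0;     /* safe default when there is no backing device */

        if (d->ldev) {
            eh = d->ldev->on_io_error;
            /* ... abort disk IO, sync metadata, etc. ... */
        }
        printf("io-error policy: %d\n", eh);
    }

    int main(void)
    {
        struct dev_demo failed_attach = { .ldev = NULL };

        handle_disk_failed(&failed_attach);     /* must not dereference NULL */
        return 0;
    }
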
*/ if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) - drbd_send_state(mdev, ns); - - /* Wake up role changes, that were delayed because of connection establishing */ - if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) { - clear_bit(STATE_SENT, &mdev->flags); - wake_up(&mdev->state_wait); - } + drbd_send_state(mdev); /* This triggers bitmap writeout of potentially still unwritten pages * if the resync finished cleanly, or aborted because of peer disk @@ -1717,8 +1598,8 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, * No harm done if some bits change during this phase. */ if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) { - drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL, - "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED); + drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, + "write from resync_finished", BM_LOCKED_SET_ALLOWED); put_ldev(mdev); } @@ -2176,11 +2057,7 @@ int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev) D_ASSERT(mdev->state.disk == D_UP_TO_DATE); - uuid = mdev->ldev->md.uuid[UI_BITMAP]; - if (uuid && uuid != UUID_JUST_CREATED) - uuid = uuid + UUID_NEW_BM_OFFSET; - else - get_random_bytes(&uuid, sizeof(u64)); + uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET; drbd_uuid_set(mdev, UI_BITMAP, uuid); drbd_print_uuids(mdev, "updated sync UUID"); drbd_md_sync(mdev); @@ -2212,10 +2089,6 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */ } - /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */ - if (mdev->agreed_pro_version <= 94) - max_bio_size = min_t(int, max_bio_size, DRBD_MAX_SIZE_H80_PACKET); - p.d_size = cpu_to_be64(d_size); p.u_size = cpu_to_be64(u_size); p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); @@ -2229,10 +2102,10 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl } /** - * drbd_send_current_state() - Sends the drbd state to the peer + * drbd_send_state() - Sends the drbd state to the peer * @mdev: DRBD device. */ -int drbd_send_current_state(struct drbd_conf *mdev) +int drbd_send_state(struct drbd_conf *mdev) { struct socket *sock; struct p_state p; @@ -2258,37 +2131,6 @@ int drbd_send_current_state(struct drbd_conf *mdev) return ok; } -/** - * drbd_send_state() - After a state change, sends the new state to the peer - * @mdev: DRBD device. - * @state: the state to send, not necessarily the current state. - * - * Each state change queues an "after_state_ch" work, which will eventually - * send the resulting new state to the peer. If more state changes happen - * between queuing and processing of the after_state_ch work, we still - * want to send each intermediary state in the order it occurred. 
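
The kernel-doc just above belongs to the drbd_send_state() variant deleted in full just below; it takes the state as an explicit argument so each intermediary state goes out in the order it occurred. A userspace sketch (stand-in types, printf for the wire) of why a snapshot beats re-reading the live state:

    #include <stdint.h>
    #include <stdio.h>

    union state_demo { uint32_t i; };               /* stand-in for union drbd_state */
    struct dev_demo { union state_demo state; };    /* stand-in for drbd_conf */

    /* racy variant: reads the live state and can skip intermediary states */
    static void send_current_state(const struct dev_demo *d)
    {
        printf("wire <- 0x%08x\n", d->state.i);
    }

    /* ordered variant: sends the snapshot captured when the work was queued */
    static void send_state(union state_demo snap)
    {
        printf("wire <- 0x%08x\n", snap.i);
    }

    int main(void)
    {
        struct dev_demo d = { .state = { .i = 0x1 } };
        union state_demo snap = d.state;    /* snapshot at queue time */

        d.state.i = 0x2;                    /* a second change lands first */
        send_current_state(&d);             /* 0x2: state 0x1 is never sent */
        send_state(snap);                   /* 0x1: intermediary preserved */
        return 0;
    }
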
- */ -int drbd_send_state(struct drbd_conf *mdev, union drbd_state state) -{ - struct socket *sock; - struct p_state p; - int ok = 0; - - mutex_lock(&mdev->data.mutex); - - p.state = cpu_to_be32(state.i); - sock = mdev->data.socket; - - if (likely(sock != NULL)) { - ok = _drbd_send_cmd(mdev, sock, P_STATE, - (struct p_header80 *)&p, sizeof(p), 0); - } - - mutex_unlock(&mdev->data.mutex); - - return ok; -} - int drbd_send_state_req(struct drbd_conf *mdev, union drbd_state mask, union drbd_state val) { @@ -2773,7 +2615,7 @@ static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) struct bio_vec *bvec; int i; /* hint all but last page with MSG_MORE */ - bio_for_each_segment(bvec, bio, i) { + __bio_for_each_segment(bvec, bio, i, 0) { if (!_drbd_no_send_page(mdev, bvec->bv_page, bvec->bv_offset, bvec->bv_len, i == bio->bi_vcnt -1 ? 0 : MSG_MORE)) @@ -2787,7 +2629,7 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) struct bio_vec *bvec; int i; /* hint all but last page with MSG_MORE */ - bio_for_each_segment(bvec, bio, i) { + __bio_for_each_segment(bvec, bio, i, 0) { if (!_drbd_send_page(mdev, bvec->bv_page, bvec->bv_offset, bvec->bv_len, i == bio->bi_vcnt -1 ? 0 : MSG_MORE)) @@ -2853,7 +2695,8 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) p.sector = cpu_to_be64(req->sector); p.block_id = (unsigned long)req; - p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq)); + p.seq_num = cpu_to_be32(req->seq_num = + atomic_add_return(1, &mdev->packet_seq)); dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw); @@ -3144,8 +2987,8 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) atomic_set(&mdev->rs_sect_in, 0); atomic_set(&mdev->rs_sect_ev, 0); atomic_set(&mdev->ap_in_flight, 0); - atomic_set(&mdev->md_io_in_use, 0); + mutex_init(&mdev->md_io_mutex); mutex_init(&mdev->data.mutex); mutex_init(&mdev->meta.mutex); sema_init(&mdev->data.work.s, 0); @@ -3283,10 +3126,6 @@ static void drbd_destroy_mempools(void) /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */ - if (drbd_md_io_bio_set) - bioset_free(drbd_md_io_bio_set); - if (drbd_md_io_page_pool) - mempool_destroy(drbd_md_io_page_pool); if (drbd_ee_mempool) mempool_destroy(drbd_ee_mempool); if (drbd_request_mempool) @@ -3300,8 +3139,6 @@ static void drbd_destroy_mempools(void) if (drbd_al_ext_cache) kmem_cache_destroy(drbd_al_ext_cache); - drbd_md_io_bio_set = NULL; - drbd_md_io_page_pool = NULL; drbd_ee_mempool = NULL; drbd_request_mempool = NULL; drbd_ee_cache = NULL; @@ -3325,8 +3162,6 @@ static int drbd_create_mempools(void) drbd_bm_ext_cache = NULL; drbd_al_ext_cache = NULL; drbd_pp_pool = NULL; - drbd_md_io_page_pool = NULL; - drbd_md_io_bio_set = NULL; /* caches */ drbd_request_cache = kmem_cache_create( @@ -3350,16 +3185,6 @@ static int drbd_create_mempools(void) goto Enomem; /* mempools */ -#ifdef COMPAT_HAVE_BIOSET_CREATE - drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0); - if (drbd_md_io_bio_set == NULL) - goto Enomem; -#endif - - drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0); - if (drbd_md_io_page_pool == NULL) - goto Enomem; - drbd_request_mempool = mempool_create(number, mempool_alloc_slab, mempool_free_slab, drbd_request_cache); if (drbd_request_mempool == NULL) @@ -3437,8 +3262,6 @@ static void drbd_delete_device(unsigned int minor) if (!mdev) return; - del_timer_sync(&mdev->request_timer); - /* paranoia asserts */ if (mdev->open_cnt != 0) dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt, @@ -3843,10 +3666,8 @@ void 
drbd_md_sync(struct drbd_conf *mdev) if (!get_ldev_if_state(mdev, D_FAILED)) return; - buffer = drbd_md_get_buffer(mdev); - if (!buffer) - goto out; - + mutex_lock(&mdev->md_io_mutex); + buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); memset(buffer, 0, 512); buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); @@ -3877,8 +3698,7 @@ void drbd_md_sync(struct drbd_conf *mdev) * since we updated it on metadata. */ mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev); - drbd_md_put_buffer(mdev); -out: + mutex_unlock(&mdev->md_io_mutex); put_ldev(mdev); } @@ -3898,9 +3718,8 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) if (!get_ldev_if_state(mdev, D_ATTACHING)) return ERR_IO_MD_DISK; - buffer = drbd_md_get_buffer(mdev); - if (!buffer) - goto out; + mutex_lock(&mdev->md_io_mutex); + buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { /* NOTE: can't do normal error processing here as this is @@ -3961,8 +3780,7 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) mdev->sync_conf.al_extents = 127; err: - drbd_md_put_buffer(mdev); - out: + mutex_unlock(&mdev->md_io_mutex); put_ldev(mdev); return rv; @@ -4365,11 +4183,12 @@ const char *drbd_buildtag(void) static char buildtag[38] = "\0uilt-in"; if (buildtag[0] == 0) { -#ifdef MODULE - sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion); -#else - buildtag[0] = 'b'; +#ifdef CONFIG_MODULES + if (THIS_MODULE != NULL) + sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion); + else #endif + buildtag[0] = 'b'; } return buildtag; diff --git a/trunk/drivers/block/drbd/drbd_nl.c b/trunk/drivers/block/drbd/drbd_nl.c index 6d4de6a72e80..946166e13953 100644 --- a/trunk/drivers/block/drbd/drbd_nl.c +++ b/trunk/drivers/block/drbd/drbd_nl.c @@ -289,7 +289,7 @@ static int _try_outdate_peer_async(void *data) */ spin_lock_irq(&mdev->req_lock); ns = mdev->state; - if (ns.conn < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &mdev->flags)) { + if (ns.conn < C_WF_REPORT_PARAMS) { ns.pdsk = nps; _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); } @@ -432,7 +432,7 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) /* if this was forced, we should consider sync */ if (forced) drbd_send_uuids(mdev); - drbd_send_current_state(mdev); + drbd_send_state(mdev); } drbd_md_sync(mdev); @@ -845,10 +845,9 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) Because new from 8.3.8 onwards the peer can use multiple BIOs for a single peer_request */ if (mdev->state.conn >= C_CONNECTED) { - if (mdev->agreed_pro_version < 94) { - peer = min_t(int, mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); - /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ - } else if (mdev->agreed_pro_version == 94) + if (mdev->agreed_pro_version < 94) + peer = mdev->peer_max_bio_size; + else if (mdev->agreed_pro_version == 94) peer = DRBD_MAX_SIZE_H80_PACKET; else /* drbd 8.3.8 onwards */ peer = DRBD_MAX_BIO_SIZE; @@ -1033,7 +1032,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", (unsigned long long) drbd_get_max_capacity(nbc), (unsigned long long) nbc->dc.disk_size); - retcode = ERR_DISK_TOO_SMALL; + retcode = ERR_DISK_TO_SMALL; goto fail; } @@ -1047,7 +1046,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp } if 
(drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { - retcode = ERR_MD_DISK_TOO_SMALL; + retcode = ERR_MD_DISK_TO_SMALL; dev_warn(DEV, "refusing attach: md-device too small, " "at least %llu sectors needed for this meta-disk type\n", (unsigned long long) min_md_device_sectors); @@ -1058,7 +1057,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp * (we may currently be R_PRIMARY with no local disk...) */ if (drbd_get_max_capacity(nbc) < drbd_get_capacity(mdev->this_bdev)) { - retcode = ERR_DISK_TOO_SMALL; + retcode = ERR_DISK_TO_SMALL; goto fail; } @@ -1139,7 +1138,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) { dev_warn(DEV, "refusing to truncate a consistent device\n"); - retcode = ERR_DISK_TOO_SMALL; + retcode = ERR_DISK_TO_SMALL; goto force_diskless_dec; } @@ -1337,34 +1336,17 @@ static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, { enum drbd_ret_code retcode; int ret; - struct detach dt = {}; - - if (!detach_from_tags(mdev, nlp->tag_list, &dt)) { - reply->ret_code = ERR_MANDATORY_TAG; - goto out; - } - - if (dt.detach_force) { - drbd_force_state(mdev, NS(disk, D_FAILED)); - reply->ret_code = SS_SUCCESS; - goto out; - } - drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */ - drbd_md_get_buffer(mdev); /* make sure there is no in-flight meta-data IO */ retcode = drbd_request_state(mdev, NS(disk, D_FAILED)); - drbd_md_put_buffer(mdev); /* D_FAILED will transition to DISKLESS. */ ret = wait_event_interruptible(mdev->misc_wait, mdev->state.disk != D_FAILED); drbd_resume_io(mdev); - if ((int)retcode == (int)SS_IS_DISKLESS) retcode = SS_NOTHING_TO_DO; if (ret) retcode = ERR_INTR; reply->ret_code = retcode; -out: return 0; } @@ -1729,7 +1711,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, if (rs.no_resync && mdev->agreed_pro_version < 93) { retcode = ERR_NEED_APV_93; - goto fail_ldev; + goto fail; } if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) @@ -1756,10 +1738,6 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, fail: reply->ret_code = retcode; return 0; - - fail_ldev: - put_ldev(mdev); - goto fail; } static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, @@ -1963,7 +1941,6 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl /* If there is still bitmap IO pending, probably because of a previous * resync just being finished, wait for it before requesting a new resync. */ - drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); @@ -1982,7 +1959,6 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); } - drbd_resume_io(mdev); reply->ret_code = retcode; return 0; @@ -2004,7 +1980,6 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re /* If there is still bitmap IO pending, probably because of a previous * resync just being finished, wait for it before requesting a new resync. 
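
The resize hunk above removes the `fail_ldev:` unwind label (`put_ldev(); goto fail;`). A self-contained illustration of that goto-unwind idiom, with malloc standing in for the ref-counted resources; each failure site jumps to the label that releases exactly what has been acquired so far:

    #include <stdlib.h>

    static int do_setup(void)
    {
        char *a, *b;
        int ret = -1;       /* assume failure until everything is acquired */

        a = malloc(16);
        if (!a)
            goto fail;

        b = malloc(16);
        if (!b)
            goto fail_a;    /* release only what was acquired */

        /* ... use a and b ... */
        free(b);
        free(a);
        return 0;

    fail_a:
        free(a);
    fail:
        return ret;
    }

    int main(void)
    {
        return do_setup() ? EXIT_FAILURE : EXIT_SUCCESS;
    }
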
*/ - drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); @@ -2023,7 +1998,6 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re } else retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); } - drbd_resume_io(mdev); reply->ret_code = retcode; return 0; @@ -2196,13 +2170,11 @@ static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, /* If there is still bitmap IO pending, e.g. previous resync or verify * just being finished, wait for it before requesting a new resync. */ - drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); /* w_make_ov_request expects position to be aligned */ mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT; reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); - drbd_resume_io(mdev); return 0; } diff --git a/trunk/drivers/block/drbd/drbd_proc.c b/trunk/drivers/block/drbd/drbd_proc.c index 869bada2ed06..2959cdfb77f5 100644 --- a/trunk/drivers/block/drbd/drbd_proc.c +++ b/trunk/drivers/block/drbd/drbd_proc.c @@ -52,7 +52,7 @@ void seq_printf_with_thousands_grouping(struct seq_file *seq, long v) if (unlikely(v >= 1000000)) { /* cool: > GiByte/s */ seq_printf(seq, "%ld,", v / 1000000); - v %= 1000000; + v /= 1000000; seq_printf(seq, "%03ld,%03ld", v/1000, v % 1000); } else if (likely(v >= 1000)) seq_printf(seq, "%ld,%03ld", v/1000, v % 1000); diff --git a/trunk/drivers/block/drbd/drbd_receiver.c b/trunk/drivers/block/drbd/drbd_receiver.c index ea4836e0ae98..436f519bed1c 100644 --- a/trunk/drivers/block/drbd/drbd_receiver.c +++ b/trunk/drivers/block/drbd/drbd_receiver.c @@ -466,7 +466,6 @@ static int drbd_accept(struct drbd_conf *mdev, const char **what, goto out; } (*newsock)->ops = sock->ops; - __module_get((*newsock)->ops->owner); out: return err; @@ -751,7 +750,6 @@ static int drbd_connect(struct drbd_conf *mdev) { struct socket *s, *sock, *msock; int try, h, ok; - enum drbd_state_rv rv; D_ASSERT(!mdev->data.socket); @@ -890,32 +888,25 @@ static int drbd_connect(struct drbd_conf *mdev) } } + if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS) + return 0; + sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; atomic_set(&mdev->packet_seq, 0); mdev->peer_seq = 0; + drbd_thread_start(&mdev->asender); + if (drbd_send_protocol(mdev) == -1) return -1; - set_bit(STATE_SENT, &mdev->flags); drbd_send_sync_param(mdev, &mdev->sync_conf); drbd_send_sizes(mdev, 0, 0); drbd_send_uuids(mdev); - drbd_send_current_state(mdev); + drbd_send_state(mdev); clear_bit(USE_DEGR_WFC_T, &mdev->flags); clear_bit(RESIZE_PENDING, &mdev->flags); - - spin_lock_irq(&mdev->req_lock); - rv = _drbd_set_state(_NS(mdev, conn, C_WF_REPORT_PARAMS), CS_VERBOSE, NULL); - if (mdev->state.conn != C_WF_REPORT_PARAMS) - clear_bit(STATE_SENT, &mdev->flags); - spin_unlock_irq(&mdev->req_lock); - - if (rv < SS_SUCCESS) - return 0; - - drbd_thread_start(&mdev->asender); mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */ return 1; @@ -966,7 +957,7 @@ static void drbd_flush(struct drbd_conf *mdev) rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL, NULL); if (rv) { - dev_info(DEV, "local disk flush failed with status %d\n", rv); + dev_err(DEV, "local disk flush failed with status %d\n", rv); /* would rather check on EOPNOTSUPP, but that is not reliable. 
* don't try again for ANY return value != 0 * if (rv == -EOPNOTSUPP) */ @@ -1010,14 +1001,13 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, if (epoch_size != 0 && atomic_read(&epoch->active) == 0 && - (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) { + test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) { if (!(ev & EV_CLEANUP)) { spin_unlock(&mdev->epoch_lock); drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size); spin_lock(&mdev->epoch_lock); } - if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) - dec_unacked(mdev); + dec_unacked(mdev); if (mdev->current_epoch != epoch) { next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list); @@ -1106,11 +1096,7 @@ int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, /* In most cases, we will only need one bio. But in case the lower * level restrictions happen to be different at this offset on this * side than those of the sending peer, we may need to submit the - * request in more than one bio. - * - * Plain bio_alloc is good enough here, this is no DRBD internally - * generated bio, but a bio allocated on behalf of the peer. - */ + * request in more than one bio. */ next_bio: bio = bio_alloc(GFP_NOIO, nr_pages); if (!bio) { @@ -1597,24 +1583,6 @@ static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int u return ok; } -static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_epoch_entry *data_e) -{ - - struct drbd_epoch_entry *rs_e; - bool rv = 0; - - spin_lock_irq(&mdev->req_lock); - list_for_each_entry(rs_e, &mdev->sync_ee, w.list) { - if (overlaps(data_e->sector, data_e->size, rs_e->sector, rs_e->size)) { - rv = 1; - break; - } - } - spin_unlock_irq(&mdev->req_lock); - - return rv; -} - /* Called from receive_Data. * Synchronize packets on sock with packets on msock. * @@ -1858,9 +1826,6 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned list_add(&e->w.list, &mdev->active_ee); spin_unlock_irq(&mdev->req_lock); - if (mdev->state.conn == C_SYNC_TARGET) - wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, e)); - switch (mdev->net_conf->wire_protocol) { case DRBD_PROT_C: inc_unacked(mdev); @@ -2455,7 +2420,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START]; mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1]; - dev_info(DEV, "Lost last syncUUID packet, corrected:\n"); + dev_info(DEV, "Did not got last syncUUID packet, corrected:\n"); drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]); return -1; @@ -2841,10 +2806,10 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi if (apv >= 88) { if (apv == 88) { - if (data_size > SHARED_SECRET_MAX || data_size == 0) { - dev_err(DEV, "verify-alg of wrong size, " - "peer wants %u, accepting only up to %u byte\n", - data_size, SHARED_SECRET_MAX); + if (data_size > SHARED_SECRET_MAX) { + dev_err(DEV, "verify-alg too long, " + "peer wants %u, accepting only %u byte\n", + data_size, SHARED_SECRET_MAX); return false; } @@ -3203,20 +3168,9 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned os = ns = mdev->state; spin_unlock_irq(&mdev->req_lock); - /* If some other part of the code (asender thread, timeout) - * already decided to close the connection again, - * we must not "re-establish" it here. 
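
overlapping_resync_write(), removed in the hunk above, walks the sync_ee list with an interval-overlap test so application writes wait for in-flight resync requests covering the same sectors. A standalone sketch of that scan; the predicate below assumes DRBD's convention of byte sizes over 512-byte sectors:

    #include <stdbool.h>
    #include <stdio.h>

    struct ee_demo { unsigned long long sector; unsigned int size; };

    /* half-open interval test: ranges touch iff each starts before the
     * other ends */
    static bool overlaps(unsigned long long s1, unsigned int l1,
                         unsigned long long s2, unsigned int l2)
    {
        return s1 + (l1 >> 9) > s2 && s2 + (l2 >> 9) > s1;
    }

    int main(void)
    {
        struct ee_demo data  = { .sector = 8, .size = 4096 };   /* 8..15 */
        struct ee_demo rs[2] = { { 0, 4096 }, { 12, 4096 } };   /* 0..7, 12..19 */
        bool hit = false;
        size_t i;

        for (i = 0; i < sizeof(rs) / sizeof(rs[0]); i++)
            if (overlaps(data.sector, data.size, rs[i].sector, rs[i].size))
                hit = true;     /* the second entry overlaps */

        printf("overlapping resync write: %d\n", hit);  /* 1 */
        return 0;
    }
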
*/ - if (os.conn <= C_TEAR_DOWN) - return false; - - /* If this is the "end of sync" confirmation, usually the peer disk - * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits - * set) resync started in PausedSyncT, or if the timing of pause-/ - * unpause-sync events has been "just right", the peer disk may - * transition from D_CONSISTENT to D_UP_TO_DATE as well. - */ - if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) && - real_peer_disk == D_UP_TO_DATE && + /* peer says his disk is uptodate, while we think it is inconsistent, + * and this happens while we think we have a sync going on. */ + if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE && os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) { /* If we are (becoming) SyncSource, but peer is still in sync * preparation, ignore its uptodate-ness to avoid flapping, it @@ -3334,7 +3288,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned /* Nowadays only used when forcing a node into primary role and setting its disk to UpToDate with that */ drbd_send_uuids(mdev); - drbd_send_current_state(mdev); + drbd_send_state(mdev); } } @@ -3822,13 +3776,6 @@ static void drbd_disconnect(struct drbd_conf *mdev) if (mdev->state.conn == C_STANDALONE) return; - /* We are about to start the cleanup after connection loss. - * Make sure drbd_make_request knows about that. - * Usually we should be in some network failure state already, - * but just in case we are not, we fix it up here. - */ - drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); - /* asender does not clean up anything. it must not interfere, either */ drbd_thread_stop(&mdev->asender); drbd_free_sock(mdev); @@ -3856,6 +3803,8 @@ static void drbd_disconnect(struct drbd_conf *mdev) atomic_set(&mdev->rs_pending_cnt, 0); wake_up(&mdev->misc_wait); + del_timer(&mdev->request_timer); + /* make sure syncer is stopped and w_resume_next_sg queued */ del_timer_sync(&mdev->resync_timer); resync_timer_fn((unsigned long)mdev); @@ -4484,7 +4433,7 @@ static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h) if (mdev->state.conn == C_AHEAD && atomic_read(&mdev->ap_in_flight) == 0 && - !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) { + !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags)) { mdev->start_resync_timer.expires = jiffies + HZ; add_timer(&mdev->start_resync_timer); } diff --git a/trunk/drivers/block/drbd/drbd_req.c b/trunk/drivers/block/drbd/drbd_req.c index 9c5c84946b05..4a0f314086e5 100644 --- a/trunk/drivers/block/drbd/drbd_req.c +++ b/trunk/drivers/block/drbd/drbd_req.c @@ -37,7 +37,6 @@ static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req const int rw = bio_data_dir(bio); int cpu; cpu = part_stat_lock(); - part_round_stats(cpu, &mdev->vdisk->part0); part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); part_inc_in_flight(&mdev->vdisk->part0, rw); @@ -215,7 +214,8 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) { const unsigned long s = req->rq_state; struct drbd_conf *mdev = req->mdev; - int rw = req->rq_state & RQ_WRITE ? WRITE : READ; + /* only WRITES may end up here without a master bio (on barrier ack) */ + int rw = req->master_bio ? 
bio_data_dir(req->master_bio) : WRITE; /* we must not complete the master bio, while it is * still being processed by _drbd_send_zc_bio (drbd_send_dblock) @@ -230,7 +230,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) return; if (s & RQ_NET_PENDING) return; - if (s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) + if (s & RQ_LOCAL_PENDING) return; if (req->master_bio) { @@ -277,9 +277,6 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) req->master_bio = NULL; } - if (s & RQ_LOCAL_PENDING) - return; - if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) { /* this is disconnected (local only) operation, * or protocol C P_WRITE_ACK, @@ -432,7 +429,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, break; case completed_ok: - if (req->rq_state & RQ_WRITE) + if (bio_data_dir(req->master_bio) == WRITE) mdev->writ_cnt += req->size>>9; else mdev->read_cnt += req->size>>9; @@ -441,14 +438,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state &= ~RQ_LOCAL_PENDING; _req_may_be_done_not_susp(req, m); - break; - - case abort_disk_io: - req->rq_state |= RQ_LOCAL_ABORTED; - if (req->rq_state & RQ_WRITE) - _req_may_be_done_not_susp(req, m); - else - goto goto_queue_for_net_read; + put_ldev(mdev); break; case write_completed_with_error: @@ -457,6 +447,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, __drbd_chk_io_error(mdev, false); _req_may_be_done_not_susp(req, m); + put_ldev(mdev); break; case read_ahead_completed_with_error: @@ -464,6 +455,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state |= RQ_LOCAL_COMPLETED; req->rq_state &= ~RQ_LOCAL_PENDING; _req_may_be_done_not_susp(req, m); + put_ldev(mdev); break; case read_completed_with_error: @@ -475,8 +467,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, D_ASSERT(!(req->rq_state & RQ_NET_MASK)); __drbd_chk_io_error(mdev, false); - - goto_queue_for_net_read: + put_ldev(mdev); /* no point in retrying if there is no good remote data, * or we have no connection. */ @@ -565,8 +556,10 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, drbd_queue_work(&mdev->data.work, &req->w); break; - case read_retry_remote_canceled: + case oos_handed_to_network: + /* actually the same */ case send_canceled: + /* treat it the same */ case send_failed: /* real cleanup will be done from tl_clear. just update flags * so it is no longer marked as on the worker queue */ @@ -596,17 +589,17 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, } req->rq_state &= ~RQ_NET_QUEUED; req->rq_state |= RQ_NET_SENT; + /* because _drbd_send_zc_bio could sleep, and may want to + * dereference the bio even after the "write_acked_by_peer" and + * "completed_ok" events came in, once we return from + * _drbd_send_zc_bio (drbd_send_dblock), we have to check + * whether it is done already, and end it. */ _req_may_be_done_not_susp(req, m); break; - case oos_handed_to_network: - /* Was not set PENDING, no longer QUEUED, so is now DONE - * as far as this connection is concerned. */ + case read_retry_remote_canceled: req->rq_state &= ~RQ_NET_QUEUED; - req->rq_state |= RQ_NET_DONE; - _req_may_be_done_not_susp(req, m); - break; - + /* fall through, in case we raced with drbd_disconnect */ case connection_lost_while_pending: /* transfer log cleanup after connection loss */ /* assert something? 
*/ @@ -623,6 +616,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, _req_may_be_done(req, m); /* Allowed while state.susp */ break; + case write_acked_by_peer_and_sis: + req->rq_state |= RQ_NET_SIS; case conflict_discarded_by_peer: /* for discarded conflicting writes of multiple primaries, * there is no need to keep anything in the tl, potential @@ -633,15 +628,18 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, (unsigned long long)req->sector, req->size); req->rq_state |= RQ_NET_DONE; /* fall through */ - case write_acked_by_peer_and_sis: case write_acked_by_peer: - if (what == write_acked_by_peer_and_sis) - req->rq_state |= RQ_NET_SIS; /* protocol C; successfully written on peer. - * Nothing more to do here. + * Nothing to do here. * We want to keep the tl in place for all protocols, to cater - * for volatile write-back caches on lower level devices. */ + * for volatile write-back caches on lower level devices. + * + * A barrier request is expected to have forced all prior + * requests onto stable storage, so completion of a barrier + * request could set NET_DONE right here, and not wait for the + * P_BARRIER_ACK, but that is an unnecessary optimization. */ + /* this makes it effectively the same as for: */ case recv_acked_by_peer: /* protocol B; pretends to be successfully written on peer. * see also notes above in handed_over_to_network about @@ -775,7 +773,6 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns int local, remote, send_oos = 0; int err = -EIO; int ret = 0; - union drbd_state s; /* allocate outside of all locks; */ req = drbd_req_new(mdev, bio); @@ -837,9 +834,8 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns drbd_al_begin_io(mdev, sector); } - s = mdev->state; - remote = remote && drbd_should_do_remote(s); - send_oos = rw == WRITE && drbd_should_send_oos(s); + remote = remote && drbd_should_do_remote(mdev->state); + send_oos = rw == WRITE && drbd_should_send_oos(mdev->state); D_ASSERT(!(remote && send_oos)); if (!(local || remote) && !is_susp(mdev->state)) { @@ -871,7 +867,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns if (is_susp(mdev->state)) { /* If we got suspended, use the retry mechanism of - drbd_make_request() to restart processing of this + generic_make_request() to restart processing of this bio. 
In the next call to drbd_make_request we sleep in inc_ap_bio() */ ret = 1; @@ -1095,6 +1091,7 @@ void drbd_make_request(struct request_queue *q, struct bio *bio) */ D_ASSERT(bio->bi_size > 0); D_ASSERT((bio->bi_size & 0x1ff) == 0); + D_ASSERT(bio->bi_idx == 0); /* to make some things easier, force alignment of requests within the * granularity of our hash tables */ @@ -1102,9 +1099,8 @@ void drbd_make_request(struct request_queue *q, struct bio *bio) e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT; if (likely(s_enr == e_enr)) { - do { - inc_ap_bio(mdev, 1); - } while (drbd_make_request_common(mdev, bio, start_time)); + inc_ap_bio(mdev, 1); + drbd_make_request_common(mdev, bio, start_time); return; } @@ -1200,66 +1196,36 @@ void request_timer_fn(unsigned long data) struct drbd_conf *mdev = (struct drbd_conf *) data; struct drbd_request *req; /* oldest request */ struct list_head *le; - unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ - unsigned long now; + unsigned long et = 0; /* effective timeout = ko_count * timeout */ if (get_net_conf(mdev)) { - if (mdev->state.conn >= C_WF_REPORT_PARAMS) - ent = mdev->net_conf->timeout*HZ/10 - * mdev->net_conf->ko_count; + et = mdev->net_conf->timeout*HZ/10 * mdev->net_conf->ko_count; put_net_conf(mdev); } - if (get_ldev(mdev)) { /* implicit state.disk >= D_INCONSISTENT */ - dt = mdev->ldev->dc.disk_timeout * HZ / 10; - put_ldev(mdev); - } - et = min_not_zero(dt, ent); - - if (!et) + if (!et || mdev->state.conn < C_WF_REPORT_PARAMS) return; /* Recurring timer stopped */ - now = jiffies; - spin_lock_irq(&mdev->req_lock); le = &mdev->oldest_tle->requests; if (list_empty(le)) { spin_unlock_irq(&mdev->req_lock); - mod_timer(&mdev->request_timer, now + et); + mod_timer(&mdev->request_timer, jiffies + et); return; } le = le->prev; req = list_entry(le, struct drbd_request, tl_requests); - - /* The request is considered timed out, if - * - we have some effective timeout from the configuration, - * with above state restrictions applied, - * - the oldest request is waiting for a response from the network - * resp. the local disk, - * - the oldest request is in fact older than the effective timeout, - * - the connection was established (resp. disk was attached) - * for longer than the timeout already. - * Note that for 32bit jiffies and very stable connections/disks, - * we may have a wrap around, which is catched by - * !time_in_range(now, last_..._jif, last_..._jif + timeout). - * - * Side effect: once per 32bit wrap-around interval, which means every - * ~198 days with 250 HZ, we have a window where the timeout would need - * to expire twice (worst case) to become effective. Good enough. 
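
The comment block ending above, part of the request_timer_fn() code this revert removes, explains the 32-bit jiffies wraparound window. The arithmetic works because the kernel's time macros compare via signed differences; a compilable re-derivation on a plain unsigned long counter:

    #include <stdio.h>

    /* same shape as the include/linux/jiffies.h helpers, one type */
    #define time_after(a, b)       ((long)((b) - (a)) < 0)
    #define time_after_eq(a, b)    ((long)((a) - (b)) >= 0)
    #define time_before_eq(a, b)   time_after_eq(b, a)
    #define time_in_range(a, b, c) (time_after_eq(a, b) && time_before_eq(a, c))

    int main(void)
    {
        unsigned long now  = (unsigned long)-5; /* 5 ticks before wraparound */
        unsigned long then = now + 10;          /* wraps past zero */

        /* signed-difference trick: still "after" despite the wrap */
        printf("%d\n", time_after(then, now));              /* 1 */
        printf("%d\n", time_in_range(now + 3, now, then));  /* 1 */
        return 0;
    }

Hence the removed comment's caveat: once per wrap interval the window test can make a timeout need two expiries to fire, which the code accepted as good enough.
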
- */ - if (ent && req->rq_state & RQ_NET_PENDING && - time_after(now, req->start_time + ent) && - !time_in_range(now, mdev->last_reconnect_jif, mdev->last_reconnect_jif + ent)) { - dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n"); - _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); - } - if (dt && req->rq_state & RQ_LOCAL_PENDING && - time_after(now, req->start_time + dt) && - !time_in_range(now, mdev->last_reattach_jif, mdev->last_reattach_jif + dt)) { - dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n"); - __drbd_chk_io_error(mdev, 1); + if (time_is_before_eq_jiffies(req->start_time + et)) { + if (req->rq_state & RQ_NET_PENDING) { + dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n"); + _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE, NULL); + } else { + dev_warn(DEV, "Local backing block device frozen?\n"); + mod_timer(&mdev->request_timer, jiffies + et); + } + } else { + mod_timer(&mdev->request_timer, req->start_time + et); } - nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et; + spin_unlock_irq(&mdev->req_lock); - mod_timer(&mdev->request_timer, nt); } diff --git a/trunk/drivers/block/drbd/drbd_req.h b/trunk/drivers/block/drbd/drbd_req.h index 3d2111919486..68a234a5fdc5 100644 --- a/trunk/drivers/block/drbd/drbd_req.h +++ b/trunk/drivers/block/drbd/drbd_req.h @@ -105,7 +105,6 @@ enum drbd_req_event { read_completed_with_error, read_ahead_completed_with_error, write_completed_with_error, - abort_disk_io, completed_ok, resend, fail_frozen_disk_io, @@ -119,21 +118,18 @@ enum drbd_req_event { * same time, so we should hold the request lock anyways. */ enum drbd_req_state_bits { - /* 3210 - * 0000: no local possible - * 0001: to be submitted + /* 210 + * 000: no local possible + * 001: to be submitted * UNUSED, we could map: 011: submitted, completion still pending - * 0110: completed ok - * 0010: completed with error - * 1001: Aborted (before completion) - * 1x10: Aborted and completed -> free + * 110: completed ok + * 010: completed with error */ __RQ_LOCAL_PENDING, __RQ_LOCAL_COMPLETED, __RQ_LOCAL_OK, - __RQ_LOCAL_ABORTED, - /* 87654 + /* 76543 * 00000: no network possible * 00001: to be send * 00011: to be send, on worker queue @@ -203,9 +199,8 @@ enum drbd_req_state_bits { #define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) #define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED) #define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK) -#define RQ_LOCAL_ABORTED (1UL << __RQ_LOCAL_ABORTED) -#define RQ_LOCAL_MASK ((RQ_LOCAL_ABORTED << 1)-1) +#define RQ_LOCAL_MASK ((RQ_LOCAL_OK << 1)-1) /* 0x07 */ #define RQ_NET_PENDING (1UL << __RQ_NET_PENDING) #define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED) diff --git a/trunk/drivers/block/drbd/drbd_worker.c b/trunk/drivers/block/drbd/drbd_worker.c index 620c70ff2231..4d3e6f6213ba 100644 --- a/trunk/drivers/block/drbd/drbd_worker.c +++ b/trunk/drivers/block/drbd/drbd_worker.c @@ -70,29 +70,11 @@ rwlock_t global_state_lock; void drbd_md_io_complete(struct bio *bio, int error) { struct drbd_md_io *md_io; - struct drbd_conf *mdev; md_io = (struct drbd_md_io *)bio->bi_private; - mdev = container_of(md_io, struct drbd_conf, md_io); - md_io->error = error; - /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able - * to timeout on the lower level device, and eventually detach from it. - * If this io completion runs after that timeout expired, this - * drbd_md_put_buffer() may allow us to finally try and re-attach. 
- * During normal operation, this only puts that extra reference - * down to 1 again. - * Make sure we first drop the reference, and only then signal - * completion, or we may (in drbd_al_read_log()) cycle so fast into the - * next drbd_md_sync_page_io(), that we trigger the - * ASSERT(atomic_read(&mdev->md_io_in_use) == 1) there. - */ - drbd_md_put_buffer(mdev); - md_io->done = 1; - wake_up(&mdev->misc_wait); - bio_put(bio); - put_ldev(mdev); + complete(&md_io->event); } /* reads on behalf of the partner, @@ -244,7 +226,6 @@ void drbd_endio_pri(struct bio *bio, int error) spin_lock_irqsave(&mdev->req_lock, flags); __req_mod(req, what, &m); spin_unlock_irqrestore(&mdev->req_lock, flags); - put_ldev(mdev); if (m.bio) complete_master_bio(mdev, &m); @@ -309,7 +290,7 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio * sg_init_table(&sg, 1); crypto_hash_init(&desc); - bio_for_each_segment(bvec, bio, i) { + __bio_for_each_segment(bvec, bio, i, 0) { sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); crypto_hash_update(&desc, &sg, sg.length); } @@ -747,7 +728,7 @@ int w_start_resync(struct drbd_conf *mdev, struct drbd_work *w, int cancel) } drbd_start_resync(mdev, C_SYNC_SOURCE); - clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags); + clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags); return 1; } @@ -1538,14 +1519,14 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) } drbd_state_lock(mdev); - write_lock_irq(&global_state_lock); + if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { - write_unlock_irq(&global_state_lock); drbd_state_unlock(mdev); return; } - ns.i = mdev->state.i; + write_lock_irq(&global_state_lock); + ns = mdev->state; ns.aftr_isp = !_drbd_may_sync_now(mdev); diff --git a/trunk/drivers/block/floppy.c b/trunk/drivers/block/floppy.c index cce7df367b79..b0b00d70c166 100644 --- a/trunk/drivers/block/floppy.c +++ b/trunk/drivers/block/floppy.c @@ -551,7 +551,7 @@ static void floppy_ready(void); static void floppy_start(void); static void process_fd_request(void); static void recalibrate_floppy(void); -static void floppy_shutdown(struct work_struct *); +static void floppy_shutdown(unsigned long); static int floppy_request_regions(int); static void floppy_release_regions(int); @@ -588,8 +588,6 @@ static int buffer_max = -1; static struct floppy_fdc_state fdc_state[N_FDC]; static int fdc; /* current fdc */ -static struct workqueue_struct *floppy_wq; - static struct floppy_struct *_floppy = floppy_type; static unsigned char current_drive; static long current_count_sectors; @@ -631,15 +629,16 @@ static inline void set_debugt(void) { } static inline void debugt(const char *func, const char *msg) { } #endif /* DEBUGT */ +typedef void (*timeout_fn)(unsigned long); +static DEFINE_TIMER(fd_timeout, floppy_shutdown, 0, 0); -static DECLARE_DELAYED_WORK(fd_timeout, floppy_shutdown); static const char *timeout_message; static void is_alive(const char *func, const char *message) { /* this routine checks whether the floppy driver is "alive" */ if (test_bit(0, &fdc_busy) && command_status < 2 && - !delayed_work_pending(&fd_timeout)) { + !timer_pending(&fd_timeout)) { DPRINT("%s: timeout handler died. 
%s\n", func, message); } } @@ -667,18 +666,15 @@ static int output_log_pos; static void __reschedule_timeout(int drive, const char *message) { - unsigned long delay; - if (drive == current_reqD) drive = current_drive; - + del_timer(&fd_timeout); if (drive < 0 || drive >= N_DRIVE) { - delay = 20UL * HZ; + fd_timeout.expires = jiffies + 20UL * HZ; drive = 0; } else - delay = UDP->timeout; - - queue_delayed_work(floppy_wq, &fd_timeout, delay); + fd_timeout.expires = jiffies + UDP->timeout; + add_timer(&fd_timeout); if (UDP->flags & FD_DEBUG) DPRINT("reschedule timeout %s\n", message); timeout_message = message; @@ -876,7 +872,7 @@ static int lock_fdc(int drive, bool interruptible) command_status = FD_COMMAND_NONE; - reschedule_timeout(drive, "lock fdc"); + __reschedule_timeout(drive, "lock fdc"); set_fdc(drive); return 0; } @@ -884,15 +880,23 @@ static int lock_fdc(int drive, bool interruptible) /* unlocks the driver */ static void unlock_fdc(void) { + unsigned long flags; + + raw_cmd = NULL; if (!test_bit(0, &fdc_busy)) DPRINT("FDC access conflict!\n"); - raw_cmd = NULL; + if (do_floppy) + DPRINT("device interrupt still active at FDC release: %pf!\n", + do_floppy); command_status = FD_COMMAND_NONE; - __cancel_delayed_work(&fd_timeout); - do_floppy = NULL; + spin_lock_irqsave(&floppy_lock, flags); + del_timer(&fd_timeout); cont = NULL; clear_bit(0, &fdc_busy); + if (current_req || set_next_request()) + do_fd_request(current_req->q); + spin_unlock_irqrestore(&floppy_lock, flags); wake_up(&fdc_wait); } @@ -964,24 +968,26 @@ static DECLARE_WORK(floppy_work, NULL); static void schedule_bh(void (*handler)(void)) { - WARN_ON(work_pending(&floppy_work)); - PREPARE_WORK(&floppy_work, (work_func_t)handler); - queue_work(floppy_wq, &floppy_work); + schedule_work(&floppy_work); } -static DECLARE_DELAYED_WORK(fd_timer, NULL); +static DEFINE_TIMER(fd_timer, NULL, 0, 0); static void cancel_activity(void) { + unsigned long flags; + + spin_lock_irqsave(&floppy_lock, flags); do_floppy = NULL; - cancel_delayed_work_sync(&fd_timer); - cancel_work_sync(&floppy_work); + PREPARE_WORK(&floppy_work, (work_func_t)empty); + del_timer(&fd_timer); + spin_unlock_irqrestore(&floppy_lock, flags); } /* this function makes sure that the disk stays in the drive during the * transfer */ -static void fd_watchdog(struct work_struct *arg) +static void fd_watchdog(void) { debug_dcl(DP->flags, "calling disk change from watchdog\n"); @@ -991,20 +997,21 @@ static void fd_watchdog(struct work_struct *arg) cont->done(0); reset_fdc(); } else { - cancel_delayed_work(&fd_timer); - PREPARE_DELAYED_WORK(&fd_timer, fd_watchdog); - queue_delayed_work(floppy_wq, &fd_timer, HZ / 10); + del_timer(&fd_timer); + fd_timer.function = (timeout_fn)fd_watchdog; + fd_timer.expires = jiffies + HZ / 10; + add_timer(&fd_timer); } } static void main_command_interrupt(void) { - cancel_delayed_work(&fd_timer); + del_timer(&fd_timer); cont->interrupt(); } /* waits for a delay (spinup or select) to pass */ -static int fd_wait_for_completion(unsigned long expires, work_func_t function) +static int fd_wait_for_completion(unsigned long delay, timeout_fn function) { if (FDCS->reset) { reset_fdc(); /* do the reset during sleep to win time @@ -1013,10 +1020,11 @@ static int fd_wait_for_completion(unsigned long expires, work_func_t function) return 1; } - if (time_before(jiffies, expires)) { - cancel_delayed_work(&fd_timer); - PREPARE_DELAYED_WORK(&fd_timer, function); - queue_delayed_work(floppy_wq, &fd_timer, expires - jiffies); + if (time_before(jiffies, delay)) { 
+ del_timer(&fd_timer); + fd_timer.function = function; + fd_timer.expires = delay; + add_timer(&fd_timer); return 1; } return 0; @@ -1334,7 +1342,7 @@ static int fdc_dtr(void) */ FDCS->dtr = raw_cmd->rate & 3; return fd_wait_for_completion(jiffies + 2UL * HZ / 100, - (work_func_t)floppy_ready); + (timeout_fn)floppy_ready); } /* fdc_dtr */ static void tell_sector(void) @@ -1439,7 +1447,7 @@ static void setup_rw_floppy(void) int flags; int dflags; unsigned long ready_date; - work_func_t function; + timeout_fn function; flags = raw_cmd->flags; if (flags & (FD_RAW_READ | FD_RAW_WRITE)) @@ -1453,9 +1461,9 @@ static void setup_rw_floppy(void) */ if (time_after(ready_date, jiffies + DP->select_delay)) { ready_date -= DP->select_delay; - function = (work_func_t)floppy_start; + function = (timeout_fn)floppy_start; } else - function = (work_func_t)setup_rw_floppy; + function = (timeout_fn)setup_rw_floppy; /* wait until the floppy is spinning fast enough */ if (fd_wait_for_completion(ready_date, function)) @@ -1485,7 +1493,7 @@ static void setup_rw_floppy(void) inr = result(); cont->interrupt(); } else if (flags & FD_RAW_NEED_DISK) - fd_watchdog(NULL); + fd_watchdog(); } static int blind_seek; @@ -1794,22 +1802,20 @@ static void show_floppy(void) pr_info("do_floppy=%pf\n", do_floppy); if (work_pending(&floppy_work)) pr_info("floppy_work.func=%pf\n", floppy_work.func); - if (delayed_work_pending(&fd_timer)) - pr_info("delayed work.function=%p expires=%ld\n", - fd_timer.work.func, - fd_timer.timer.expires - jiffies); - if (delayed_work_pending(&fd_timeout)) - pr_info("timer_function=%p expires=%ld\n", - fd_timeout.work.func, - fd_timeout.timer.expires - jiffies); - + if (timer_pending(&fd_timer)) + pr_info("fd_timer.function=%pf\n", fd_timer.function); + if (timer_pending(&fd_timeout)) { + pr_info("timer_function=%pf\n", fd_timeout.function); + pr_info("expires=%lu\n", fd_timeout.expires - jiffies); + pr_info("now=%lu\n", jiffies); + } pr_info("cont=%p\n", cont); pr_info("current_req=%p\n", current_req); pr_info("command_status=%d\n", command_status); pr_info("\n"); } -static void floppy_shutdown(struct work_struct *arg) +static void floppy_shutdown(unsigned long data) { unsigned long flags; @@ -1862,7 +1868,7 @@ static int start_motor(void (*function)(void)) /* wait_for_completion also schedules reset if needed. 
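
fd_wait_for_completion() in the hunk above re-arms a bare kernel timer (del_timer, set function and expires, add_timer) where the newer code used delayed work. A userspace mock of just that API shape; the tiny timer struct below only emulates the kernel interface and is not the real one:

    #include <stdio.h>

    typedef void (*timeout_fn)(unsigned long);

    struct timer_mock {
        int pending;
        unsigned long expires;
        timeout_fn function;
    };

    static void del_timer(struct timer_mock *t) { t->pending = 0; }
    static void add_timer(struct timer_mock *t) { t->pending = 1; }

    static void floppy_ready_mock(unsigned long data) { (void)data; }

    /* re-arm with a new handler and deadline, as the floppy driver does */
    static void rearm(struct timer_mock *t, unsigned long expires, timeout_fn fn)
    {
        del_timer(t);       /* never modify a live timer */
        t->function = fn;
        t->expires  = expires;
        add_timer(t);
    }

    int main(void)
    {
        struct timer_mock fd_timer = { 0 };

        rearm(&fd_timer, 100, floppy_ready_mock);
        printf("pending=%d expires=%lu\n", fd_timer.pending, fd_timer.expires);
        return 0;
    }
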
*/ return fd_wait_for_completion(DRS->select_date + DP->select_delay, - (work_func_t)function); + (timeout_fn)function); } static void floppy_ready(void) @@ -2815,6 +2821,7 @@ static void redo_fd_request(void) spin_lock_irq(&floppy_lock); pending = set_next_request(); spin_unlock_irq(&floppy_lock); + if (!pending) { do_floppy = NULL; unlock_fdc(); @@ -2891,15 +2898,13 @@ static void do_fd_request(struct request_queue *q) current_req->cmd_flags)) return; - if (test_and_set_bit(0, &fdc_busy)) { + if (test_bit(0, &fdc_busy)) { /* fdc busy, this new request will be treated when the current one is done */ is_alive(__func__, "old request running"); return; } - command_status = FD_COMMAND_NONE; - __reschedule_timeout(MAXTIMEOUT, "fd_request"); - set_fdc(0); + lock_fdc(MAXTIMEOUT, false); process_fd_request(); is_alive(__func__, ""); } @@ -3607,7 +3612,9 @@ static int floppy_release(struct gendisk *disk, fmode_t mode) mutex_lock(&floppy_mutex); mutex_lock(&open_lock); - if (!UDRS->fd_ref--) { + if (UDRS->fd_ref < 0) + UDRS->fd_ref = 0; + else if (!UDRS->fd_ref--) { DPRINT("floppy_release with fd_ref == 0"); UDRS->fd_ref = 0; } @@ -3643,7 +3650,13 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) set_bit(FD_VERIFY_BIT, &UDRS->flags); } - UDRS->fd_ref++; + if (UDRS->fd_ref == -1 || (UDRS->fd_ref && (mode & FMODE_EXCL))) + goto out2; + + if (mode & FMODE_EXCL) + UDRS->fd_ref = -1; + else + UDRS->fd_ref++; opened_bdev[drive] = bdev; @@ -3706,8 +3719,10 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) mutex_unlock(&floppy_mutex); return 0; out: - UDRS->fd_ref--; - + if (UDRS->fd_ref < 0) + UDRS->fd_ref = 0; + else + UDRS->fd_ref--; if (!UDRS->fd_ref) opened_bdev[drive] = NULL; out2: @@ -4144,16 +4159,10 @@ static int __init floppy_init(void) goto out_put_disk; } - floppy_wq = alloc_ordered_workqueue("floppy", 0); - if (!floppy_wq) { - err = -ENOMEM; - goto out_put_disk; - } - disks[dr]->queue = blk_init_queue(do_fd_request, &floppy_lock); if (!disks[dr]->queue) { err = -ENOMEM; - goto out_destroy_workq; + goto out_put_disk; } blk_queue_max_hw_sectors(disks[dr]->queue, 64); @@ -4204,7 +4213,7 @@ static int __init floppy_init(void) use_virtual_dma = can_use_virtual_dma & 1; fdc_state[0].address = FDC1; if (fdc_state[0].address == -1) { - cancel_delayed_work(&fd_timeout); + del_timer_sync(&fd_timeout); err = -ENODEV; goto out_unreg_region; } @@ -4215,7 +4224,7 @@ static int __init floppy_init(void) fdc = 0; /* reset fdc in case of unexpected interrupt */ err = floppy_grab_irq_and_dma(); if (err) { - cancel_delayed_work(&fd_timeout); + del_timer_sync(&fd_timeout); err = -EBUSY; goto out_unreg_region; } @@ -4272,13 +4281,13 @@ static int __init floppy_init(void) user_reset_fdc(-1, FD_RESET_ALWAYS, false); } fdc = 0; - cancel_delayed_work(&fd_timeout); + del_timer_sync(&fd_timeout); current_drive = 0; initialized = true; if (have_no_fdc) { DPRINT("no floppy controllers found\n"); err = have_no_fdc; - goto out_release_dma; + goto out_flush_work; } for (drive = 0; drive < N_DRIVE; drive++) { @@ -4293,7 +4302,7 @@ static int __init floppy_init(void) err = platform_device_register(&floppy_device[drive]); if (err) - goto out_release_dma; + goto out_flush_work; err = device_create_file(&floppy_device[drive].dev, &dev_attr_cmos); @@ -4311,14 +4320,13 @@ static int __init floppy_init(void) out_unreg_platform_dev: platform_device_unregister(&floppy_device[drive]); -out_release_dma: +out_flush_work: + flush_work_sync(&floppy_work); if (atomic_read(&usage_count)) 
floppy_release_irq_and_dma(); out_unreg_region: blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); platform_driver_unregister(&floppy_driver); -out_destroy_workq: - destroy_workqueue(floppy_wq); out_unreg_blkdev: unregister_blkdev(FLOPPY_MAJOR, "fd"); out_put_disk: @@ -4389,7 +4397,7 @@ static int floppy_grab_irq_and_dma(void) * We might have scheduled a free_irq(), wait it to * drain first: */ - flush_workqueue(floppy_wq); + flush_work_sync(&floppy_work); if (fd_request_irq()) { DPRINT("Unable to grab IRQ%d for the floppy driver\n", @@ -4480,9 +4488,9 @@ static void floppy_release_irq_and_dma(void) pr_info("motor off timer %d still active\n", drive); #endif - if (delayed_work_pending(&fd_timeout)) + if (timer_pending(&fd_timeout)) pr_info("floppy timer still active:%s\n", timeout_message); - if (delayed_work_pending(&fd_timer)) + if (timer_pending(&fd_timer)) pr_info("auxiliary floppy timer still active\n"); if (work_pending(&floppy_work)) pr_info("work still pending\n"); @@ -4552,9 +4560,8 @@ static void __exit floppy_module_exit(void) put_disk(disks[drive]); } - cancel_delayed_work_sync(&fd_timeout); - cancel_delayed_work_sync(&fd_timer); - destroy_workqueue(floppy_wq); + del_timer_sync(&fd_timeout); + del_timer_sync(&fd_timer); if (atomic_read(&usage_count)) floppy_release_irq_and_dma(); diff --git a/trunk/drivers/block/xen-blkfront.c b/trunk/drivers/block/xen-blkfront.c index 60eed4bdd2e4..4e86393a09cf 100644 --- a/trunk/drivers/block/xen-blkfront.c +++ b/trunk/drivers/block/xen-blkfront.c @@ -526,14 +526,6 @@ static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) return 0; } -static char *encode_disk_name(char *ptr, unsigned int n) -{ - if (n >= 26) - ptr = encode_disk_name(ptr, n / 26 - 1); - *ptr = 'a' + n % 26; - return ptr + 1; -} - static int xlvbd_alloc_gendisk(blkif_sector_t capacity, struct blkfront_info *info, u16 vdisk_info, u16 sector_size) @@ -544,7 +536,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, unsigned int offset; int minor; int nr_parts; - char *ptr; BUG_ON(info->gd != NULL); BUG_ON(info->rq != NULL); @@ -569,11 +560,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, "emulated IDE disks,\n\t choose an xvd device name" "from xvde on\n", info->vdevice); } - if (minor >> MINORBITS) { - pr_warn("blkfront: %#x's minor (%#x) out of range; ignoring\n", - info->vdevice, minor); - return -ENODEV; - } + err = -ENODEV; if ((minor % nr_parts) == 0) nr_minors = nr_parts; @@ -587,14 +574,23 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, if (gd == NULL) goto release; - strcpy(gd->disk_name, DEV_NAME); - ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset); - BUG_ON(ptr >= gd->disk_name + DISK_NAME_LEN); - if (nr_minors > 1) - *ptr = 0; - else - snprintf(ptr, gd->disk_name + DISK_NAME_LEN - ptr, - "%d", minor & (nr_parts - 1)); + if (nr_minors > 1) { + if (offset < 26) + sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset); + else + sprintf(gd->disk_name, "%s%c%c", DEV_NAME, + 'a' + ((offset / 26)-1), 'a' + (offset % 26)); + } else { + if (offset < 26) + sprintf(gd->disk_name, "%s%c%d", DEV_NAME, + 'a' + offset, + minor & (nr_parts - 1)); + else + sprintf(gd->disk_name, "%s%c%c%d", DEV_NAME, + 'a' + ((offset / 26) - 1), + 'a' + (offset % 26), + minor & (nr_parts - 1)); + } gd->major = XENVBD_MAJOR; gd->first_minor = minor; @@ -1500,9 +1496,7 @@ module_init(xlblk_init); static void __exit xlblk_exit(void) { - xenbus_unregister_driver(&blkfront_driver); - unregister_blkdev(XENVBD_MAJOR, DEV_NAME); - 
kfree(minors); + return xenbus_unregister_driver(&blkfront_driver); } module_exit(xlblk_exit); diff --git a/trunk/drivers/gpu/drm/drm_crtc.c b/trunk/drivers/gpu/drm/drm_crtc.c index 08a7aa722d6b..92cea9d77ec9 100644 --- a/trunk/drivers/gpu/drm/drm_crtc.c +++ b/trunk/drivers/gpu/drm/drm_crtc.c @@ -2116,7 +2116,7 @@ int drm_mode_addfb(struct drm_device *dev, return ret; } -static int format_check(const struct drm_mode_fb_cmd2 *r) +static int format_check(struct drm_mode_fb_cmd2 *r) { uint32_t format = r->pixel_format & ~DRM_FORMAT_BIG_ENDIAN; @@ -2185,7 +2185,7 @@ static int format_check(const struct drm_mode_fb_cmd2 *r) } } -static int framebuffer_check(const struct drm_mode_fb_cmd2 *r) +static int framebuffer_check(struct drm_mode_fb_cmd2 *r) { int ret, hsub, vsub, num_planes, i; @@ -3126,7 +3126,7 @@ int drm_mode_connector_update_edid_property(struct drm_connector *connector, EXPORT_SYMBOL(drm_mode_connector_update_edid_property); static bool drm_property_change_is_valid(struct drm_property *property, - uint64_t value) + __u64 value) { if (property->flags & DRM_MODE_PROP_IMMUTABLE) return false; @@ -3136,7 +3136,7 @@ static bool drm_property_change_is_valid(struct drm_property *property, return true; } else if (property->flags & DRM_MODE_PROP_BITMASK) { int i; - uint64_t valid_mask = 0; + __u64 valid_mask = 0; for (i = 0; i < property->num_values; i++) valid_mask |= (1ULL << property->values[i]); return !(value & ~valid_mask); diff --git a/trunk/drivers/gpu/drm/drm_edid.c b/trunk/drivers/gpu/drm/drm_edid.c index c3b5139eba7f..608bddfc7e35 100644 --- a/trunk/drivers/gpu/drm/drm_edid.c +++ b/trunk/drivers/gpu/drm/drm_edid.c @@ -66,8 +66,6 @@ #define EDID_QUIRK_FIRST_DETAILED_PREFERRED (1 << 5) /* use +hsync +vsync for detailed mode */ #define EDID_QUIRK_DETAILED_SYNC_PP (1 << 6) -/* Force reduced-blanking timings for detailed modes */ -#define EDID_QUIRK_FORCE_REDUCED_BLANKING (1 << 7) struct detailed_mode_closure { struct drm_connector *connector; @@ -122,9 +120,6 @@ static struct edid_quirk { /* Samsung SyncMaster 22[5-6]BW */ { "SAM", 596, EDID_QUIRK_PREFER_LARGE_60 }, { "SAM", 638, EDID_QUIRK_PREFER_LARGE_60 }, - - /* ViewSonic VA2026w */ - { "VSC", 5020, EDID_QUIRK_FORCE_REDUCED_BLANKING }, }; /*** DDC fetch and block validation ***/ @@ -890,19 +885,12 @@ static struct drm_display_mode *drm_mode_detailed(struct drm_device *dev, "Wrong Hsync/Vsync pulse width\n"); return NULL; } - - if (quirks & EDID_QUIRK_FORCE_REDUCED_BLANKING) { - mode = drm_cvt_mode(dev, hactive, vactive, 60, true, false, false); - if (!mode) - return NULL; - - goto set_size; - } - mode = drm_mode_create(dev); if (!mode) return NULL; + mode->type = DRM_MODE_TYPE_DRIVER; + if (quirks & EDID_QUIRK_135_CLOCK_TOO_HIGH) timing->pixel_clock = cpu_to_le16(1088); @@ -926,6 +914,8 @@ static struct drm_display_mode *drm_mode_detailed(struct drm_device *dev, drm_mode_do_interlace_quirk(mode, pt); + drm_mode_set_name(mode); + if (quirks & EDID_QUIRK_DETAILED_SYNC_PP) { pt->misc |= DRM_EDID_PT_HSYNC_POSITIVE | DRM_EDID_PT_VSYNC_POSITIVE; } @@ -935,7 +925,6 @@ static struct drm_display_mode *drm_mode_detailed(struct drm_device *dev, mode->flags |= (pt->misc & DRM_EDID_PT_VSYNC_POSITIVE) ? 
DRM_MODE_FLAG_PVSYNC : DRM_MODE_FLAG_NVSYNC; -set_size: mode->width_mm = pt->width_mm_lo | (pt->width_height_mm_hi & 0xf0) << 4; mode->height_mm = pt->height_mm_lo | (pt->width_height_mm_hi & 0xf) << 8; @@ -949,9 +938,6 @@ static struct drm_display_mode *drm_mode_detailed(struct drm_device *dev, mode->height_mm = edid->height_cm * 10; } - mode->type = DRM_MODE_TYPE_DRIVER; - drm_mode_set_name(mode); - return mode; } diff --git a/trunk/drivers/gpu/drm/i915/i915_debugfs.c b/trunk/drivers/gpu/drm/i915/i915_debugfs.c index 5363e9c66c27..eb2b3c25b9e1 100644 --- a/trunk/drivers/gpu/drm/i915/i915_debugfs.c +++ b/trunk/drivers/gpu/drm/i915/i915_debugfs.c @@ -2032,8 +2032,6 @@ void i915_debugfs_cleanup(struct drm_minor *minor) 1, minor); drm_debugfs_remove_files((struct drm_info_list *) &i915_ring_stop_fops, 1, minor); - drm_debugfs_remove_files((struct drm_info_list *) &i915_error_state_fops, - 1, minor); } #endif /* CONFIG_DEBUG_FS */ diff --git a/trunk/drivers/gpu/drm/i915/i915_gem.c b/trunk/drivers/gpu/drm/i915/i915_gem.c index 288d7b8f49ae..c1e5c66553df 100644 --- a/trunk/drivers/gpu/drm/i915/i915_gem.c +++ b/trunk/drivers/gpu/drm/i915/i915_gem.c @@ -2063,8 +2063,10 @@ i915_gem_object_unbind(struct drm_i915_gem_object *obj) if (obj->gtt_space == NULL) return 0; - if (obj->pin_count) - return -EBUSY; + if (obj->pin_count != 0) { + DRM_ERROR("Attempting to unbind pinned buffer\n"); + return -EINVAL; + } ret = i915_gem_object_finish_gpu(obj); if (ret) @@ -3291,7 +3293,6 @@ struct drm_i915_gem_object *i915_gem_alloc_object(struct drm_device *dev, struct drm_i915_private *dev_priv = dev->dev_private; struct drm_i915_gem_object *obj; struct address_space *mapping; - u32 mask; obj = kzalloc(sizeof(*obj), GFP_KERNEL); if (obj == NULL) @@ -3302,15 +3303,8 @@ struct drm_i915_gem_object *i915_gem_alloc_object(struct drm_device *dev, return NULL; } - mask = GFP_HIGHUSER | __GFP_RECLAIMABLE; - if (IS_CRESTLINE(dev) || IS_BROADWATER(dev)) { - /* 965gm cannot relocate objects above 4GiB. 
*/ - mask &= ~__GFP_HIGHMEM; - mask |= __GFP_DMA32; - } - mapping = obj->base.filp->f_path.dentry->d_inode->i_mapping; - mapping_set_gfp_mask(mapping, mask); + mapping_set_gfp_mask(mapping, GFP_HIGHUSER | __GFP_RECLAIMABLE); i915_gem_info_add_obj(dev_priv, size); diff --git a/trunk/drivers/gpu/drm/i915/i915_irq.c b/trunk/drivers/gpu/drm/i915/i915_irq.c index 1417660a93ec..cc4a63307611 100644 --- a/trunk/drivers/gpu/drm/i915/i915_irq.c +++ b/trunk/drivers/gpu/drm/i915/i915_irq.c @@ -350,8 +350,8 @@ static void gen6_pm_rps_work(struct work_struct *work) { drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t, rps_work); + u8 new_delay = dev_priv->cur_delay; u32 pm_iir, pm_imr; - u8 new_delay; spin_lock_irq(&dev_priv->rps_lock); pm_iir = dev_priv->pm_iir; @@ -360,18 +360,41 @@ static void gen6_pm_rps_work(struct work_struct *work) I915_WRITE(GEN6_PMIMR, 0); spin_unlock_irq(&dev_priv->rps_lock); - if ((pm_iir & GEN6_PM_DEFERRED_EVENTS) == 0) + if (!pm_iir) return; mutex_lock(&dev_priv->dev->struct_mutex); - - if (pm_iir & GEN6_PM_RP_UP_THRESHOLD) - new_delay = dev_priv->cur_delay + 1; - else - new_delay = dev_priv->cur_delay - 1; + if (pm_iir & GEN6_PM_RP_UP_THRESHOLD) { + if (dev_priv->cur_delay != dev_priv->max_delay) + new_delay = dev_priv->cur_delay + 1; + if (new_delay > dev_priv->max_delay) + new_delay = dev_priv->max_delay; + } else if (pm_iir & (GEN6_PM_RP_DOWN_THRESHOLD | GEN6_PM_RP_DOWN_TIMEOUT)) { + gen6_gt_force_wake_get(dev_priv); + if (dev_priv->cur_delay != dev_priv->min_delay) + new_delay = dev_priv->cur_delay - 1; + if (new_delay < dev_priv->min_delay) { + new_delay = dev_priv->min_delay; + I915_WRITE(GEN6_RP_INTERRUPT_LIMITS, + I915_READ(GEN6_RP_INTERRUPT_LIMITS) | + ((new_delay << 16) & 0x3f0000)); + } else { + /* Make sure we continue to get down interrupts + * until we hit the minimum frequency */ + I915_WRITE(GEN6_RP_INTERRUPT_LIMITS, + I915_READ(GEN6_RP_INTERRUPT_LIMITS) & ~0x3f0000); + } + gen6_gt_force_wake_put(dev_priv); + } gen6_set_rps(dev_priv->dev, new_delay); + dev_priv->cur_delay = new_delay; + /* + * rps_lock not held here because clearing is non-destructive. There is + * an *extremely* unlikely race with gen6_rps_enable() that is prevented + * by holding struct_mutex for the duration of the write. 
+ */ mutex_unlock(&dev_priv->dev->struct_mutex); } diff --git a/trunk/drivers/gpu/drm/i915/intel_display.c b/trunk/drivers/gpu/drm/i915/intel_display.c index 914789420906..ee61ad1e642b 100644 --- a/trunk/drivers/gpu/drm/i915/intel_display.c +++ b/trunk/drivers/gpu/drm/i915/intel_display.c @@ -910,10 +910,9 @@ static void assert_pll(struct drm_i915_private *dev_priv, /* For ILK+ */ static void assert_pch_pll(struct drm_i915_private *dev_priv, - struct intel_pch_pll *pll, - struct intel_crtc *crtc, - bool state) + struct intel_crtc *intel_crtc, bool state) { + int reg; u32 val; bool cur_state; @@ -922,37 +921,30 @@ static void assert_pch_pll(struct drm_i915_private *dev_priv, return; } - if (WARN (!pll, - "asserting PCH PLL %s with no PLL\n", state_string(state))) + if (!intel_crtc->pch_pll) { + WARN(1, "asserting PCH PLL enabled with no PLL\n"); return; + } - val = I915_READ(pll->pll_reg); - cur_state = !!(val & DPLL_VCO_ENABLE); - WARN(cur_state != state, - "PCH PLL state for reg %x assertion failure (expected %s, current %s), val=%08x\n", - pll->pll_reg, state_string(state), state_string(cur_state), val); - - /* Make sure the selected PLL is correctly attached to the transcoder */ - if (crtc && HAS_PCH_CPT(dev_priv->dev)) { + if (HAS_PCH_CPT(dev_priv->dev)) { u32 pch_dpll; pch_dpll = I915_READ(PCH_DPLL_SEL); - cur_state = pll->pll_reg == _PCH_DPLL_B; - if (!WARN(((pch_dpll >> (4 * crtc->pipe)) & 1) != cur_state, - "PLL[%d] not attached to this transcoder %d: %08x\n", - cur_state, crtc->pipe, pch_dpll)) { - cur_state = !!(val >> (4*crtc->pipe + 3)); - WARN(cur_state != state, - "PLL[%d] not %s on this transcoder %d: %08x\n", - pll->pll_reg == _PCH_DPLL_B, - state_string(state), - crtc->pipe, - val); - } + + /* Make sure the selected PLL is enabled to the transcoder */ + WARN(!((pch_dpll >> (4 * intel_crtc->pipe)) & 8), + "transcoder %d PLL not enabled\n", intel_crtc->pipe); } + + reg = intel_crtc->pch_pll->pll_reg; + val = I915_READ(reg); + cur_state = !!(val & DPLL_VCO_ENABLE); + WARN(cur_state != state, + "PCH PLL state assertion failure (expected %s, current %s)\n", + state_string(state), state_string(cur_state)); } -#define assert_pch_pll_enabled(d, p, c) assert_pch_pll(d, p, c, true) -#define assert_pch_pll_disabled(d, p, c) assert_pch_pll(d, p, c, false) +#define assert_pch_pll_enabled(d, p) assert_pch_pll(d, p, true) +#define assert_pch_pll_disabled(d, p) assert_pch_pll(d, p, false) static void assert_fdi_tx(struct drm_i915_private *dev_priv, enum pipe pipe, bool state) @@ -1432,7 +1424,7 @@ static void intel_enable_pch_pll(struct intel_crtc *intel_crtc) assert_pch_refclk_enabled(dev_priv); if (pll->active++ && pll->on) { - assert_pch_pll_enabled(dev_priv, pll, NULL); + assert_pch_pll_enabled(dev_priv, intel_crtc); return; } @@ -1468,12 +1460,12 @@ static void intel_disable_pch_pll(struct intel_crtc *intel_crtc) intel_crtc->base.base.id); if (WARN_ON(pll->active == 0)) { - assert_pch_pll_disabled(dev_priv, pll, NULL); + assert_pch_pll_disabled(dev_priv, intel_crtc); return; } if (--pll->active) { - assert_pch_pll_enabled(dev_priv, pll, NULL); + assert_pch_pll_enabled(dev_priv, intel_crtc); return; } @@ -1503,9 +1495,7 @@ static void intel_enable_transcoder(struct drm_i915_private *dev_priv, BUG_ON(dev_priv->info->gen < 5); /* Make sure PCH DPLL is enabled */ - assert_pch_pll_enabled(dev_priv, - to_intel_crtc(crtc)->pch_pll, - to_intel_crtc(crtc)); + assert_pch_pll_enabled(dev_priv, to_intel_crtc(crtc)); /* FDI must be feeding us bits for PCH ports */ assert_fdi_tx_enabled(dev_priv, 
pipe); diff --git a/trunk/drivers/gpu/drm/i915/intel_dp.c b/trunk/drivers/gpu/drm/i915/intel_dp.c index 296cfc201a81..71c7096e3869 100644 --- a/trunk/drivers/gpu/drm/i915/intel_dp.c +++ b/trunk/drivers/gpu/drm/i915/intel_dp.c @@ -266,9 +266,6 @@ intel_dp_mode_valid(struct drm_connector *connector, if (mode->clock < 10000) return MODE_CLOCK_LOW; - if (mode->flags & DRM_MODE_FLAG_DBLCLK) - return MODE_H_ILLEGAL; - return MODE_OK; } @@ -705,9 +702,6 @@ intel_dp_mode_fixup(struct drm_encoder *encoder, struct drm_display_mode *mode, mode->clock = intel_dp->panel_fixed_mode->clock; } - if (mode->flags & DRM_MODE_FLAG_DBLCLK) - return false; - DRM_DEBUG_KMS("DP link computation with max lane count %i " "max bw %02x pixel clock %iKHz\n", max_lane_count, bws[max_clock], mode->clock); @@ -1160,10 +1154,11 @@ static void ironlake_edp_panel_off(struct intel_dp *intel_dp) DRM_DEBUG_KMS("Turn eDP power off\n"); - WARN(!intel_dp->want_panel_vdd, "Need VDD to turn off panel\n"); + WARN(intel_dp->want_panel_vdd, "Cannot turn power off while VDD is on\n"); + ironlake_panel_vdd_off_sync(intel_dp); /* finish any pending work */ pp = ironlake_get_pp_control(dev_priv); - pp &= ~(POWER_TARGET_ON | PANEL_POWER_RESET | EDP_BLC_ENABLE); + pp &= ~(POWER_TARGET_ON | EDP_FORCE_VDD | PANEL_POWER_RESET | EDP_BLC_ENABLE); I915_WRITE(PCH_PP_CONTROL, pp); POSTING_READ(PCH_PP_CONTROL); @@ -1271,16 +1266,18 @@ static void intel_dp_prepare(struct drm_encoder *encoder) { struct intel_dp *intel_dp = enc_to_intel_dp(encoder); - - /* Make sure the panel is off before trying to change the mode. But also - * ensure that we have vdd while we switch off the panel. */ - ironlake_edp_panel_vdd_on(intel_dp); ironlake_edp_backlight_off(intel_dp); ironlake_edp_panel_off(intel_dp); + /* Wake up the sink first */ + ironlake_edp_panel_vdd_on(intel_dp); intel_dp_sink_dpms(intel_dp, DRM_MODE_DPMS_ON); intel_dp_link_down(intel_dp); ironlake_edp_panel_vdd_off(intel_dp, false); + + /* Make sure the panel is off before trying to + * change the mode + */ } static void intel_dp_commit(struct drm_encoder *encoder) @@ -1312,11 +1309,10 @@ intel_dp_dpms(struct drm_encoder *encoder, int mode) uint32_t dp_reg = I915_READ(intel_dp->output_reg); if (mode != DRM_MODE_DPMS_ON) { - /* Switching the panel off requires vdd. */ - ironlake_edp_panel_vdd_on(intel_dp); ironlake_edp_backlight_off(intel_dp); ironlake_edp_panel_off(intel_dp); + ironlake_edp_panel_vdd_on(intel_dp); intel_dp_sink_dpms(intel_dp, mode); intel_dp_link_down(intel_dp); ironlake_edp_panel_vdd_off(intel_dp, false); diff --git a/trunk/drivers/gpu/drm/i915/intel_i2c.c b/trunk/drivers/gpu/drm/i915/intel_i2c.c index 1991a4408cf9..4a9707dd0f9c 100644 --- a/trunk/drivers/gpu/drm/i915/intel_i2c.c +++ b/trunk/drivers/gpu/drm/i915/intel_i2c.c @@ -396,22 +396,11 @@ gmbus_xfer(struct i2c_adapter *adapter, * Wait for bus to IDLE before clearing NAK. * If we clear the NAK while bus is still active, then it will stay * active and the next transaction may fail. - * - * If no ACK is received during the address phase of a transaction, the - * adapter must report -ENXIO. It is not clear what to return if no ACK - * is received at other times. But we have to be careful to not return - * spurious -ENXIO because that will prevent i2c and drm edid functions - * from retrying. So return -ENXIO only when gmbus properly quiescents - - * timing out seems to happen when there _is_ a ddc chip present, but - * it's slow responding and only answers on the 2nd retry. 
*/ - ret = -ENXIO; if (wait_for((I915_READ(GMBUS2 + reg_offset) & GMBUS_ACTIVE) == 0, - 10)) { + 10)) DRM_DEBUG_KMS("GMBUS [%s] timed out after NAK\n", adapter->name); - ret = -ETIMEDOUT; - } /* Toggle the Software Clear Interrupt bit. This has the effect * of resetting the GMBUS controller and so clearing the @@ -425,6 +414,14 @@ gmbus_xfer(struct i2c_adapter *adapter, adapter->name, msgs[i].addr, (msgs[i].flags & I2C_M_RD) ? 'r' : 'w', msgs[i].len); + /* + * If no ACK is received during the address phase of a transaction, + * the adapter must report -ENXIO. + * It is not clear what to return if no ACK is received at other times. + * So, we always return -ENXIO in all NAK cases, to ensure we send + * it at least during the one case that is specified. + */ + ret = -ENXIO; goto out; timeout: diff --git a/trunk/drivers/gpu/drm/i915/intel_lvds.c b/trunk/drivers/gpu/drm/i915/intel_lvds.c index 08eb04c787e8..9dee82350def 100644 --- a/trunk/drivers/gpu/drm/i915/intel_lvds.c +++ b/trunk/drivers/gpu/drm/i915/intel_lvds.c @@ -745,14 +745,6 @@ static const struct dmi_system_id intel_no_lvds[] = { DMI_MATCH(DMI_BOARD_NAME, "AT5NM10T-I"), }, }, - { - .callback = intel_no_lvds_dmi_callback, - .ident = "Hewlett-Packard HP t5740e Thin Client", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), - DMI_MATCH(DMI_PRODUCT_NAME, "HP t5740e Thin Client"), - }, - }, { .callback = intel_no_lvds_dmi_callback, .ident = "Hewlett-Packard t5745", diff --git a/trunk/drivers/gpu/drm/i915/intel_pm.c b/trunk/drivers/gpu/drm/i915/intel_pm.c index d0ce2a5b1d3f..8e79ff67ec98 100644 --- a/trunk/drivers/gpu/drm/i915/intel_pm.c +++ b/trunk/drivers/gpu/drm/i915/intel_pm.c @@ -2270,33 +2270,10 @@ void ironlake_disable_drps(struct drm_device *dev) void gen6_set_rps(struct drm_device *dev, u8 val) { struct drm_i915_private *dev_priv = dev->dev_private; - u32 limits; + u32 swreq; - limits = 0; - if (val >= dev_priv->max_delay) - val = dev_priv->max_delay; - else - limits |= dev_priv->max_delay << 24; - - if (val <= dev_priv->min_delay) - val = dev_priv->min_delay; - else - limits |= dev_priv->min_delay << 16; - - if (val == dev_priv->cur_delay) - return; - - I915_WRITE(GEN6_RPNSWREQ, - GEN6_FREQUENCY(val) | - GEN6_OFFSET(0) | - GEN6_AGGRESSIVE_TURBO); - - /* Make sure we continue to get interrupts - * until we hit the minimum or maximum frequencies. 
- */ - I915_WRITE(GEN6_RP_INTERRUPT_LIMITS, limits); - - dev_priv->cur_delay = val; + swreq = (val & 0x3ff) << 25; + I915_WRITE(GEN6_RPNSWREQ, swreq); } void gen6_disable_rps(struct drm_device *dev) @@ -2350,10 +2327,11 @@ int intel_enable_rc6(const struct drm_device *dev) void gen6_enable_rps(struct drm_i915_private *dev_priv) { struct intel_ring_buffer *ring; - u32 rp_state_cap; - u32 gt_perf_status; + u32 rp_state_cap = I915_READ(GEN6_RP_STATE_CAP); + u32 gt_perf_status = I915_READ(GEN6_GT_PERF_STATUS); u32 pcu_mbox, rc6_mask = 0; u32 gtfifodbg; + int cur_freq, min_freq, max_freq; int rc6_mode; int i; @@ -2374,14 +2352,6 @@ void gen6_enable_rps(struct drm_i915_private *dev_priv) gen6_gt_force_wake_get(dev_priv); - rp_state_cap = I915_READ(GEN6_RP_STATE_CAP); - gt_perf_status = I915_READ(GEN6_GT_PERF_STATUS); - - /* In units of 100MHz */ - dev_priv->max_delay = rp_state_cap & 0xff; - dev_priv->min_delay = (rp_state_cap & 0xff0000) >> 16; - dev_priv->cur_delay = 0; - /* disable the counters and set deterministic thresholds */ I915_WRITE(GEN6_RC_CONTROL, 0); @@ -2429,8 +2399,8 @@ void gen6_enable_rps(struct drm_i915_private *dev_priv) I915_WRITE(GEN6_RP_DOWN_TIMEOUT, 1000000); I915_WRITE(GEN6_RP_INTERRUPT_LIMITS, - dev_priv->max_delay << 24 | - dev_priv->min_delay << 16); + 18 << 24 | + 6 << 16); I915_WRITE(GEN6_RP_UP_THRESHOLD, 10000); I915_WRITE(GEN6_RP_DOWN_THRESHOLD, 1000000); I915_WRITE(GEN6_RP_UP_EI, 100000); @@ -2438,7 +2408,7 @@ void gen6_enable_rps(struct drm_i915_private *dev_priv) I915_WRITE(GEN6_RP_IDLE_HYSTERSIS, 10); I915_WRITE(GEN6_RP_CONTROL, GEN6_RP_MEDIA_TURBO | - GEN6_RP_MEDIA_HW_NORMAL_MODE | + GEN6_RP_MEDIA_HW_MODE | GEN6_RP_MEDIA_IS_GFX | GEN6_RP_ENABLE | GEN6_RP_UP_BUSY_AVG | @@ -2456,6 +2426,10 @@ void gen6_enable_rps(struct drm_i915_private *dev_priv) 500)) DRM_ERROR("timeout waiting for pcode mailbox to finish\n"); + min_freq = (rp_state_cap & 0xff0000) >> 16; + max_freq = rp_state_cap & 0xff; + cur_freq = (gt_perf_status & 0xff00) >> 8; + /* Check for overclock support */ if (wait_for((I915_READ(GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY) == 0, 500)) @@ -2466,11 +2440,14 @@ void gen6_enable_rps(struct drm_i915_private *dev_priv) 500)) DRM_ERROR("timeout waiting for pcode mailbox to finish\n"); if (pcu_mbox & (1<<31)) { /* OC supported */ - dev_priv->max_delay = pcu_mbox & 0xff; + max_freq = pcu_mbox & 0xff; DRM_DEBUG_DRIVER("overclocking supported, adjusting frequency max to %dMHz\n", pcu_mbox * 50); } - gen6_set_rps(dev_priv->dev, (gt_perf_status & 0xff00) >> 8); + /* In units of 100MHz */ + dev_priv->max_delay = max_freq; + dev_priv->min_delay = min_freq; + dev_priv->cur_delay = cur_freq; /* requires MSI enabled */ I915_WRITE(GEN6_PMIER, @@ -3603,9 +3580,8 @@ static void gen6_sanitize_pm(struct drm_device *dev) limits |= (dev_priv->min_delay & 0x3f) << 16; if (old != limits) { - /* Note that the known failure case is to read back 0. 
*/ - DRM_DEBUG_DRIVER("Power management discrepancy: GEN6_RP_INTERRUPT_LIMITS " - "expected %08x, was %08x\n", limits, old); + DRM_ERROR("Power management discrepancy: GEN6_RP_INTERRUPT_LIMITS expected %08x, was %08x\n", + limits, old); I915_WRITE(GEN6_RP_INTERRUPT_LIMITS, limits); } diff --git a/trunk/drivers/gpu/drm/i915/intel_sdvo.c b/trunk/drivers/gpu/drm/i915/intel_sdvo.c index b6a9d45fc3c6..a949b73880c8 100644 --- a/trunk/drivers/gpu/drm/i915/intel_sdvo.c +++ b/trunk/drivers/gpu/drm/i915/intel_sdvo.c @@ -783,12 +783,10 @@ static void intel_sdvo_get_dtd_from_mode(struct intel_sdvo_dtd *dtd, ((v_sync_len & 0x30) >> 4); dtd->part2.dtd_flags = 0x18; - if (mode->flags & DRM_MODE_FLAG_INTERLACE) - dtd->part2.dtd_flags |= DTD_FLAG_INTERLACE; if (mode->flags & DRM_MODE_FLAG_PHSYNC) - dtd->part2.dtd_flags |= DTD_FLAG_HSYNC_POSITIVE; + dtd->part2.dtd_flags |= 0x2; if (mode->flags & DRM_MODE_FLAG_PVSYNC) - dtd->part2.dtd_flags |= DTD_FLAG_VSYNC_POSITIVE; + dtd->part2.dtd_flags |= 0x4; dtd->part2.sdvo_flags = 0; dtd->part2.v_sync_off_high = v_sync_offset & 0xc0; @@ -822,11 +820,9 @@ static void intel_sdvo_get_mode_from_dtd(struct drm_display_mode * mode, mode->clock = dtd->part1.clock * 10; mode->flags &= ~(DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC); - if (dtd->part2.dtd_flags & DTD_FLAG_INTERLACE) - mode->flags |= DRM_MODE_FLAG_INTERLACE; - if (dtd->part2.dtd_flags & DTD_FLAG_HSYNC_POSITIVE) + if (dtd->part2.dtd_flags & 0x2) mode->flags |= DRM_MODE_FLAG_PHSYNC; - if (dtd->part2.dtd_flags & DTD_FLAG_VSYNC_POSITIVE) + if (dtd->part2.dtd_flags & 0x4) mode->flags |= DRM_MODE_FLAG_PVSYNC; } diff --git a/trunk/drivers/gpu/drm/i915/intel_sdvo_regs.h b/trunk/drivers/gpu/drm/i915/intel_sdvo_regs.h index 9d030142ee43..6b7b22f4d63e 100644 --- a/trunk/drivers/gpu/drm/i915/intel_sdvo_regs.h +++ b/trunk/drivers/gpu/drm/i915/intel_sdvo_regs.h @@ -61,11 +61,6 @@ struct intel_sdvo_caps { u16 output_flags; } __attribute__((packed)); -/* Note: SDVO detailed timing flags match EDID misc flags. 
*/ -#define DTD_FLAG_HSYNC_POSITIVE (1 << 1) -#define DTD_FLAG_VSYNC_POSITIVE (1 << 2) -#define DTD_FLAG_INTERLACE (1 << 7) - /** This matches the EDID DTD structure, more or less */ struct intel_sdvo_dtd { struct { diff --git a/trunk/drivers/gpu/drm/i915/intel_tv.c b/trunk/drivers/gpu/drm/i915/intel_tv.c index a233a51fd7e6..3346612d2953 100644 --- a/trunk/drivers/gpu/drm/i915/intel_tv.c +++ b/trunk/drivers/gpu/drm/i915/intel_tv.c @@ -673,54 +673,6 @@ static const struct tv_mode tv_modes[] = { .filter_table = filter_table, }, - { - .name = "480p", - .clock = 107520, - .refresh = 59940, - .oversample = TV_OVERSAMPLE_4X, - .component_only = 1, - - .hsync_end = 64, .hblank_end = 122, - .hblank_start = 842, .htotal = 857, - - .progressive = true, .trilevel_sync = false, - - .vsync_start_f1 = 12, .vsync_start_f2 = 12, - .vsync_len = 12, - - .veq_ena = false, - - .vi_end_f1 = 44, .vi_end_f2 = 44, - .nbr_end = 479, - - .burst_ena = false, - - .filter_table = filter_table, - }, - { - .name = "576p", - .clock = 107520, - .refresh = 50000, - .oversample = TV_OVERSAMPLE_4X, - .component_only = 1, - - .hsync_end = 64, .hblank_end = 139, - .hblank_start = 859, .htotal = 863, - - .progressive = true, .trilevel_sync = false, - - .vsync_start_f1 = 10, .vsync_start_f2 = 10, - .vsync_len = 10, - - .veq_ena = false, - - .vi_end_f1 = 48, .vi_end_f2 = 48, - .nbr_end = 575, - - .burst_ena = false, - - .filter_table = filter_table, - }, { .name = "720p@60Hz", .clock = 148800, @@ -1242,11 +1194,6 @@ intel_tv_detect_type(struct intel_tv *intel_tv, I915_WRITE(TV_DAC, save_tv_dac & ~TVDAC_STATE_CHG_EN); I915_WRITE(TV_CTL, save_tv_ctl); - POSTING_READ(TV_CTL); - - /* For unknown reasons the hw barfs if we don't do this vblank wait. */ - intel_wait_for_vblank(intel_tv->base.base.dev, - to_intel_crtc(intel_tv->base.base.crtc)->pipe); /* Restore interrupt config */ if (connector->polled & DRM_CONNECTOR_POLL_HPD) { diff --git a/trunk/drivers/gpu/drm/radeon/ni.c b/trunk/drivers/gpu/drm/radeon/ni.c index ce4e7cc6c905..b01c2dd627b0 100644 --- a/trunk/drivers/gpu/drm/radeon/ni.c +++ b/trunk/drivers/gpu/drm/radeon/ni.c @@ -865,7 +865,7 @@ static void cayman_gpu_init(struct radeon_device *rdev) /* num banks is 8 on all fusion asics. 
0 = 4, 1 = 8, 2 = 16 */ if (rdev->flags & RADEON_IS_IGP) - rdev->config.cayman.tile_config |= 1 << 4; + rdev->config.evergreen.tile_config |= 1 << 4; else rdev->config.cayman.tile_config |= ((mc_arb_ramcfg & NOOFBANK_MASK) >> NOOFBANK_SHIFT) << 4; diff --git a/trunk/drivers/gpu/drm/radeon/radeon.h b/trunk/drivers/gpu/drm/radeon/radeon.h index 492654f8ee74..1dc3a4aba020 100644 --- a/trunk/drivers/gpu/drm/radeon/radeon.h +++ b/trunk/drivers/gpu/drm/radeon/radeon.h @@ -848,6 +848,7 @@ struct radeon_cs_parser { s32 priority; }; +extern int radeon_cs_update_pages(struct radeon_cs_parser *p, int pg_idx); extern int radeon_cs_finish_pages(struct radeon_cs_parser *p); extern u32 radeon_get_ib_value(struct radeon_cs_parser *p, int idx); diff --git a/trunk/drivers/gpu/drm/radeon/radeon_atombios.c b/trunk/drivers/gpu/drm/radeon/radeon_atombios.c index b1e3820df363..f6e69b8c06c6 100644 --- a/trunk/drivers/gpu/drm/radeon/radeon_atombios.c +++ b/trunk/drivers/gpu/drm/radeon/radeon_atombios.c @@ -444,9 +444,7 @@ static bool radeon_atom_apply_quirks(struct drm_device *dev, */ if ((dev->pdev->device == 0x9498) && (dev->pdev->subsystem_vendor == 0x1682) && - (dev->pdev->subsystem_device == 0x2452) && - (i2c_bus->valid == false) && - !(supported_device & (ATOM_DEVICE_TV_SUPPORT | ATOM_DEVICE_CV_SUPPORT))) { + (dev->pdev->subsystem_device == 0x2452)) { struct radeon_device *rdev = dev->dev_private; *i2c_bus = radeon_lookup_i2c_gpio(rdev, 0x93); } diff --git a/trunk/drivers/gpu/drm/radeon/radeon_cs.c b/trunk/drivers/gpu/drm/radeon/radeon_cs.c index 0137689ed461..c7d64a739033 100644 --- a/trunk/drivers/gpu/drm/radeon/radeon_cs.c +++ b/trunk/drivers/gpu/drm/radeon/radeon_cs.c @@ -580,7 +580,7 @@ int radeon_cs_finish_pages(struct radeon_cs_parser *p) return 0; } -static int radeon_cs_update_pages(struct radeon_cs_parser *p, int pg_idx) +int radeon_cs_update_pages(struct radeon_cs_parser *p, int pg_idx) { int new_page; struct radeon_cs_chunk *ibc = &p->chunks[p->chunk_ib_idx]; @@ -623,28 +623,3 @@ static int radeon_cs_update_pages(struct radeon_cs_parser *p, int pg_idx) return new_page; } - -u32 radeon_get_ib_value(struct radeon_cs_parser *p, int idx) -{ - struct radeon_cs_chunk *ibc = &p->chunks[p->chunk_ib_idx]; - u32 pg_idx, pg_offset; - u32 idx_value = 0; - int new_page; - - pg_idx = (idx * 4) / PAGE_SIZE; - pg_offset = (idx * 4) % PAGE_SIZE; - - if (ibc->kpage_idx[0] == pg_idx) - return ibc->kpage[0][pg_offset/4]; - if (ibc->kpage_idx[1] == pg_idx) - return ibc->kpage[1][pg_offset/4]; - - new_page = radeon_cs_update_pages(p, pg_idx); - if (new_page < 0) { - p->parser_error = new_page; - return 0; - } - - idx_value = ibc->kpage[new_page][pg_offset/4]; - return idx_value; -} diff --git a/trunk/drivers/gpu/drm/radeon/radeon_ring.c b/trunk/drivers/gpu/drm/radeon/radeon_ring.c index 983658c91358..493a7be75306 100644 --- a/trunk/drivers/gpu/drm/radeon/radeon_ring.c +++ b/trunk/drivers/gpu/drm/radeon/radeon_ring.c @@ -39,6 +39,31 @@ */ int radeon_debugfs_sa_init(struct radeon_device *rdev); +u32 radeon_get_ib_value(struct radeon_cs_parser *p, int idx) +{ + struct radeon_cs_chunk *ibc = &p->chunks[p->chunk_ib_idx]; + u32 pg_idx, pg_offset; + u32 idx_value = 0; + int new_page; + + pg_idx = (idx * 4) / PAGE_SIZE; + pg_offset = (idx * 4) % PAGE_SIZE; + + if (ibc->kpage_idx[0] == pg_idx) + return ibc->kpage[0][pg_offset/4]; + if (ibc->kpage_idx[1] == pg_idx) + return ibc->kpage[1][pg_offset/4]; + + new_page = radeon_cs_update_pages(p, pg_idx); + if (new_page < 0) { + p->parser_error = new_page; + return 0; + } + + 
idx_value = ibc->kpage[new_page][pg_offset/4]; + return idx_value; +} + int radeon_ib_get(struct radeon_device *rdev, int ring, struct radeon_ib *ib, unsigned size) { diff --git a/trunk/drivers/gpu/drm/udl/udl_gem.c b/trunk/drivers/gpu/drm/udl/udl_gem.c index 97acc9c6c95b..40efd32f7dce 100644 --- a/trunk/drivers/gpu/drm/udl/udl_gem.c +++ b/trunk/drivers/gpu/drm/udl/udl_gem.c @@ -234,7 +234,7 @@ int udl_gem_mmap(struct drm_file *file, struct drm_device *dev, ret = udl_gem_get_pages(gobj, GFP_KERNEL); if (ret) - goto out; + return ret; if (!gobj->base.map_list.map) { ret = drm_gem_create_mmap_offset(obj); if (ret) @@ -257,6 +257,8 @@ static int udl_prime_create(struct drm_device *dev, { struct udl_gem_object *obj; int npages; + int i; + struct scatterlist *iter; npages = size / PAGE_SIZE; diff --git a/trunk/drivers/iommu/amd_iommu.c b/trunk/drivers/iommu/amd_iommu.c index d90a421e9cac..a5bee8e2dfce 100644 --- a/trunk/drivers/iommu/amd_iommu.c +++ b/trunk/drivers/iommu/amd_iommu.c @@ -450,27 +450,12 @@ static void dump_command(unsigned long phys_addr) static void iommu_print_event(struct amd_iommu *iommu, void *__evt) { - int type, devid, domid, flags; - volatile u32 *event = __evt; - int count = 0; - u64 address; - -retry: - type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK; - devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK; - domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK; - flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; - address = (u64)(((u64)event[3]) << 32) | event[2]; - - if (type == 0) { - /* Did we hit the erratum? */ - if (++count == LOOP_TIMEOUT) { - pr_err("AMD-Vi: No event written to event log\n"); - return; - } - udelay(1); - goto retry; - } + u32 *event = __evt; + int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK; + int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK; + int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK; + int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; + u64 address = (u64)(((u64)event[3]) << 32) | event[2]; printk(KERN_ERR "AMD-Vi: Event logged ["); @@ -523,8 +508,6 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt) default: printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type); } - - memset(__evt, 0, 4 * sizeof(u32)); } static void iommu_poll_events(struct amd_iommu *iommu) @@ -2052,20 +2035,20 @@ static int pdev_iommuv2_enable(struct pci_dev *pdev) } /* FIXME: Move this to PCI code */ -#define PCI_PRI_TLP_OFF (1 << 15) +#define PCI_PRI_TLP_OFF (1 << 2) bool pci_pri_tlp_required(struct pci_dev *pdev) { - u16 status; + u16 control; int pos; pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); if (!pos) return false; - pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status); + pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control); - return (status & PCI_PRI_TLP_OFF) ? true : false; + return (control & PCI_PRI_TLP_OFF) ? true : false; } /* diff --git a/trunk/drivers/iommu/iommu.c b/trunk/drivers/iommu/iommu.c index 8b9ded88e6f5..2198b2dbbcd3 100644 --- a/trunk/drivers/iommu/iommu.c +++ b/trunk/drivers/iommu/iommu.c @@ -119,7 +119,6 @@ EXPORT_SYMBOL_GPL(iommu_present); * iommu_set_fault_handler() - set a fault handler for an iommu domain * @domain: iommu domain * @handler: fault handler - * @token: user data, will be passed back to the fault handler * * This function should be used by IOMMU users which want to be notified * whenever an IOMMU fault happens. @@ -128,13 +127,11 @@ EXPORT_SYMBOL_GPL(iommu_present); * error code otherwise. 
*/ void iommu_set_fault_handler(struct iommu_domain *domain, - iommu_fault_handler_t handler, - void *token) + iommu_fault_handler_t handler) { BUG_ON(!domain); domain->handler = handler; - domain->handler_token = token; } EXPORT_SYMBOL_GPL(iommu_set_fault_handler); diff --git a/trunk/drivers/iommu/omap-iommu.c b/trunk/drivers/iommu/omap-iommu.c index e70ee2b59df9..6899dcd02dfa 100644 --- a/trunk/drivers/iommu/omap-iommu.c +++ b/trunk/drivers/iommu/omap-iommu.c @@ -41,13 +41,11 @@ * @pgtable: the page table * @iommu_dev: an omap iommu device attached to this domain. only a single * iommu device can be attached for now. - * @dev: Device using this domain. * @lock: domain lock, should be taken when attaching/detaching */ struct omap_iommu_domain { u32 *pgtable; struct omap_iommu *iommu_dev; - struct device *dev; spinlock_t lock; }; @@ -1083,7 +1081,6 @@ omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) } omap_domain->iommu_dev = arch_data->iommu_dev = oiommu; - omap_domain->dev = dev; oiommu->domain = domain; out: @@ -1091,16 +1088,19 @@ omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) return ret; } -static void _omap_iommu_detach_dev(struct omap_iommu_domain *omap_domain, - struct device *dev) +static void omap_iommu_detach_dev(struct iommu_domain *domain, + struct device *dev) { - struct omap_iommu *oiommu = dev_to_omap_iommu(dev); + struct omap_iommu_domain *omap_domain = domain->priv; struct omap_iommu_arch_data *arch_data = dev->archdata.iommu; + struct omap_iommu *oiommu = dev_to_omap_iommu(dev); + + spin_lock(&omap_domain->lock); /* only a single device is supported per domain for now */ if (omap_domain->iommu_dev != oiommu) { dev_err(dev, "invalid iommu device\n"); - return; + goto out; } iopgtable_clear_entry_all(oiommu); @@ -1108,16 +1108,8 @@ static void _omap_iommu_detach_dev(struct omap_iommu_domain *omap_domain, omap_iommu_detach(oiommu); omap_domain->iommu_dev = arch_data->iommu_dev = NULL; - omap_domain->dev = NULL; -} -static void omap_iommu_detach_dev(struct iommu_domain *domain, - struct device *dev) -{ - struct omap_iommu_domain *omap_domain = domain->priv; - - spin_lock(&omap_domain->lock); - _omap_iommu_detach_dev(omap_domain, dev); +out: spin_unlock(&omap_domain->lock); } @@ -1156,19 +1148,13 @@ static int omap_iommu_domain_init(struct iommu_domain *domain) return -ENOMEM; } +/* assume device was already detached */ static void omap_iommu_domain_destroy(struct iommu_domain *domain) { struct omap_iommu_domain *omap_domain = domain->priv; domain->priv = NULL; - /* - * An iommu device is still attached - * (currently, only one device can be attached) ? 
- */ - if (omap_domain->iommu_dev) - _omap_iommu_detach_dev(omap_domain, omap_domain->dev); - kfree(omap_domain->pgtable); kfree(omap_domain); } diff --git a/trunk/drivers/iommu/tegra-gart.c b/trunk/drivers/iommu/tegra-gart.c index 0c0a37792218..779306ee7b16 100644 --- a/trunk/drivers/iommu/tegra-gart.c +++ b/trunk/drivers/iommu/tegra-gart.c @@ -29,17 +29,15 @@ #include #include #include -#include #include /* bitmap of the page sizes currently supported */ #define GART_IOMMU_PGSIZES (SZ_4K) -#define GART_REG_BASE 0x24 -#define GART_CONFIG (0x24 - GART_REG_BASE) -#define GART_ENTRY_ADDR (0x28 - GART_REG_BASE) -#define GART_ENTRY_DATA (0x2c - GART_REG_BASE) +#define GART_CONFIG 0x24 +#define GART_ENTRY_ADDR 0x28 +#define GART_ENTRY_DATA 0x2c #define GART_ENTRY_PHYS_ADDR_VALID (1 << 31) #define GART_PAGE_SHIFT 12 @@ -160,7 +158,7 @@ static int gart_iommu_attach_dev(struct iommu_domain *domain, struct gart_client *client, *c; int err = 0; - gart = gart_handle; + gart = dev_get_drvdata(dev->parent); if (!gart) return -EINVAL; domain->priv = gart; @@ -424,14 +422,6 @@ const struct dev_pm_ops tegra_gart_pm_ops = { .resume = tegra_gart_resume, }; -#ifdef CONFIG_OF -static struct of_device_id tegra_gart_of_match[] __devinitdata = { - { .compatible = "nvidia,tegra20-gart", }, - { }, -}; -MODULE_DEVICE_TABLE(of, tegra_gart_of_match); -#endif - static struct platform_driver tegra_gart_driver = { .probe = tegra_gart_probe, .remove = tegra_gart_remove, @@ -439,7 +429,6 @@ static struct platform_driver tegra_gart_driver = { .owner = THIS_MODULE, .name = "tegra-gart", .pm = &tegra_gart_pm_ops, - .of_match_table = of_match_ptr(tegra_gart_of_match), }, }; @@ -459,5 +448,4 @@ module_exit(tegra_gart_exit); MODULE_DESCRIPTION("IOMMU API for GART in Tegra20"); MODULE_AUTHOR("Hiroshi DOYU "); -MODULE_ALIAS("platform:tegra-gart"); MODULE_LICENSE("GPL v2"); diff --git a/trunk/drivers/iommu/tegra-smmu.c b/trunk/drivers/iommu/tegra-smmu.c index ecd679043d77..eb93c821f592 100644 --- a/trunk/drivers/iommu/tegra-smmu.c +++ b/trunk/drivers/iommu/tegra-smmu.c @@ -733,7 +733,7 @@ static int smmu_iommu_attach_dev(struct iommu_domain *domain, pr_info("Reserve \"page zero\" for AVP vectors using a common dummy\n"); } - dev_dbg(smmu->dev, "%s is attached\n", dev_name(dev)); + dev_dbg(smmu->dev, "%s is attached\n", dev_name(c->dev)); return 0; err_client: diff --git a/trunk/drivers/remoteproc/remoteproc_core.c b/trunk/drivers/remoteproc/remoteproc_core.c index 8ea7bccc7100..d6f8adaa26ef 100644 --- a/trunk/drivers/remoteproc/remoteproc_core.c +++ b/trunk/drivers/remoteproc/remoteproc_core.c @@ -78,7 +78,7 @@ typedef int (*rproc_handle_resource_t)(struct rproc *rproc, void *, int avail); * the recovery of the remote processor. */ static int rproc_iommu_fault(struct iommu_domain *domain, struct device *dev, - unsigned long iova, int flags, void *token) + unsigned long iova, int flags) { dev_err(dev, "iommu fault: da 0x%lx flags 0x%x\n", iova, flags); @@ -117,7 +117,7 @@ static int rproc_enable_iommu(struct rproc *rproc) return -ENOMEM; } - iommu_set_fault_handler(domain, rproc_iommu_fault, rproc); + iommu_set_fault_handler(domain, rproc_iommu_fault); ret = iommu_attach_device(domain, dev); if (ret) { diff --git a/trunk/drivers/watchdog/watchdog_dev.c b/trunk/drivers/watchdog/watchdog_dev.c index 8558da912c42..6c18a58cfd17 100644 --- a/trunk/drivers/watchdog/watchdog_dev.c +++ b/trunk/drivers/watchdog/watchdog_dev.c @@ -42,6 +42,8 @@ #include /* For __init/__exit/... */ #include /* For copy_to_user/put_user/... 
*/ +#include "watchdog_dev.h" + /* make sure we only register one /dev/watchdog device */ static unsigned long watchdog_dev_busy; /* the watchdog device behind /dev/watchdog */ diff --git a/trunk/fs/bio.c b/trunk/fs/bio.c index 73922abba832..84da88539046 100644 --- a/trunk/fs/bio.c +++ b/trunk/fs/bio.c @@ -19,14 +19,12 @@ #include #include #include -#include #include #include #include #include #include #include -#include #include /* for struct sg_iovec */ #include @@ -420,7 +418,6 @@ void bio_put(struct bio *bio) * last put frees it */ if (atomic_dec_and_test(&bio->bi_cnt)) { - bio_disassociate_task(bio); bio->bi_next = NULL; bio->bi_destructor(bio); } @@ -1649,64 +1646,6 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) } EXPORT_SYMBOL(bioset_create); -#ifdef CONFIG_BLK_CGROUP -/** - * bio_associate_current - associate a bio with %current - * @bio: target bio - * - * Associate @bio with %current if it hasn't been associated yet. Block - * layer will treat @bio as if it were issued by %current no matter which - * task actually issues it. - * - * This function takes an extra reference of @task's io_context and blkcg - * which will be put when @bio is released. The caller must own @bio, - * ensure %current->io_context exists, and is responsible for synchronizing - * calls to this function. - */ -int bio_associate_current(struct bio *bio) -{ - struct io_context *ioc; - struct cgroup_subsys_state *css; - - if (bio->bi_ioc) - return -EBUSY; - - ioc = current->io_context; - if (!ioc) - return -ENOENT; - - /* acquire active ref on @ioc and associate */ - get_io_context_active(ioc); - bio->bi_ioc = ioc; - - /* associate blkcg if exists */ - rcu_read_lock(); - css = task_subsys_state(current, blkio_subsys_id); - if (css && css_tryget(css)) - bio->bi_css = css; - rcu_read_unlock(); - - return 0; -} - -/** - * bio_disassociate_task - undo bio_associate_current() - * @bio: target bio - */ -void bio_disassociate_task(struct bio *bio) -{ - if (bio->bi_ioc) { - put_io_context(bio->bi_ioc); - bio->bi_ioc = NULL; - } - if (bio->bi_css) { - css_put(bio->bi_css); - bio->bi_css = NULL; - } -} - -#endif /* CONFIG_BLK_CGROUP */ - static void __init biovec_init_slabs(void) { int i; diff --git a/trunk/fs/ioprio.c b/trunk/fs/ioprio.c index e50170ca7c33..5e6dbe8958fc 100644 --- a/trunk/fs/ioprio.c +++ b/trunk/fs/ioprio.c @@ -50,7 +50,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio) ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); if (ioc) { - ioc->ioprio = ioprio; + ioc_ioprio_changed(ioc, ioprio); put_io_context(ioc); } diff --git a/trunk/fs/splice.c b/trunk/fs/splice.c index 406ef2b792c2..f8476841eb04 100644 --- a/trunk/fs/splice.c +++ b/trunk/fs/splice.c @@ -1388,7 +1388,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, */ static int get_iovec_page_array(const struct iovec __user *iov, unsigned int nr_vecs, struct page **pages, - struct partial_page *partial, bool aligned, + struct partial_page *partial, int aligned, unsigned int pipe_buffers) { int buffers = 0, error = 0; @@ -1626,7 +1626,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, return -ENOMEM; spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages, - spd.partial, false, + spd.partial, flags & SPLICE_F_GIFT, pipe->buffers); if (spd.nr_pages <= 0) ret = spd.nr_pages; diff --git a/trunk/include/linux/bio.h b/trunk/include/linux/bio.h index 26435890dc87..4d94eb8bcbcc 100644 --- a/trunk/include/linux/bio.h +++ b/trunk/include/linux/bio.h @@ 
-269,14 +269,6 @@ extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set extern void bvec_free_bs(struct bio_set *, struct bio_vec *, unsigned int); extern unsigned int bvec_nr_vecs(unsigned short idx); -#ifdef CONFIG_BLK_CGROUP -int bio_associate_current(struct bio *bio); -void bio_disassociate_task(struct bio *bio); -#else /* CONFIG_BLK_CGROUP */ -static inline int bio_associate_current(struct bio *bio) { return -ENOENT; } -static inline void bio_disassociate_task(struct bio *bio) { } -#endif /* CONFIG_BLK_CGROUP */ - /* * bio_set is used to allow other portions of the IO system to * allocate their own private memory pools for bio and iovec structures. diff --git a/trunk/include/linux/blk_types.h b/trunk/include/linux/blk_types.h index 0edb65dd8edd..4053cbd4490e 100644 --- a/trunk/include/linux/blk_types.h +++ b/trunk/include/linux/blk_types.h @@ -14,8 +14,6 @@ struct bio; struct bio_integrity_payload; struct page; struct block_device; -struct io_context; -struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *, int); typedef void (bio_destructor_t) (struct bio *); @@ -68,14 +66,6 @@ struct bio { bio_end_io_t *bi_end_io; void *bi_private; -#ifdef CONFIG_BLK_CGROUP - /* - * Optional ioc and css associated with this bio. Put on bio - * release. Read comment on top of bio_associate_current(). - */ - struct io_context *bi_ioc; - struct cgroup_subsys_state *bi_css; -#endif #if defined(CONFIG_BLK_DEV_INTEGRITY) struct bio_integrity_payload *bi_integrity; /* data integrity */ #endif diff --git a/trunk/include/linux/blkdev.h b/trunk/include/linux/blkdev.h index ba43f408baa3..4d4ac24a263e 100644 --- a/trunk/include/linux/blkdev.h +++ b/trunk/include/linux/blkdev.h @@ -32,17 +32,10 @@ struct blk_trace; struct request; struct sg_io_hdr; struct bsg_job; -struct blkcg_gq; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ -/* - * Maximum number of blkcg policies allowed to be registered concurrently. - * Defined here to simplify include dependency. 
- */ -#define BLKCG_MAX_POLS 2 - struct request; typedef void (rq_end_io_fn)(struct request *, int); @@ -370,11 +363,6 @@ struct request_queue { struct list_head timeout_list; struct list_head icq_list; -#ifdef CONFIG_BLK_CGROUP - DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS); - struct blkcg_gq *root_blkg; - struct list_head blkg_list; -#endif struct queue_limits limits; @@ -402,17 +390,12 @@ struct request_queue { struct mutex sysfs_lock; - int bypass_depth; - #if defined(CONFIG_BLK_DEV_BSG) bsg_job_fn *bsg_job_fn; int bsg_job_size; struct bsg_class_device bsg_dev; #endif -#ifdef CONFIG_BLK_CGROUP - struct list_head all_q_node; -#endif #ifdef CONFIG_BLK_DEV_THROTTLING /* Throttle data */ struct throtl_data *td; @@ -424,7 +407,7 @@ struct request_queue { #define QUEUE_FLAG_SYNCFULL 3 /* read queue has been filled */ #define QUEUE_FLAG_ASYNCFULL 4 /* write queue has been filled */ #define QUEUE_FLAG_DEAD 5 /* queue being torn down */ -#define QUEUE_FLAG_BYPASS 6 /* act as dumb FIFO queue */ +#define QUEUE_FLAG_ELVSWITCH 6 /* don't use elevator, just do FIFO */ #define QUEUE_FLAG_BIDI 7 /* queue supports bidi requests */ #define QUEUE_FLAG_NOMERGES 8 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 9 /* complete on same CPU-group */ @@ -508,7 +491,6 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags) -#define blk_queue_bypass(q) test_bit(QUEUE_FLAG_BYPASS, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_noxmerges(q) \ test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) diff --git a/trunk/include/linux/drbd.h b/trunk/include/linux/drbd.h index 47e3d4850584..9e5f5607eba3 100644 --- a/trunk/include/linux/drbd.h +++ b/trunk/include/linux/drbd.h @@ -53,7 +53,7 @@ extern const char *drbd_buildtag(void); -#define REL_VERSION "8.3.13" +#define REL_VERSION "8.3.11" #define API_VERSION 88 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 96 @@ -112,8 +112,8 @@ enum drbd_ret_code { ERR_OPEN_MD_DISK = 105, ERR_DISK_NOT_BDEV = 107, ERR_MD_NOT_BDEV = 108, - ERR_DISK_TOO_SMALL = 111, - ERR_MD_DISK_TOO_SMALL = 112, + ERR_DISK_TO_SMALL = 111, + ERR_MD_DISK_TO_SMALL = 112, ERR_BDCLAIM_DISK = 114, ERR_BDCLAIM_MD_DISK = 115, ERR_MD_IDX_INVALID = 116, diff --git a/trunk/include/linux/drbd_limits.h b/trunk/include/linux/drbd_limits.h index fb670bf603f7..447c36752385 100644 --- a/trunk/include/linux/drbd_limits.h +++ b/trunk/include/linux/drbd_limits.h @@ -48,11 +48,6 @@ #define DRBD_TIMEOUT_MAX 600 #define DRBD_TIMEOUT_DEF 60 /* 6 seconds */ - /* If backing disk takes longer than disk_timeout, mark the disk as failed */ -#define DRBD_DISK_TIMEOUT_MIN 0 /* 0 = disabled */ -#define DRBD_DISK_TIMEOUT_MAX 6000 /* 10 Minutes */ -#define DRBD_DISK_TIMEOUT_DEF 0 /* disabled */ - /* active connection retries when C_WF_CONNECTION */ #define DRBD_CONNECT_INT_MIN 1 #define DRBD_CONNECT_INT_MAX 120 @@ -65,7 +60,7 @@ /* timeout for the ping packets.*/ #define DRBD_PING_TIMEO_MIN 1 -#define DRBD_PING_TIMEO_MAX 300 +#define DRBD_PING_TIMEO_MAX 100 #define DRBD_PING_TIMEO_DEF 5 /* max number of write requests between write barriers */ diff --git a/trunk/include/linux/drbd_nl.h b/trunk/include/linux/drbd_nl.h index a8706f08ab36..ab6159e4fcf0 100644 --- a/trunk/include/linux/drbd_nl.h +++ 
b/trunk/include/linux/drbd_nl.h @@ -31,12 +31,9 @@ NL_PACKET(disk_conf, 3, NL_INTEGER( 56, T_MAY_IGNORE, max_bio_bvecs) NL_BIT( 57, T_MAY_IGNORE, no_disk_barrier) NL_BIT( 58, T_MAY_IGNORE, no_disk_drain) - NL_INTEGER( 89, T_MAY_IGNORE, disk_timeout) ) -NL_PACKET(detach, 4, - NL_BIT( 88, T_MANDATORY, detach_force) -) +NL_PACKET(detach, 4, ) NL_PACKET(net_conf, 5, NL_STRING( 8, T_MANDATORY, my_addr, 128) diff --git a/trunk/include/linux/elevator.h b/trunk/include/linux/elevator.h index c03af7687bb4..7d4e0356f329 100644 --- a/trunk/include/linux/elevator.h +++ b/trunk/include/linux/elevator.h @@ -28,13 +28,12 @@ typedef int (elevator_may_queue_fn) (struct request_queue *, int); typedef void (elevator_init_icq_fn) (struct io_cq *); typedef void (elevator_exit_icq_fn) (struct io_cq *); -typedef int (elevator_set_req_fn) (struct request_queue *, struct request *, - struct bio *, gfp_t); +typedef int (elevator_set_req_fn) (struct request_queue *, struct request *, gfp_t); typedef void (elevator_put_req_fn) (struct request *); typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *); typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *); -typedef int (elevator_init_fn) (struct request_queue *); +typedef void *(elevator_init_fn) (struct request_queue *); typedef void (elevator_exit_fn) (struct elevator_queue *); struct elevator_ops @@ -130,8 +129,7 @@ extern void elv_unregister_queue(struct request_queue *q); extern int elv_may_queue(struct request_queue *, int); extern void elv_abort_queue(struct request_queue *); extern void elv_completed_request(struct request_queue *, struct request *); -extern int elv_set_request(struct request_queue *q, struct request *rq, - struct bio *bio, gfp_t gfp_mask); +extern int elv_set_request(struct request_queue *, struct request *, gfp_t); extern void elv_put_request(struct request_queue *, struct request *); extern void elv_drain_elevator(struct request_queue *); diff --git a/trunk/include/linux/iocontext.h b/trunk/include/linux/iocontext.h index df38db2ef45b..1a3018063034 100644 --- a/trunk/include/linux/iocontext.h +++ b/trunk/include/linux/iocontext.h @@ -6,7 +6,11 @@ #include enum { + ICQ_IOPRIO_CHANGED = 1 << 0, + ICQ_CGROUP_CHANGED = 1 << 1, ICQ_EXITED = 1 << 2, + + ICQ_CHANGED_MASK = ICQ_IOPRIO_CHANGED | ICQ_CGROUP_CHANGED, }; /* @@ -96,7 +100,6 @@ struct io_cq { */ struct io_context { atomic_long_t refcount; - atomic_t active_ref; atomic_t nr_tasks; /* all the fields below are protected by this lock */ @@ -117,37 +120,29 @@ struct io_context { struct work_struct release_work; }; -/** - * get_io_context_active - get active reference on ioc - * @ioc: ioc of interest - * - * Only iocs with active reference can issue new IOs. This function - * acquires an active reference on @ioc. The caller must already have an - * active reference on @ioc. - */ -static inline void get_io_context_active(struct io_context *ioc) -{ - WARN_ON_ONCE(atomic_long_read(&ioc->refcount) <= 0); - WARN_ON_ONCE(atomic_read(&ioc->active_ref) <= 0); - atomic_long_inc(&ioc->refcount); - atomic_inc(&ioc->active_ref); -} - -static inline void ioc_task_link(struct io_context *ioc) +static inline struct io_context *ioc_task_link(struct io_context *ioc) { - get_io_context_active(ioc); + /* + * if ref count is zero, don't allow sharing (ioc is going away, it's + * a race). 
+	 */
+	if (ioc && atomic_long_inc_not_zero(&ioc->refcount)) {
+		atomic_inc(&ioc->nr_tasks);
+		return ioc;
+	}
 
-	WARN_ON_ONCE(atomic_read(&ioc->nr_tasks) <= 0);
-	atomic_inc(&ioc->nr_tasks);
+	return NULL;
 }
 
 struct task_struct;
 #ifdef CONFIG_BLOCK
 void put_io_context(struct io_context *ioc);
-void put_io_context_active(struct io_context *ioc);
 void exit_io_context(struct task_struct *task);
 struct io_context *get_task_io_context(struct task_struct *task,
 				       gfp_t gfp_flags, int node);
+void ioc_ioprio_changed(struct io_context *ioc, int ioprio);
+void ioc_cgroup_changed(struct io_context *ioc);
+unsigned int icq_get_changed(struct io_cq *icq);
 #else
 struct io_context;
 static inline void put_io_context(struct io_context *ioc) { }
diff --git a/trunk/include/linux/iommu.h b/trunk/include/linux/iommu.h
index 450293f6d68b..d937580417ba 100644
--- a/trunk/include/linux/iommu.h
+++ b/trunk/include/linux/iommu.h
@@ -35,13 +35,12 @@ struct iommu_domain;
 #define IOMMU_FAULT_WRITE	0x1
 
 typedef int (*iommu_fault_handler_t)(struct iommu_domain *,
-			struct device *, unsigned long, int, void *);
+			struct device *, unsigned long, int);
 
 struct iommu_domain {
 	struct iommu_ops *ops;
 	void *priv;
 	iommu_fault_handler_t handler;
-	void *handler_token;
 };
 
 #define IOMMU_CAP_CACHE_COHERENCY	0x1
@@ -96,7 +95,7 @@ extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain,
 extern int iommu_domain_has_cap(struct iommu_domain *domain,
 				unsigned long cap);
 extern void iommu_set_fault_handler(struct iommu_domain *domain,
-			iommu_fault_handler_t handler, void *token);
+			iommu_fault_handler_t handler);
 extern int iommu_device_group(struct device *dev, unsigned int *groupid);
 
 /**
@@ -133,8 +132,7 @@ static inline int report_iommu_fault(struct iommu_domain *domain,
 	 * invoke it.
 	 */
 	if (domain->handler)
-		ret = domain->handler(domain, dev, iova, flags,
-						domain->handler_token);
+		ret = domain->handler(domain, dev, iova, flags);
 
 	return ret;
 }
@@ -193,7 +191,7 @@ static inline int domain_has_cap(struct iommu_domain *domain,
 }
 
 static inline void iommu_set_fault_handler(struct iommu_domain *domain,
-			iommu_fault_handler_t handler, void *token)
+			iommu_fault_handler_t handler)
 {
 }
 
diff --git a/trunk/include/linux/ioprio.h b/trunk/include/linux/ioprio.h
index beb9ce1c2c23..76dad4808847 100644
--- a/trunk/include/linux/ioprio.h
+++ b/trunk/include/linux/ioprio.h
@@ -41,15 +41,27 @@ enum {
 	IOPRIO_WHO_USER,
 };
 
-/*
- * Fallback BE priority
- */
-#define IOPRIO_NORM	(4)
-
 /*
  * if process has set io priority explicitly, use that. if not, convert
  * the cpu scheduler nice value to an io priority
  */
+#define IOPRIO_NORM	(4)
+static inline int task_ioprio(struct io_context *ioc)
+{
+	if (ioprio_valid(ioc->ioprio))
+		return IOPRIO_PRIO_DATA(ioc->ioprio);
+
+	return IOPRIO_NORM;
+}
+
+static inline int task_ioprio_class(struct io_context *ioc)
+{
+	if (ioprio_valid(ioc->ioprio))
+		return IOPRIO_PRIO_CLASS(ioc->ioprio);
+
+	return IOPRIO_CLASS_BE;
+}
+
 static inline int task_nice_ioprio(struct task_struct *task)
 {
 	return (task_nice(task) + 20) / 5;
diff --git a/trunk/init/Kconfig b/trunk/init/Kconfig
index 1e004d057468..81816b82860b 100644
--- a/trunk/init/Kconfig
+++ b/trunk/init/Kconfig
@@ -803,7 +803,7 @@ config RT_GROUP_SCHED
 endif #CGROUP_SCHED
 
 config BLK_CGROUP
-	bool "Block IO controller"
+	tristate "Block IO controller"
 	depends on BLOCK
 	default n
 	---help---
diff --git a/trunk/kernel/fork.c b/trunk/kernel/fork.c
index 31a32c7dd169..017fb23d5983 100644
--- a/trunk/kernel/fork.c
+++ b/trunk/kernel/fork.c
@@ -976,8 +976,9 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
 	 * Share io context with parent, if CLONE_IO is set
 	 */
 	if (clone_flags & CLONE_IO) {
-		ioc_task_link(ioc);
-		tsk->io_context = ioc;
+		tsk->io_context = ioc_task_link(ioc);
+		if (unlikely(!tsk->io_context))
+			return -ENOMEM;
 	} else if (ioprio_valid(ioc->ioprio)) {
 		new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
 		if (unlikely(!new_ioc))
diff --git a/trunk/lib/dma-debug.c b/trunk/lib/dma-debug.c
index 518aea714d21..13ef2338be41 100644
--- a/trunk/lib/dma-debug.c
+++ b/trunk/lib/dma-debug.c
@@ -430,7 +430,7 @@ static struct dma_debug_entry *__dma_entry_alloc(void)
  */
 static struct dma_debug_entry *dma_entry_alloc(void)
 {
-	struct dma_debug_entry *entry;
+	struct dma_debug_entry *entry = NULL;
 	unsigned long flags;
 
 	spin_lock_irqsave(&free_entries_lock, flags);
@@ -438,14 +438,11 @@ static struct dma_debug_entry *dma_entry_alloc(void)
 	if (list_empty(&free_entries)) {
 		pr_err("DMA-API: debugging out of memory - disabling\n");
 		global_disable = true;
-		spin_unlock_irqrestore(&free_entries_lock, flags);
-		return NULL;
+		goto out;
 	}
 
 	entry = __dma_entry_alloc();
 
-	spin_unlock_irqrestore(&free_entries_lock, flags);
-
 #ifdef CONFIG_STACKTRACE
 	entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES;
 	entry->stacktrace.entries = entry->st_entries;
@@ -453,6 +450,9 @@ static struct dma_debug_entry *dma_entry_alloc(void)
 	save_stack_trace(&entry->stacktrace);
 #endif
 
+out:
+	spin_unlock_irqrestore(&free_entries_lock, flags);
+
 	return entry;
 }
diff --git a/trunk/mm/hugetlb.c b/trunk/mm/hugetlb.c
index e198831276a3..285a81e87ec8 100644
--- a/trunk/mm/hugetlb.c
+++ b/trunk/mm/hugetlb.c
@@ -3036,8 +3036,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 	region_add(&inode->i_mapping->private_list, from, to);
 	return 0;
 out_err:
-	if (vma)
-		resv_map_put(vma);
+	resv_map_put(vma);
 	return ret;
 }
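
The floppy_open()/floppy_release() hunks above bring back the driver's old
fd_ref convention: a positive count tracks shared openers, while -1 marks a
single exclusive (FMODE_EXCL) opener. A minimal user-space sketch of that
convention; fd_open()/fd_release() and the demo in main() are illustrative,
not the driver's API:

#include <errno.h>
#include <stdio.h>

/*
 * fd_ref > 0:   number of shared openers
 * fd_ref == -1: a single exclusive opener holds the drive
 */
static int fd_ref;

static int fd_open(int excl)
{
	/* refuse if an exclusive opener exists, or if excl and anyone does */
	if (fd_ref == -1 || (fd_ref && excl))
		return -EBUSY;
	if (excl)
		fd_ref = -1;
	else
		fd_ref++;
	return 0;
}

static void fd_release(void)
{
	if (fd_ref < 0) {		/* exclusive opener leaving */
		fd_ref = 0;
	} else if (!fd_ref--) {		/* underflow guard, as in floppy_release() */
		fprintf(stderr, "release with fd_ref == 0\n");
		fd_ref = 0;
	}
}

int main(void)
{
	printf("shared open: %d\n", fd_open(0));	/* 0 */
	printf("excl open:   %d\n", fd_open(1));	/* -EBUSY: already shared */
	fd_release();
	printf("excl open:   %d\n", fd_open(1));	/* 0: drive now exclusive */
	return 0;
}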
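The xen-blkfront hunk drops encode_disk_name() in favour of open-coded
sprintf() calls. The deleted helper implements bijective base-26 naming, so
xvdz is followed by xvdaa rather than wrapping. A standalone sketch built
around the removed function; the main() demo loop is added for illustration:

#include <stdio.h>

/* verbatim body of the helper removed by this patch */
static char *encode_disk_name(char *ptr, unsigned int n)
{
	if (n >= 26)
		ptr = encode_disk_name(ptr, n / 26 - 1);
	*ptr = 'a' + n % 26;
	return ptr + 1;
}

int main(void)
{
	char name[8];
	unsigned int i;

	for (i = 24; i <= 28; i++) {
		*encode_disk_name(name, i) = '\0';	/* terminate past last char */
		printf("%u -> xvd%s\n", i, name);	/* y, z, aa, ab, ac */
	}
	return 0;
}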
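radeon_get_ib_value(), moved verbatim between radeon_cs.c and radeon_ring.c
above, is a two-slot cache over the command-stream chunk pages: a dword index
splits into a page number and an offset, both cached pages are checked, and a
miss triggers a refill. A compressed sketch; update_pages() is a stub standing
in for radeon_cs_update_pages(), which really maps and copies the page:

#include <stdio.h>

#define PAGE_SIZE 4096u

static unsigned int kpage_idx[2] = { ~0u, ~0u };	/* nothing cached yet */
static unsigned int kpage[2][PAGE_SIZE / 4];

static int update_pages(unsigned int pg_idx)	/* stub for radeon_cs_update_pages() */
{
	static int next;

	next ^= 1;			/* alternate between the two slots */
	kpage_idx[next] = pg_idx;	/* a real driver copies the page in here */
	return next;
}

static unsigned int get_ib_value(unsigned int idx)
{
	unsigned int pg_idx = (idx * 4) / PAGE_SIZE;
	unsigned int pg_offset = (idx * 4) % PAGE_SIZE;

	if (kpage_idx[0] == pg_idx)		/* hit in slot 0 */
		return kpage[0][pg_offset / 4];
	if (kpage_idx[1] == pg_idx)		/* hit in slot 1 */
		return kpage[1][pg_offset / 4];

	return kpage[update_pages(pg_idx)][pg_offset / 4];	/* miss: refill */
}

int main(void)
{
	printf("%u\n", get_ib_value(0));	/* faults page 0 in, then reads */
	printf("%u\n", get_ib_value(1));	/* served from the cached slot */
	return 0;
}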
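The IOMMU hunks remove the void *token cookie from iommu_set_fault_handler()
and from the fault-handler signature, which is why remoteproc_core.c loses the
rproc argument in the same patch. The callback-plus-opaque-token pattern being
removed looks like this in a minimal user-space sketch; every name here is
illustrative, not the kernel API:

#include <stdio.h>

typedef int (*fault_handler_t)(unsigned long iova, int flags, void *token);

struct domain {
	fault_handler_t handler;
	void *handler_token;	/* opaque caller context, passed back verbatim */
};

static void set_fault_handler(struct domain *d, fault_handler_t h, void *token)
{
	d->handler = h;
	d->handler_token = token;
}

static int report_fault(struct domain *d, unsigned long iova, int flags)
{
	return d->handler ? d->handler(iova, flags, d->handler_token) : -1;
}

static int my_handler(unsigned long iova, int flags, void *token)
{
	printf("fault at 0x%lx, flags %d (ctx %p)\n", iova, flags, token);
	return 0;
}

int main(void)
{
	struct domain d = { 0 };
	int ctx = 42;

	set_fault_handler(&d, my_handler, &ctx);
	return report_fault(&d, 0x1000, 1);
}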
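The iocontext.h and fork.c hunks go back to ioc_task_link() returning NULL
when the reference count has already dropped to zero, because the io_context
is then on its way to being freed and must not be shared with the child. The
underlying idiom is a compare-and-swap "increment unless zero"; a C11 sketch
of just that idiom, not the kernel's atomic_long_inc_not_zero():

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool inc_not_zero(atomic_long *ref)
{
	long old = atomic_load(ref);

	while (old != 0) {
		/* on failure, 'old' is reloaded with the current value */
		if (atomic_compare_exchange_weak(ref, &old, old + 1))
			return true;	/* reference taken */
	}
	return false;			/* object is already being torn down */
}

int main(void)
{
	atomic_long live = 1, dying = 0;

	printf("live:  %d\n", inc_not_zero(&live));	/* 1: count is now 2 */
	printf("dying: %d\n", inc_not_zero(&dying));	/* 0: no reference taken */
	return 0;
}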
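task_nice_ioprio(), kept intact in the ioprio.h hunk, folds the scheduler nice
range [-20, 19] onto the eight best-effort I/O priority levels [0, 7] with
(nice + 20) / 5. A quick check of the endpoints and a couple of interior
points:

#include <stdio.h>

int main(void)
{
	for (int nice = -20; nice <= 19; nice += 13)
		printf("nice %3d -> ioprio %d\n", nice, (nice + 20) / 5);
	return 0;	/* prints levels 0, 2, 5, 7 */
}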
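Finally, the dma_entry_alloc() hunk restores a single-exit locking shape:
every path, including the out-of-memory failure path, leaves through one
label that drops the lock exactly once. The same shape in portable C with a
pthread mutex; alloc_entry() and the one-entry pool are illustrative stand-ins:

#include <pthread.h>
#include <stddef.h>

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static int pool_entries = 1;

static int *alloc_entry(void)
{
	static int entry_storage;
	int *entry = NULL;	/* NULL doubles as the failure result */

	pthread_mutex_lock(&pool_lock);
	if (pool_entries == 0)
		goto out;	/* failure path: no early unlock-and-return */
	pool_entries--;
	entry = &entry_storage;
out:
	pthread_mutex_unlock(&pool_lock);	/* exactly one unlock site */
	return entry;
}

int main(void)
{
	/* first allocation succeeds, second finds the pool empty */
	return alloc_entry() && !alloc_entry() ? 0 : 1;
}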