From 0c80d63b83db4c0b8d680cd7e33f1d8f86dc5b7d Mon Sep 17 00:00:00 2001 From: Ivo van Doorn Date: Fri, 11 May 2007 15:59:40 -0400 Subject: [PATCH] --- yaml --- r: 58337 b: refs/heads/master c: 9467d64b0e88763914c01f71ddf591b166c4f526 h: refs/heads/master i: 58335: f5a0524658e9761d2119de07e91198af66b75ef1 v: v3 --- [refs] | 2 +- trunk/Documentation/kernel-parameters.txt | 43 + trunk/Documentation/sched-design-CFS.txt | 119 - trunk/arch/i386/kernel/smpboot.c | 12 + trunk/arch/i386/kernel/tsc.c | 9 +- trunk/arch/ia64/kernel/setup.c | 6 + trunk/arch/mips/kernel/smp.c | 11 + trunk/arch/sparc/kernel/smp.c | 10 + trunk/arch/sparc64/kernel/smp.c | 27 + trunk/drivers/ide/arm/icside.c | 16 +- trunk/drivers/ide/cris/ide-cris.c | 2 +- trunk/drivers/ide/ide-cd.c | 6 +- trunk/drivers/ide/ide-cd.h | 2 + trunk/drivers/ide/ide-disk.c | 8 +- trunk/drivers/ide/ide-dma.c | 110 +- trunk/drivers/ide/ide-io.c | 4 +- trunk/drivers/ide/ide-iops.c | 8 +- trunk/drivers/ide/ide-probe.c | 10 +- trunk/drivers/ide/ide-proc.c | 34 +- trunk/drivers/ide/ide-timing.h | 56 +- trunk/drivers/ide/ide.c | 33 +- trunk/drivers/ide/legacy/hd.c | 2 +- trunk/drivers/ide/legacy/macide.c | 14 + trunk/drivers/ide/mips/au1xxx-ide.c | 24 +- trunk/drivers/ide/pci/aec62xx.c | 119 +- trunk/drivers/ide/pci/alim15x3.c | 78 +- trunk/drivers/ide/pci/amd74xx.c | 127 +- trunk/drivers/ide/pci/atiixp.c | 5 +- trunk/drivers/ide/pci/cmd64x.c | 130 +- trunk/drivers/ide/pci/cs5535.c | 6 +- trunk/drivers/ide/pci/hpt366.c | 170 +- trunk/drivers/ide/pci/it8213.c | 8 +- trunk/drivers/ide/pci/it821x.c | 9 +- trunk/drivers/ide/pci/jmicron.c | 20 +- trunk/drivers/ide/pci/pdc202xx_new.c | 9 +- trunk/drivers/ide/pci/pdc202xx_old.c | 35 +- trunk/drivers/ide/pci/piix.c | 45 +- trunk/drivers/ide/pci/scc_pata.c | 2 +- trunk/drivers/ide/pci/serverworks.c | 103 +- trunk/drivers/ide/pci/sgiioc4.c | 20 +- trunk/drivers/ide/pci/siimage.c | 18 +- trunk/drivers/ide/pci/sis5513.c | 34 +- trunk/drivers/ide/pci/sl82c105.c | 20 +- trunk/drivers/ide/pci/slc90e66.c | 5 +- trunk/drivers/ide/pci/tc86c001.c | 4 +- trunk/drivers/ide/pci/via82cxxx.c | 175 +- trunk/drivers/ide/ppc/pmac.c | 42 +- trunk/drivers/misc/Kconfig | 6 +- trunk/drivers/misc/Makefile | 1 + trunk/drivers/misc/eeprom_93cx6.c | 229 ++ trunk/fs/jfs/endian24.h | 2 +- trunk/fs/jfs/jfs_debug.c | 28 + trunk/fs/jfs/jfs_debug.h | 2 + trunk/fs/jfs/jfs_dinode.h | 42 +- trunk/fs/jfs/jfs_dmap.c | 419 +-- trunk/fs/jfs/jfs_dmap.h | 118 +- trunk/fs/jfs/jfs_dtree.c | 105 +- trunk/fs/jfs/jfs_dtree.h | 2 +- trunk/fs/jfs/jfs_extent.c | 102 +- trunk/fs/jfs/jfs_filsys.h | 13 +- trunk/fs/jfs/jfs_imap.c | 296 +- trunk/fs/jfs/jfs_imap.h | 98 +- trunk/fs/jfs/jfs_incore.h | 4 +- trunk/fs/jfs/jfs_logmgr.c | 90 +- trunk/fs/jfs/jfs_logmgr.h | 26 +- trunk/fs/jfs/jfs_metapage.c | 3 +- trunk/fs/jfs/jfs_mount.c | 6 +- trunk/fs/jfs/jfs_txnmgr.c | 302 +- trunk/fs/jfs/jfs_txnmgr.h | 2 +- trunk/fs/jfs/jfs_types.h | 20 +- trunk/fs/jfs/jfs_umount.c | 2 +- trunk/fs/jfs/jfs_xtree.c | 428 +-- trunk/fs/jfs/jfs_xtree.h | 48 +- trunk/fs/jfs/namei.c | 26 +- trunk/fs/jfs/resize.c | 48 +- trunk/fs/jfs/xattr.c | 9 +- trunk/fs/proc/array.c | 59 +- trunk/fs/proc/base.c | 71 +- trunk/include/asm-generic/bitops/sched.h | 21 +- .../include/asm-mips/mach-au1x00/au1xxx_ide.h | 28 +- trunk/include/linux/eeprom_93cx6.h | 72 + trunk/include/linux/hardirq.h | 13 - trunk/include/linux/ide.h | 18 +- trunk/include/linux/sched.h | 251 +- trunk/include/linux/topology.h | 12 +- trunk/include/linux/wait.h | 16 +- trunk/init/main.c | 5 +- trunk/kernel/delayacct.c | 10 +- 
trunk/kernel/exit.c | 5 +- trunk/kernel/fork.c | 4 +- trunk/kernel/posix-cpu-timers.c | 34 +- trunk/kernel/sched.c | 3023 ++++++++++------- trunk/kernel/sched_debug.c | 275 -- trunk/kernel/sched_fair.c | 1131 ------ trunk/kernel/sched_idletask.c | 71 - trunk/kernel/sched_rt.c | 255 -- trunk/kernel/sched_stats.h | 235 -- trunk/kernel/softirq.c | 1 + trunk/kernel/sysctl.c | 80 - trunk/lib/Kconfig.debug | 9 - 100 files changed, 4407 insertions(+), 5521 deletions(-) delete mode 100644 trunk/Documentation/sched-design-CFS.txt create mode 100644 trunk/drivers/misc/eeprom_93cx6.c create mode 100644 trunk/include/linux/eeprom_93cx6.h delete mode 100644 trunk/kernel/sched_debug.c delete mode 100644 trunk/kernel/sched_fair.c delete mode 100644 trunk/kernel/sched_idletask.c delete mode 100644 trunk/kernel/sched_rt.c delete mode 100644 trunk/kernel/sched_stats.h diff --git a/[refs] b/[refs] index cd4f9270cd1a..6d1239337479 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: 27a278aa4309df244a2619f47031acce00ca1b7c +refs/heads/master: 9467d64b0e88763914c01f71ddf591b166c4f526 diff --git a/trunk/Documentation/kernel-parameters.txt b/trunk/Documentation/kernel-parameters.txt index 4d880b3d1f35..af50f9bbe68e 100644 --- a/trunk/Documentation/kernel-parameters.txt +++ b/trunk/Documentation/kernel-parameters.txt @@ -1014,6 +1014,49 @@ and is between 256 and 4096 characters. It is defined in the file mga= [HW,DRM] + migration_cost= + [KNL,SMP] debug: override scheduler migration costs + Format: ,,... + This debugging option can be used to override the + default scheduler migration cost matrix. The numbers + are indexed by 'CPU domain distance'. + E.g. migration_cost=1000,2000,3000 on an SMT NUMA + box will set up an intra-core migration cost of + 1 msec, an inter-core migration cost of 2 msecs, + and an inter-node migration cost of 3 msecs. + + WARNING: using the wrong values here can break + scheduler performance, so it's only for scheduler + development purposes, not production environments. + + migration_debug= + [KNL,SMP] migration cost auto-detect verbosity + Format=<0|1|2> + If a system's migration matrix reported at bootup + seems erroneous then this option can be used to + increase verbosity of the detection process. + We default to 0 (no extra messages), 1 will print + some more information, and 2 will be really + verbose (probably only useful if you also have a + serial console attached to the system). + + migration_factor= + [KNL,SMP] multiply/divide migration costs by a factor + Format= + This debug option can be used to proportionally + increase or decrease the auto-detected migration + costs for all entries of the migration matrix. + E.g. migration_factor=150 will increase migration + costs by 50%. (and thus the scheduler will be less + eager migrating cache-hot tasks) + migration_factor=80 will decrease migration costs + by 20%. (thus the scheduler will be more eager to + migrate tasks) + + WARNING: using the wrong values here can break + scheduler performance, so it's only for scheduler + development purposes, not production environments. + mousedev.tap_time= [MOUSE] Maximum time between finger touching and leaving touchpad surface for touch to be considered diff --git a/trunk/Documentation/sched-design-CFS.txt b/trunk/Documentation/sched-design-CFS.txt deleted file mode 100644 index 16feebb7bdc0..000000000000 --- a/trunk/Documentation/sched-design-CFS.txt +++ /dev/null @@ -1,119 +0,0 @@ - -This is the CFS scheduler. 
- -80% of CFS's design can be summed up in a single sentence: CFS basically -models an "ideal, precise multi-tasking CPU" on real hardware. - -"Ideal multi-tasking CPU" is a (non-existent :-)) CPU that has 100% -physical power and which can run each task at precise equal speed, in -parallel, each at 1/nr_running speed. For example: if there are 2 tasks -running then it runs each at 50% physical power - totally in parallel. - -On real hardware, we can run only a single task at once, so while that -one task runs, the other tasks that are waiting for the CPU are at a -disadvantage - the current task gets an unfair amount of CPU time. In -CFS this fairness imbalance is expressed and tracked via the per-task -p->wait_runtime (nanosec-unit) value. "wait_runtime" is the amount of -time the task should now run on the CPU for it to become completely fair -and balanced. - -( small detail: on 'ideal' hardware, the p->wait_runtime value would - always be zero - no task would ever get 'out of balance' from the - 'ideal' share of CPU time. ) - -CFS's task picking logic is based on this p->wait_runtime value and it -is thus very simple: it always tries to run the task with the largest -p->wait_runtime value. In other words, CFS tries to run the task with -the 'gravest need' for more CPU time. So CFS always tries to split up -CPU time between runnable tasks as close to 'ideal multitasking -hardware' as possible. - -Most of the rest of CFS's design just falls out of this really simple -concept, with a few add-on embellishments like nice levels, -multiprocessing and various algorithm variants to recognize sleepers. - -In practice it works like this: the system runs a task a bit, and when -the task schedules (or a scheduler tick happens) the task's CPU usage is -'accounted for': the (small) time it just spent using the physical CPU -is deducted from p->wait_runtime. [minus the 'fair share' it would have -gotten anyway]. Once p->wait_runtime gets low enough so that another -task becomes the 'leftmost task' of the time-ordered rbtree it maintains -(plus a small amount of 'granularity' distance relative to the leftmost -task so that we do not over-schedule tasks and trash the cache) then the -new leftmost task is picked and the current task is preempted. - -The rq->fair_clock value tracks the 'CPU time a runnable task would have -fairly gotten, had it been runnable during that time'. So by using -rq->fair_clock values we can accurately timestamp and measure the -'expected CPU time' a task should have gotten. All runnable tasks are -sorted in the rbtree by the "rq->fair_clock - p->wait_runtime" key, and -CFS picks the 'leftmost' task and sticks to it. As the system progresses -forwards, newly woken tasks are put into the tree more and more to the -right - slowly but surely giving a chance for every task to become the -'leftmost task' and thus get on the CPU within a deterministic amount of -time. - -Some implementation details: - - - the introduction of Scheduling Classes: an extensible hierarchy of - scheduler modules. These modules encapsulate scheduling policy - details and are handled by the scheduler core without the core - code assuming about them too much. - - - sched_fair.c implements the 'CFS desktop scheduler': it is a - replacement for the vanilla scheduler's SCHED_OTHER interactivity - code. - - I'd like to give credit to Con Kolivas for the general approach here: - he has proven via RSDL/SD that 'fair scheduling' is possible and that - it results in better desktop scheduling. Kudos Con! 
- - The CFS patch uses a completely different approach and implementation - from RSDL/SD. My goal was to make CFS's interactivity quality exceed - that of RSDL/SD, which is a high standard to meet :-) Testing - feedback is welcome to decide this one way or another. [ and, in any - case, all of SD's logic could be added via a kernel/sched_sd.c module - as well, if Con is interested in such an approach. ] - - CFS's design is quite radical: it does not use runqueues, it uses a - time-ordered rbtree to build a 'timeline' of future task execution, - and thus has no 'array switch' artifacts (by which both the vanilla - scheduler and RSDL/SD are affected). - - CFS uses nanosecond granularity accounting and does not rely on any - jiffies or other HZ detail. Thus the CFS scheduler has no notion of - 'timeslices' and has no heuristics whatsoever. There is only one - central tunable: - - /proc/sys/kernel/sched_granularity_ns - - which can be used to tune the scheduler from 'desktop' (low - latencies) to 'server' (good batching) workloads. It defaults to a - setting suitable for desktop workloads. SCHED_BATCH is handled by the - CFS scheduler module too. - - Due to its design, the CFS scheduler is not prone to any of the - 'attacks' that exist today against the heuristics of the stock - scheduler: fiftyp.c, thud.c, chew.c, ring-test.c, massive_intr.c all - work fine and do not impact interactivity and produce the expected - behavior. - - the CFS scheduler has a much stronger handling of nice levels and - SCHED_BATCH: both types of workloads should be isolated much more - agressively than under the vanilla scheduler. - - ( another detail: due to nanosec accounting and timeline sorting, - sched_yield() support is very simple under CFS, and in fact under - CFS sched_yield() behaves much better than under any other - scheduler i have tested so far. ) - - - sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler - way than the vanilla scheduler does. It uses 100 runqueues (for all - 100 RT priority levels, instead of 140 in the vanilla scheduler) - and it needs no expired array. - - - reworked/sanitized SMP load-balancing: the runqueue-walking - assumptions are gone from the load-balancing code now, and - iterators of the scheduling modules are used. The balancing code got - quite a bit simpler as a result. - diff --git a/trunk/arch/i386/kernel/smpboot.c b/trunk/arch/i386/kernel/smpboot.c index 0b2954534b8e..88baed1e7e83 100644 --- a/trunk/arch/i386/kernel/smpboot.c +++ b/trunk/arch/i386/kernel/smpboot.c @@ -941,6 +941,17 @@ static int __cpuinit __smp_prepare_cpu(int cpu) } #endif +static void smp_tune_scheduling(void) +{ + if (cpu_khz) { + /* cache size in kB */ + long cachesize = boot_cpu_data.x86_cache_size; + + if (cachesize > 0) + max_cache_size = cachesize * 1024; + } +} + /* * Cycle through the processors sending APIC IPIs to boot each. */ @@ -969,6 +980,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus) x86_cpu_to_apicid[0] = boot_cpu_physical_apicid; current_thread_info()->cpu = 0; + smp_tune_scheduling(); set_cpu_sibling_map(0); diff --git a/trunk/arch/i386/kernel/tsc.c b/trunk/arch/i386/kernel/tsc.c index ea63a30ca3e8..f64b81f3033b 100644 --- a/trunk/arch/i386/kernel/tsc.c +++ b/trunk/arch/i386/kernel/tsc.c @@ -4,7 +4,6 @@ * See comments there for proper credits. 
*/ -#include #include #include #include @@ -107,13 +106,8 @@ unsigned long long sched_clock(void) /* * Fall back to jiffies if there's no TSC available: - * ( But note that we still use it if the TSC is marked - * unstable. We do this because unlike Time Of Day, - * the scheduler clock tolerates small errors and it's - * very important for it to be as fast as the platform - * can achive it. ) */ - if (unlikely(!tsc_enabled && !tsc_unstable)) + if (unlikely(!tsc_enabled)) /* No locking but a rare wrong value is not a big deal: */ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); @@ -283,7 +277,6 @@ static struct clocksource clocksource_tsc = { void mark_tsc_unstable(char *reason) { - sched_clock_unstable_event(); if (!tsc_unstable) { tsc_unstable = 1; tsc_enabled = 0; diff --git a/trunk/arch/ia64/kernel/setup.c b/trunk/arch/ia64/kernel/setup.c index 188fb73c6845..eaa6a24bc0b6 100644 --- a/trunk/arch/ia64/kernel/setup.c +++ b/trunk/arch/ia64/kernel/setup.c @@ -805,6 +805,7 @@ static void __cpuinit get_max_cacheline_size (void) { unsigned long line_size, max = 1; + unsigned int cache_size = 0; u64 l, levels, unique_caches; pal_cache_config_info_t cci; s64 status; @@ -834,6 +835,8 @@ get_max_cacheline_size (void) line_size = 1 << cci.pcci_line_size; if (line_size > max) max = line_size; + if (cache_size < cci.pcci_cache_size) + cache_size = cci.pcci_cache_size; if (!cci.pcci_unified) { status = ia64_pal_cache_config_info(l, /* cache_type (instruction)= */ 1, @@ -850,6 +853,9 @@ get_max_cacheline_size (void) ia64_i_cache_stride_shift = cci.pcci_stride; } out: +#ifdef CONFIG_SMP + max_cache_size = max(max_cache_size, cache_size); +#endif if (max > ia64_max_cacheline_size) ia64_max_cacheline_size = max; } diff --git a/trunk/arch/mips/kernel/smp.c b/trunk/arch/mips/kernel/smp.c index a1b017f2dbb3..67edfa7ed93a 100644 --- a/trunk/arch/mips/kernel/smp.c +++ b/trunk/arch/mips/kernel/smp.c @@ -51,6 +51,16 @@ int __cpu_logical_map[NR_CPUS]; /* Map logical to physical */ EXPORT_SYMBOL(phys_cpu_present_map); EXPORT_SYMBOL(cpu_online_map); +/* This happens early in bootup, can't really do it better */ +static void smp_tune_scheduling (void) +{ + struct cache_desc *cd = ¤t_cpu_data.scache; + unsigned long cachesize = cd->linesz * cd->sets * cd->ways; + + if (cachesize > max_cache_size) + max_cache_size = cachesize; +} + extern void __init calibrate_delay(void); extern ATTRIB_NORET void cpu_idle(void); @@ -218,6 +228,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) { init_new_context(current, &init_mm); current_thread_info()->cpu = 0; + smp_tune_scheduling(); plat_prepare_cpus(max_cpus); #ifndef CONFIG_HOTPLUG_CPU cpu_present_map = cpu_possible_map; diff --git a/trunk/arch/sparc/kernel/smp.c b/trunk/arch/sparc/kernel/smp.c index 4fea3ac7bff0..4d9ad59031bb 100644 --- a/trunk/arch/sparc/kernel/smp.c +++ b/trunk/arch/sparc/kernel/smp.c @@ -68,6 +68,16 @@ void __cpuinit smp_store_cpu_info(int id) cpu_data(id).prom_node = cpu_node; cpu_data(id).mid = cpu_get_hwmid(cpu_node); + /* this is required to tune the scheduler correctly */ + /* is it possible to have CPUs with different cache sizes? 
*/ + if (id == boot_cpu_id) { + int cache_line,cache_nlines; + cache_line = 0x20; + cache_line = prom_getintdefault(cpu_node, "ecache-line-size", cache_line); + cache_nlines = 0x8000; + cache_nlines = prom_getintdefault(cpu_node, "ecache-nlines", cache_nlines); + max_cache_size = cache_line * cache_nlines; + } if (cpu_data(id).mid < 0) panic("No MID found for CPU%d at node 0x%08d", id, cpu_node); } diff --git a/trunk/arch/sparc64/kernel/smp.c b/trunk/arch/sparc64/kernel/smp.c index 40e40f968d61..4dcd7d0b60f2 100644 --- a/trunk/arch/sparc64/kernel/smp.c +++ b/trunk/arch/sparc64/kernel/smp.c @@ -1163,6 +1163,32 @@ int setup_profiling_timer(unsigned int multiplier) return -EINVAL; } +static void __init smp_tune_scheduling(void) +{ + unsigned int smallest = ~0U; + int i; + + for (i = 0; i < NR_CPUS; i++) { + unsigned int val = cpu_data(i).ecache_size; + + if (val && val < smallest) + smallest = val; + } + + /* Any value less than 256K is nonsense. */ + if (smallest < (256U * 1024U)) + smallest = 256 * 1024; + + max_cache_size = smallest; + + if (smallest < 1U * 1024U * 1024U) + printk(KERN_INFO "Using max_cache_size of %uKB\n", + smallest / 1024U); + else + printk(KERN_INFO "Using max_cache_size of %uMB\n", + smallest / 1024U / 1024U); +} + /* Constrain the number of cpus to max_cpus. */ void __init smp_prepare_cpus(unsigned int max_cpus) { @@ -1180,6 +1206,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) } cpu_data(boot_cpu_id).udelay_val = loops_per_jiffy; + smp_tune_scheduling(); } void __devinit smp_prepare_boot_cpu(void) diff --git a/trunk/drivers/ide/arm/icside.c b/trunk/drivers/ide/arm/icside.c index 444a0b84f5bd..66f826252aee 100644 --- a/trunk/drivers/ide/arm/icside.c +++ b/trunk/drivers/ide/arm/icside.c @@ -448,21 +448,23 @@ static int icside_dma_test_irq(ide_drive_t *drive) ICS_ARCIN_V6_INTRSTAT_1)) & 1; } -static void icside_dma_timeout(ide_drive_t *drive) +static int icside_dma_timeout(ide_drive_t *drive) { printk(KERN_ERR "%s: DMA timeout occurred: ", drive->name); if (icside_dma_test_irq(drive)) - return; + return 0; - ide_dump_status(drive, "DMA timeout", HWIF(drive)->INB(IDE_STATUS_REG)); + ide_dump_status(drive, "DMA timeout", + HWIF(drive)->INB(IDE_STATUS_REG)); - icside_dma_end(drive); + return icside_dma_end(drive); } -static void icside_dma_lost_irq(ide_drive_t *drive) +static int icside_dma_lostirq(ide_drive_t *drive) { printk(KERN_ERR "%s: IRQ lost\n", drive->name); + return 1; } static void icside_dma_init(ide_hwif_t *hwif) @@ -488,8 +490,8 @@ static void icside_dma_init(ide_hwif_t *hwif) hwif->dma_start = icside_dma_start; hwif->ide_dma_end = icside_dma_end; hwif->ide_dma_test_irq = icside_dma_test_irq; - hwif->dma_timeout = icside_dma_timeout; - hwif->dma_lost_irq = icside_dma_lost_irq; + hwif->ide_dma_timeout = icside_dma_timeout; + hwif->ide_dma_lostirq = icside_dma_lostirq; hwif->drives[0].autodma = hwif->autodma; hwif->drives[1].autodma = hwif->autodma; diff --git a/trunk/drivers/ide/cris/ide-cris.c b/trunk/drivers/ide/cris/ide-cris.c index 886091bc7db0..ca0341c05e55 100644 --- a/trunk/drivers/ide/cris/ide-cris.c +++ b/trunk/drivers/ide/cris/ide-cris.c @@ -819,7 +819,7 @@ init_e100_ide (void) hwif->dma_host_off = &cris_dma_off; hwif->dma_host_on = &cris_dma_on; hwif->dma_off_quietly = &cris_dma_off; - hwif->cbl = ATA_CBL_PATA40; + hwif->udma_four = 0; hwif->ultra_mask = cris_ultra_mask; hwif->mwdma_mask = 0x07; /* Multiword DMA 0-2 */ hwif->autodma = 1; diff --git a/trunk/drivers/ide/ide-cd.c b/trunk/drivers/ide/ide-cd.c index 1486eb212ccc..252ab8295edf 
100644 --- a/trunk/drivers/ide/ide-cd.c +++ b/trunk/drivers/ide/ide-cd.c @@ -481,7 +481,7 @@ void cdrom_analyze_sense_data(ide_drive_t *drive, else printk(" Unknown Error Type: "); - if (sense->sense_key < ARRAY_SIZE(sense_key_texts)) + if (sense->sense_key < ARY_LEN(sense_key_texts)) s = sense_key_texts[sense->sense_key]; printk("%s -- (Sense key=0x%02x)\n", s, sense->sense_key); @@ -491,7 +491,7 @@ void cdrom_analyze_sense_data(ide_drive_t *drive, sense->ascq); s = buf; } else { - int lo = 0, mid, hi = ARRAY_SIZE(sense_data_texts); + int lo = 0, mid, hi = ARY_LEN(sense_data_texts); unsigned long key = (sense->sense_key << 16); key |= (sense->asc << 8); if (!(sense->ascq >= 0x80 && sense->ascq <= 0xdd)) @@ -524,7 +524,7 @@ void cdrom_analyze_sense_data(ide_drive_t *drive, if (failed_command != NULL) { - int lo=0, mid, hi= ARRAY_SIZE(packet_command_texts); + int lo=0, mid, hi= ARY_LEN (packet_command_texts); s = NULL; while (hi > lo) { diff --git a/trunk/drivers/ide/ide-cd.h b/trunk/drivers/ide/ide-cd.h index 228b29c5d2e4..ad1f2ed14a37 100644 --- a/trunk/drivers/ide/ide-cd.h +++ b/trunk/drivers/ide/ide-cd.h @@ -498,6 +498,8 @@ struct cdrom_info { * Descriptions of ATAPI error codes. */ +#define ARY_LEN(a) ((sizeof(a) / sizeof(a[0]))) + /* This stuff should be in cdrom.h, since it is now generic... */ /* ATAPI sense keys (from table 140 of ATAPI 2.6) */ diff --git a/trunk/drivers/ide/ide-disk.c b/trunk/drivers/ide/ide-disk.c index b1304a7f3e0a..dc2175c81f5e 100644 --- a/trunk/drivers/ide/ide-disk.c +++ b/trunk/drivers/ide/ide-disk.c @@ -1190,11 +1190,11 @@ static int idedisk_ioctl(struct inode *inode, struct file *file, return generic_ide_ioctl(drive, file, bdev, cmd, arg); read_val: - mutex_lock(&ide_setting_mtx); + down(&ide_setting_sem); spin_lock_irqsave(&ide_lock, flags); err = *val; spin_unlock_irqrestore(&ide_lock, flags); - mutex_unlock(&ide_setting_mtx); + up(&ide_setting_sem); return err >= 0 ? 
put_user(err, (long __user *)arg) : err; set_val: @@ -1204,9 +1204,9 @@ static int idedisk_ioctl(struct inode *inode, struct file *file, if (!capable(CAP_SYS_ADMIN)) err = -EACCES; else { - mutex_lock(&ide_setting_mtx); + down(&ide_setting_sem); err = setfunc(drive, arg); - mutex_unlock(&ide_setting_mtx); + up(&ide_setting_sem); } } return err; diff --git a/trunk/drivers/ide/ide-dma.c b/trunk/drivers/ide/ide-dma.c index 5fe1d72ab451..ead141e2db9e 100644 --- a/trunk/drivers/ide/ide-dma.c +++ b/trunk/drivers/ide/ide-dma.c @@ -91,45 +91,45 @@ static const struct drive_list_entry drive_whitelist [] = { - { "Micropolis 2112A" , NULL }, - { "CONNER CTMA 4000" , NULL }, - { "CONNER CTT8000-A" , NULL }, - { "ST34342A" , NULL }, + { "Micropolis 2112A" , "ALL" }, + { "CONNER CTMA 4000" , "ALL" }, + { "CONNER CTT8000-A" , "ALL" }, + { "ST34342A" , "ALL" }, { NULL , NULL } }; static const struct drive_list_entry drive_blacklist [] = { - { "WDC AC11000H" , NULL }, - { "WDC AC22100H" , NULL }, - { "WDC AC32500H" , NULL }, - { "WDC AC33100H" , NULL }, - { "WDC AC31600H" , NULL }, + { "WDC AC11000H" , "ALL" }, + { "WDC AC22100H" , "ALL" }, + { "WDC AC32500H" , "ALL" }, + { "WDC AC33100H" , "ALL" }, + { "WDC AC31600H" , "ALL" }, { "WDC AC32100H" , "24.09P07" }, { "WDC AC23200L" , "21.10N21" }, - { "Compaq CRD-8241B" , NULL }, - { "CRD-8400B" , NULL }, - { "CRD-8480B", NULL }, - { "CRD-8482B", NULL }, - { "CRD-84" , NULL }, - { "SanDisk SDP3B" , NULL }, - { "SanDisk SDP3B-64" , NULL }, - { "SANYO CD-ROM CRD" , NULL }, - { "HITACHI CDR-8" , NULL }, - { "HITACHI CDR-8335" , NULL }, - { "HITACHI CDR-8435" , NULL }, - { "Toshiba CD-ROM XM-6202B" , NULL }, - { "TOSHIBA CD-ROM XM-1702BC", NULL }, - { "CD-532E-A" , NULL }, - { "E-IDE CD-ROM CR-840", NULL }, - { "CD-ROM Drive/F5A", NULL }, - { "WPI CDD-820", NULL }, - { "SAMSUNG CD-ROM SC-148C", NULL }, - { "SAMSUNG CD-ROM SC", NULL }, - { "ATAPI CD-ROM DRIVE 40X MAXIMUM", NULL }, - { "_NEC DV5800A", NULL }, + { "Compaq CRD-8241B" , "ALL" }, + { "CRD-8400B" , "ALL" }, + { "CRD-8480B", "ALL" }, + { "CRD-8482B", "ALL" }, + { "CRD-84" , "ALL" }, + { "SanDisk SDP3B" , "ALL" }, + { "SanDisk SDP3B-64" , "ALL" }, + { "SANYO CD-ROM CRD" , "ALL" }, + { "HITACHI CDR-8" , "ALL" }, + { "HITACHI CDR-8335" , "ALL" }, + { "HITACHI CDR-8435" , "ALL" }, + { "Toshiba CD-ROM XM-6202B" , "ALL" }, + { "TOSHIBA CD-ROM XM-1702BC", "ALL" }, + { "CD-532E-A" , "ALL" }, + { "E-IDE CD-ROM CR-840", "ALL" }, + { "CD-ROM Drive/F5A", "ALL" }, + { "WPI CDD-820", "ALL" }, + { "SAMSUNG CD-ROM SC-148C", "ALL" }, + { "SAMSUNG CD-ROM SC", "ALL" }, + { "ATAPI CD-ROM DRIVE 40X MAXIMUM", "ALL" }, + { "_NEC DV5800A", "ALL" }, { "SAMSUNG CD-ROM SN-124", "N001" }, - { "Seagate STT20000A", NULL }, + { "Seagate STT20000A", "ALL" }, { NULL , NULL } }; @@ -147,8 +147,8 @@ int ide_in_drive_list(struct hd_driveid *id, const struct drive_list_entry *driv { for ( ; drive_table->id_model ; drive_table++) if ((!strcmp(drive_table->id_model, id->model)) && - (!drive_table->id_firmware || - strstr(id->fw_rev, drive_table->id_firmware))) + ((strstr(id->fw_rev, drive_table->id_firmware)) || + (!strcmp(drive_table->id_firmware, "ALL")))) return 1; return 0; } @@ -702,22 +702,8 @@ static unsigned int ide_get_mode_mask(ide_drive_t *drive, u8 base) mask = id->dma_mword & hwif->mwdma_mask; break; case XFER_SW_DMA_0: - if (id->field_valid & 2) { + if (id->field_valid & 2) mask = id->dma_1word & hwif->swdma_mask; - } else if (id->tDMA) { - /* - * ide_fix_driveid() doesn't convert ->tDMA to the - * CPU endianness so we need to do 
it here - */ - u8 mode = le16_to_cpu(id->tDMA); - - /* - * if the mode is valid convert it to the mask - * (the maximum allowed mode is XFER_SW_DMA_2) - */ - if (mode <= 2) - mask = ((2 << mode) - 1) & hwif->swdma_mask; - } break; default: BUG(); @@ -861,27 +847,27 @@ int ide_set_dma(ide_drive_t *drive) return rc; } +EXPORT_SYMBOL_GPL(ide_set_dma); + #ifdef CONFIG_BLK_DEV_IDEDMA_PCI -void ide_dma_lost_irq (ide_drive_t *drive) +int __ide_dma_lostirq (ide_drive_t *drive) { printk("%s: DMA interrupt recovery\n", drive->name); + return 1; } -EXPORT_SYMBOL(ide_dma_lost_irq); +EXPORT_SYMBOL(__ide_dma_lostirq); -void ide_dma_timeout (ide_drive_t *drive) +int __ide_dma_timeout (ide_drive_t *drive) { - ide_hwif_t *hwif = HWIF(drive); - printk(KERN_ERR "%s: timeout waiting for DMA\n", drive->name); + if (HWIF(drive)->ide_dma_test_irq(drive)) + return 0; - if (hwif->ide_dma_test_irq(drive)) - return; - - hwif->ide_dma_end(drive); + return HWIF(drive)->ide_dma_end(drive); } -EXPORT_SYMBOL(ide_dma_timeout); +EXPORT_SYMBOL(__ide_dma_timeout); /* * Needed for allowing full modular support of ide-driver @@ -1032,10 +1018,10 @@ void ide_setup_dma (ide_hwif_t *hwif, unsigned long dma_base, unsigned int num_p hwif->ide_dma_end = &__ide_dma_end; if (!hwif->ide_dma_test_irq) hwif->ide_dma_test_irq = &__ide_dma_test_irq; - if (!hwif->dma_timeout) - hwif->dma_timeout = &ide_dma_timeout; - if (!hwif->dma_lost_irq) - hwif->dma_lost_irq = &ide_dma_lost_irq; + if (!hwif->ide_dma_timeout) + hwif->ide_dma_timeout = &__ide_dma_timeout; + if (!hwif->ide_dma_lostirq) + hwif->ide_dma_lostirq = &__ide_dma_lostirq; if (hwif->chipset != ide_trm290) { u8 dma_stat = hwif->INB(hwif->dma_status); diff --git a/trunk/drivers/ide/ide-io.c b/trunk/drivers/ide/ide-io.c index c5b5011da56e..bfe8f1b712ba 100644 --- a/trunk/drivers/ide/ide-io.c +++ b/trunk/drivers/ide/ide-io.c @@ -1350,7 +1350,7 @@ static ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) hwif->INB(IDE_STATUS_REG)); } else { printk(KERN_WARNING "%s: DMA timeout retry\n", drive->name); - hwif->dma_timeout(drive); + (void) hwif->ide_dma_timeout(drive); } /* @@ -1466,7 +1466,7 @@ void ide_timer_expiry (unsigned long data) startstop = handler(drive); } else if (drive_is_ready(drive)) { if (drive->waiting_for_dma) - hwgroup->hwif->dma_lost_irq(drive); + (void) hwgroup->hwif->ide_dma_lostirq(drive); (void)ide_ack_intr(hwif); printk(KERN_WARNING "%s: lost interrupt\n", drive->name); startstop = handler(drive); diff --git a/trunk/drivers/ide/ide-iops.c b/trunk/drivers/ide/ide-iops.c index 92578b6832e9..f0be5f665a0e 100644 --- a/trunk/drivers/ide/ide-iops.c +++ b/trunk/drivers/ide/ide-iops.c @@ -574,10 +574,7 @@ u8 eighty_ninty_three (ide_drive_t *drive) ide_hwif_t *hwif = drive->hwif; struct hd_driveid *id = drive->id; - if (hwif->cbl == ATA_CBL_PATA40_SHORT) - return 1; - - if (hwif->cbl != ATA_CBL_PATA80) + if (hwif->udma_four == 0) goto no_80w; /* Check for SATA but only if we are ATA5 or higher */ @@ -603,8 +600,7 @@ u8 eighty_ninty_three (ide_drive_t *drive) printk(KERN_WARNING "%s: %s side 80-wire cable detection failed, " "limiting max speed to UDMA33\n", - drive->name, - hwif->cbl == ATA_CBL_PATA80 ? "drive" : "host"); + drive->name, hwif->udma_four ? 
"drive" : "host"); drive->udma33_warned = 1; diff --git a/trunk/drivers/ide/ide-probe.c b/trunk/drivers/ide/ide-probe.c index cc5801399467..f5ce22c38f82 100644 --- a/trunk/drivers/ide/ide-probe.c +++ b/trunk/drivers/ide/ide-probe.c @@ -144,7 +144,7 @@ static inline void do_identify (ide_drive_t *drive, u8 cmd) local_irq_enable(); ide_fix_driveid(id); -#if defined (CONFIG_SCSI_EATA_PIO) || defined (CONFIG_SCSI_EATA) +#if defined (CONFIG_SCSI_EATA_DMA) || defined (CONFIG_SCSI_EATA_PIO) || defined (CONFIG_SCSI_EATA) /* * EATA SCSI controllers do a hardware ATA emulation: * Ignore them if there is a driver for them available. @@ -154,7 +154,7 @@ static inline void do_identify (ide_drive_t *drive, u8 cmd) printk("%s: EATA SCSI HBA %.10s\n", drive->name, id->model); goto err_misc; } -#endif /* CONFIG_SCSI_EATA || CONFIG_SCSI_EATA_PIO */ +#endif /* CONFIG_SCSI_EATA_DMA || CONFIG_SCSI_EATA_PIO */ /* * WIN_IDENTIFY returns little-endian info, @@ -1025,7 +1025,7 @@ static int init_irq (ide_hwif_t *hwif) BUG_ON(irqs_disabled()); BUG_ON(hwif == NULL); - mutex_lock(&ide_cfg_mtx); + down(&ide_cfg_sem); hwif->hwgroup = NULL; #if MAX_HWIFS > 1 /* @@ -1154,7 +1154,7 @@ static int init_irq (ide_hwif_t *hwif) printk(" (%sed with %s)", hwif->sharing_irq ? "shar" : "serializ", match->name); printk("\n"); - mutex_unlock(&ide_cfg_mtx); + up(&ide_cfg_sem); return 0; out_unlink: spin_lock_irq(&ide_lock); @@ -1177,7 +1177,7 @@ static int init_irq (ide_hwif_t *hwif) } spin_unlock_irq(&ide_lock); out_up: - mutex_unlock(&ide_cfg_mtx); + up(&ide_cfg_sem); return 1; } diff --git a/trunk/drivers/ide/ide-proc.c b/trunk/drivers/ide/ide-proc.c index fc1d8ae6a803..ea94c9aa1220 100644 --- a/trunk/drivers/ide/ide-proc.c +++ b/trunk/drivers/ide/ide-proc.c @@ -156,7 +156,7 @@ static int __ide_add_setting(ide_drive_t *drive, const char *name, int rw, int d { ide_settings_t **p = (ide_settings_t **) &drive->settings, *setting = NULL; - mutex_lock(&ide_setting_mtx); + down(&ide_setting_sem); while ((*p) && strcmp((*p)->name, name) < 0) p = &((*p)->next); if ((setting = kzalloc(sizeof(*setting), GFP_KERNEL)) == NULL) @@ -177,10 +177,10 @@ static int __ide_add_setting(ide_drive_t *drive, const char *name, int rw, int d if (auto_remove) setting->auto_remove = 1; *p = setting; - mutex_unlock(&ide_setting_mtx); + up(&ide_setting_sem); return 0; abort: - mutex_unlock(&ide_setting_mtx); + up(&ide_setting_sem); kfree(setting); return -1; } @@ -224,7 +224,7 @@ static void __ide_remove_setting (ide_drive_t *drive, char *name) * * Automatically remove all the driver specific settings for this * drive. This function may not be called from IRQ context. The - * caller must hold ide_setting_mtx. + * caller must hold ide_setting_sem. */ static void auto_remove_settings (ide_drive_t *drive) @@ -269,7 +269,7 @@ static ide_settings_t *ide_find_setting_by_name(ide_drive_t *drive, char *name) * @setting: drive setting * * Read a drive setting and return the value. The caller - * must hold the ide_setting_mtx when making this call. + * must hold the ide_setting_sem when making this call. * * BUGS: the data return and error are the same return value * so an error -EINVAL and true return of the same value cannot @@ -306,7 +306,7 @@ static int ide_read_setting(ide_drive_t *drive, ide_settings_t *setting) * @val: value * * Write a drive setting if it is possible. The caller - * must hold the ide_setting_mtx when making this call. + * must hold the ide_setting_sem when making this call. 
* * BUGS: the data return and error are the same return value * so an error -EINVAL and true return of the same value cannot @@ -367,7 +367,7 @@ static int set_xfer_rate (ide_drive_t *drive, int arg) * @drive: drive being configured * * Add the generic parts of the system settings to the /proc files. - * The caller must not be holding the ide_setting_mtx. + * The caller must not be holding the ide_setting_sem. */ void ide_add_generic_settings (ide_drive_t *drive) @@ -408,7 +408,7 @@ static int proc_ide_read_settings proc_ide_settings_warn(); - mutex_lock(&ide_setting_mtx); + down(&ide_setting_sem); out += sprintf(out, "name\t\t\tvalue\t\tmin\t\tmax\t\tmode\n"); out += sprintf(out, "----\t\t\t-----\t\t---\t\t---\t\t----\n"); while(setting) { @@ -428,7 +428,7 @@ static int proc_ide_read_settings setting = setting->next; } len = out - page; - mutex_unlock(&ide_setting_mtx); + up(&ide_setting_sem); PROC_IDE_READ_RETURN(page,start,off,count,eof,len); } @@ -508,16 +508,16 @@ static int proc_ide_write_settings(struct file *file, const char __user *buffer, ++p; } - mutex_lock(&ide_setting_mtx); + down(&ide_setting_sem); setting = ide_find_setting_by_name(drive, name); if (!setting) { - mutex_unlock(&ide_setting_mtx); + up(&ide_setting_sem); goto parse_error; } if (for_real) ide_write_setting(drive, setting, val * setting->div_factor / setting->mul_factor); - mutex_unlock(&ide_setting_mtx); + up(&ide_setting_sem); } } while (!for_real++); free_page((unsigned long)buf); @@ -705,7 +705,7 @@ EXPORT_SYMBOL(ide_proc_register_driver); * Clean up the driver specific /proc files and IDE settings * for a given drive. * - * Takes ide_setting_mtx and ide_lock. + * Takes ide_setting_sem and ide_lock. * Caller must hold none of the locks. */ @@ -715,10 +715,10 @@ void ide_proc_unregister_driver(ide_drive_t *drive, ide_driver_t *driver) ide_remove_proc_entries(drive->proc, driver->proc); - mutex_lock(&ide_setting_mtx); + down(&ide_setting_sem); spin_lock_irqsave(&ide_lock, flags); /* - * ide_setting_mtx protects the settings list + * ide_setting_sem protects the settings list * ide_lock protects the use of settings * * so we need to hold both, ide_settings_sem because we want to @@ -726,11 +726,11 @@ void ide_proc_unregister_driver(ide_drive_t *drive, ide_driver_t *driver) * a setting out that is being used. * * OTOH both ide_{read,write}_setting are only ever used under - * ide_setting_mtx. + * ide_setting_sem. 
*/ auto_remove_settings(drive); spin_unlock_irqrestore(&ide_lock, flags); - mutex_unlock(&ide_setting_mtx); + up(&ide_setting_sem); } EXPORT_SYMBOL(ide_proc_unregister_driver); diff --git a/trunk/drivers/ide/ide-timing.h b/trunk/drivers/ide/ide-timing.h index e6cb8593b5ba..c0864b1e9228 100644 --- a/trunk/drivers/ide/ide-timing.h +++ b/trunk/drivers/ide/ide-timing.h @@ -102,16 +102,66 @@ static struct ide_timing ide_timing[] = { #define EZ(v,unit) ((v)?ENOUGH(v,unit):0) #define XFER_MODE 0xf0 +#define XFER_UDMA_133 0x48 +#define XFER_UDMA_100 0x44 +#define XFER_UDMA_66 0x42 +#define XFER_UDMA 0x40 #define XFER_MWDMA 0x20 +#define XFER_SWDMA 0x10 #define XFER_EPIO 0x01 #define XFER_PIO 0x00 -static short ide_find_best_pio_mode(ide_drive_t *drive) +static short ide_find_best_mode(ide_drive_t *drive, int map) { struct hd_driveid *id = drive->id; short best = 0; - if (id->field_valid & 2) { /* EIDE PIO modes */ + if (!id) + return XFER_PIO_SLOW; + + if ((map & XFER_UDMA) && (id->field_valid & 4)) { /* Want UDMA and UDMA bitmap valid */ + + if ((map & XFER_UDMA_133) == XFER_UDMA_133) + if ((best = (id->dma_ultra & 0x0040) ? XFER_UDMA_6 : 0)) return best; + + if ((map & XFER_UDMA_100) == XFER_UDMA_100) + if ((best = (id->dma_ultra & 0x0020) ? XFER_UDMA_5 : 0)) return best; + + if ((map & XFER_UDMA_66) == XFER_UDMA_66) + if ((best = (id->dma_ultra & 0x0010) ? XFER_UDMA_4 : + (id->dma_ultra & 0x0008) ? XFER_UDMA_3 : 0)) return best; + + if ((best = (id->dma_ultra & 0x0004) ? XFER_UDMA_2 : + (id->dma_ultra & 0x0002) ? XFER_UDMA_1 : + (id->dma_ultra & 0x0001) ? XFER_UDMA_0 : 0)) return best; + } + + if ((map & XFER_MWDMA) && (id->field_valid & 2)) { /* Want MWDMA and drive has EIDE fields */ + + if ((best = (id->dma_mword & 0x0004) ? XFER_MW_DMA_2 : + (id->dma_mword & 0x0002) ? XFER_MW_DMA_1 : + (id->dma_mword & 0x0001) ? XFER_MW_DMA_0 : 0)) return best; + } + + if (map & XFER_SWDMA) { /* Want SWDMA */ + + if (id->field_valid & 2) { /* EIDE SWDMA */ + + if ((best = (id->dma_1word & 0x0004) ? XFER_SW_DMA_2 : + (id->dma_1word & 0x0002) ? XFER_SW_DMA_1 : + (id->dma_1word & 0x0001) ? XFER_SW_DMA_0 : 0)) return best; + } + + if (id->capability & 1) { /* Pre-EIDE style SWDMA */ + + if ((best = (id->tDMA == 2) ? XFER_SW_DMA_2 : + (id->tDMA == 1) ? XFER_SW_DMA_1 : + (id->tDMA == 0) ? XFER_SW_DMA_0 : 0)) return best; + } + } + + + if ((map & XFER_EPIO) && (id->field_valid & 2)) { /* EIDE PIO modes */ if ((best = (drive->id->eide_pio_modes & 4) ? XFER_PIO_5 : (drive->id->eide_pio_modes & 2) ? 
XFER_PIO_4 : @@ -212,7 +262,7 @@ static int ide_timing_compute(ide_drive_t *drive, short speed, struct ide_timing */ if ((speed & XFER_MODE) != XFER_PIO) { - ide_timing_compute(drive, ide_find_best_pio_mode(drive), &p, T, UT); + ide_timing_compute(drive, ide_find_best_mode(drive, XFER_PIO | XFER_EPIO), &p, T, UT); ide_timing_merge(&p, t, t, IDE_TIMING_ALL); } diff --git a/trunk/drivers/ide/ide.c b/trunk/drivers/ide/ide.c index c948a5c17a5d..0cd76bf66833 100644 --- a/trunk/drivers/ide/ide.c +++ b/trunk/drivers/ide/ide.c @@ -169,7 +169,7 @@ static const u8 ide_hwif_to_major[] = { IDE0_MAJOR, IDE1_MAJOR, static int idebus_parameter; /* holds the "idebus=" parameter */ static int system_bus_speed; /* holds what we think is VESA/PCI bus speed */ -DEFINE_MUTEX(ide_cfg_mtx); +DECLARE_MUTEX(ide_cfg_sem); __cacheline_aligned_in_smp DEFINE_SPINLOCK(ide_lock); #ifdef CONFIG_IDEPCI_PCIBUS_ORDER @@ -460,8 +460,6 @@ static void ide_hwif_restore(ide_hwif_t *hwif, ide_hwif_t *tmp_hwif) hwif->mwdma_mask = tmp_hwif->mwdma_mask; hwif->swdma_mask = tmp_hwif->swdma_mask; - hwif->cbl = tmp_hwif->cbl; - hwif->chipset = tmp_hwif->chipset; hwif->hold = tmp_hwif->hold; @@ -498,8 +496,8 @@ static void ide_hwif_restore(ide_hwif_t *hwif, ide_hwif_t *tmp_hwif) hwif->ide_dma_clear_irq = tmp_hwif->ide_dma_clear_irq; hwif->dma_host_on = tmp_hwif->dma_host_on; hwif->dma_host_off = tmp_hwif->dma_host_off; - hwif->dma_lost_irq = tmp_hwif->dma_lost_irq; - hwif->dma_timeout = tmp_hwif->dma_timeout; + hwif->ide_dma_lostirq = tmp_hwif->ide_dma_lostirq; + hwif->ide_dma_timeout = tmp_hwif->ide_dma_timeout; hwif->OUTB = tmp_hwif->OUTB; hwif->OUTBSYNC = tmp_hwif->OUTBSYNC; @@ -535,6 +533,7 @@ static void ide_hwif_restore(ide_hwif_t *hwif, ide_hwif_t *tmp_hwif) hwif->extra_base = tmp_hwif->extra_base; hwif->extra_ports = tmp_hwif->extra_ports; hwif->autodma = tmp_hwif->autodma; + hwif->udma_four = tmp_hwif->udma_four; hwif->hwif_data = tmp_hwif->hwif_data; } @@ -565,7 +564,7 @@ void ide_unregister(unsigned int index) { ide_drive_t *drive; ide_hwif_t *hwif, *g; - static ide_hwif_t tmp_hwif; /* protected by ide_cfg_mtx */ + static ide_hwif_t tmp_hwif; /* protected by ide_cfg_sem */ ide_hwgroup_t *hwgroup; int irq_count = 0, unit; @@ -573,7 +572,7 @@ void ide_unregister(unsigned int index) BUG_ON(in_interrupt()); BUG_ON(irqs_disabled()); - mutex_lock(&ide_cfg_mtx); + down(&ide_cfg_sem); spin_lock_irq(&ide_lock); hwif = &ide_hwifs[index]; if (!hwif->present) @@ -680,7 +679,7 @@ void ide_unregister(unsigned int index) abort: spin_unlock_irq(&ide_lock); - mutex_unlock(&ide_cfg_mtx); + up(&ide_cfg_sem); } EXPORT_SYMBOL(ide_unregister); @@ -818,9 +817,9 @@ EXPORT_SYMBOL(ide_register_hw); * Locks for IDE setting functionality */ -DEFINE_MUTEX(ide_setting_mtx); +DECLARE_MUTEX(ide_setting_sem); -EXPORT_SYMBOL_GPL(ide_setting_mtx); +EXPORT_SYMBOL_GPL(ide_setting_sem); /** * ide_spin_wait_hwgroup - wait for group @@ -1193,11 +1192,11 @@ int generic_ide_ioctl(ide_drive_t *drive, struct file *file, struct block_device } read_val: - mutex_lock(&ide_setting_mtx); + down(&ide_setting_sem); spin_lock_irqsave(&ide_lock, flags); err = *val; spin_unlock_irqrestore(&ide_lock, flags); - mutex_unlock(&ide_setting_mtx); + up(&ide_setting_sem); return err >= 0 ? 
put_user(err, (long __user *)arg) : err; set_val: @@ -1207,9 +1206,9 @@ int generic_ide_ioctl(ide_drive_t *drive, struct file *file, struct block_device if (!capable(CAP_SYS_ADMIN)) err = -EACCES; else { - mutex_lock(&ide_setting_mtx); + down(&ide_setting_sem); err = setfunc(drive, arg); - mutex_unlock(&ide_setting_mtx); + up(&ide_setting_sem); } } return err; @@ -1549,11 +1548,7 @@ static int __init ide_setup(char *s) goto bad_option; case -7: /* ata66 */ #ifdef CONFIG_BLK_DEV_IDEPCI - /* - * Use ATA_CBL_PATA40_SHORT so drive side - * cable detection is also overriden. - */ - hwif->cbl = ATA_CBL_PATA40_SHORT; + hwif->udma_four = 1; goto obsolete_option; #else goto bad_hwif; diff --git a/trunk/drivers/ide/legacy/hd.c b/trunk/drivers/ide/legacy/hd.c index 661c12f6dda6..45ed03591cd8 100644 --- a/trunk/drivers/ide/legacy/hd.c +++ b/trunk/drivers/ide/legacy/hd.c @@ -130,7 +130,7 @@ struct hd_i_struct { #ifdef HD_TYPE static struct hd_i_struct hd_info[] = { HD_TYPE }; -static int NR_HD = ARRAY_SIZE(hd_info); +static int NR_HD = ((sizeof (hd_info))/(sizeof (struct hd_i_struct))); #else static struct hd_i_struct hd_info[MAX_HD]; static int NR_HD; diff --git a/trunk/drivers/ide/legacy/macide.c b/trunk/drivers/ide/legacy/macide.c index b557c45a5a9d..c211fc78345d 100644 --- a/trunk/drivers/ide/legacy/macide.c +++ b/trunk/drivers/ide/legacy/macide.c @@ -77,6 +77,15 @@ int macide_ack_intr(ide_hwif_t* hwif) return 0; } +#ifdef CONFIG_BLK_DEV_MAC_MEDIABAY +static void macide_mediabay_interrupt(int irq, void *dev_id) +{ + int state = baboon->mb_status & 0x04; + + printk(KERN_INFO "macide: media bay %s detected\n", state? "removal":"insertion"); +} +#endif + /* * Probe for a Macintosh IDE interface */ @@ -119,6 +128,11 @@ void macide_init(void) ide_drive_t *drive = &ide_hwifs[index].drives[0]; drive->capacity64 = drive->cyl*drive->head*drive->sect; +#ifdef CONFIG_BLK_DEV_MAC_MEDIABAY + request_irq(IRQ_BABOON_2, macide_mediabay_interrupt, + IRQ_FLG_FAST, "mediabay", + macide_mediabay_interrupt); +#endif } break; diff --git a/trunk/drivers/ide/mips/au1xxx-ide.c b/trunk/drivers/ide/mips/au1xxx-ide.c index 2e7013a2a7f6..ca95e990862e 100644 --- a/trunk/drivers/ide/mips/au1xxx-ide.c +++ b/trunk/drivers/ide/mips/au1xxx-ide.c @@ -381,7 +381,9 @@ static int auide_dma_setup(ide_drive_t *drive) static int auide_dma_check(ide_drive_t *drive) { - u8 speed = ide_max_dma_mode(drive); + u8 speed; + +#ifdef CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA if( dbdma_init_done == 0 ){ auide_hwif.white_list = ide_in_drive_list(drive->id, @@ -392,6 +394,7 @@ static int auide_dma_check(ide_drive_t *drive) auide_ddma_init(&auide_hwif); dbdma_init_done = 1; } +#endif /* Is the drive in our DMA black list? 
*/ @@ -406,6 +409,8 @@ static int auide_dma_check(ide_drive_t *drive) else drive->using_dma = 1; + speed = ide_find_best_mode(drive, XFER_PIO | XFER_MWDMA); + if (drive->autodma && (speed & XFER_MODE) != XFER_PIO) return 0; @@ -451,9 +456,10 @@ static void auide_dma_off_quietly(ide_drive_t *drive) drive->using_dma = 0; } -static void auide_dma_lost_irq(ide_drive_t *drive) +static int auide_dma_lostirq(ide_drive_t *drive) { printk(KERN_ERR "%s: IRQ lost\n", drive->name); + return 0; } static void auide_ddma_tx_callback(int irq, void *param) @@ -483,16 +489,16 @@ static void auide_init_dbdma_dev(dbdev_tab_t *dev, u32 dev_id, u32 tsize, u32 de #if defined(CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA) -static void auide_dma_timeout(ide_drive_t *drive) +static int auide_dma_timeout(ide_drive_t *drive) { - ide_hwif_t *hwif = HWIF(drive); +// printk("%s\n", __FUNCTION__); printk(KERN_ERR "%s: DMA timeout occurred: ", drive->name); - if (hwif->ide_dma_test_irq(drive)) - return; + if (HWIF(drive)->ide_dma_test_irq(drive)) + return 0; - hwif->ide_dma_end(drive); + return HWIF(drive)->ide_dma_end(drive); } @@ -715,7 +721,7 @@ static int au_ide_probe(struct device *dev) #ifdef CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA hwif->dma_off_quietly = &auide_dma_off_quietly; - hwif->dma_timeout = &auide_dma_timeout; + hwif->ide_dma_timeout = &auide_dma_timeout; hwif->ide_dma_check = &auide_dma_check; hwif->dma_exec_cmd = &auide_dma_exec_cmd; @@ -725,7 +731,7 @@ static int au_ide_probe(struct device *dev) hwif->ide_dma_test_irq = &auide_dma_test_irq; hwif->dma_host_off = &auide_dma_host_off; hwif->dma_host_on = &auide_dma_host_on; - hwif->dma_lost_irq = &auide_dma_lost_irq; + hwif->ide_dma_lostirq = &auide_dma_lostirq; hwif->ide_dma_on = &auide_dma_on; hwif->autodma = 1; diff --git a/trunk/drivers/ide/pci/aec62xx.c b/trunk/drivers/ide/pci/aec62xx.c index e5d09367627e..b173bc66ce1e 100644 --- a/trunk/drivers/ide/pci/aec62xx.c +++ b/trunk/drivers/ide/pci/aec62xx.c @@ -1,5 +1,5 @@ /* - * linux/drivers/ide/pci/aec62xx.c Version 0.24 May 24, 2007 + * linux/drivers/ide/pci/aec62xx.c Version 0.21 Apr 21, 2007 * * Copyright (C) 1999-2002 Andre Hedrick * Copyright (C) 2007 MontaVista Software, Inc. 
@@ -140,10 +140,25 @@ static int aec6260_tune_chipset (ide_drive_t *drive, u8 xferspeed) return(ide_config_drive_speed(drive, speed)); } +static int aec62xx_tune_chipset (ide_drive_t *drive, u8 speed) +{ + switch (HWIF(drive)->pci_dev->device) { + case PCI_DEVICE_ID_ARTOP_ATP865: + case PCI_DEVICE_ID_ARTOP_ATP865R: + case PCI_DEVICE_ID_ARTOP_ATP860: + case PCI_DEVICE_ID_ARTOP_ATP860R: + return ((int) aec6260_tune_chipset(drive, speed)); + case PCI_DEVICE_ID_ARTOP_ATP850UF: + return ((int) aec6210_tune_chipset(drive, speed)); + default: + return -1; + } +} + static void aec62xx_tune_drive (ide_drive_t *drive, u8 pio) { pio = ide_get_best_pio_mode(drive, pio, 4, NULL); - (void) HWIF(drive)->speedproc(drive, pio + XFER_PIO_0); + (void) aec62xx_tune_chipset(drive, pio + XFER_PIO_0); } static int aec62xx_config_drive_xfer_rate (ide_drive_t *drive) @@ -157,9 +172,12 @@ static int aec62xx_config_drive_xfer_rate (ide_drive_t *drive) return -1; } -static void aec62xx_dma_lost_irq (ide_drive_t *drive) +static int aec62xx_irq_timeout (ide_drive_t *drive) { - switch (HWIF(drive)->pci_dev->device) { + ide_hwif_t *hwif = HWIF(drive); + struct pci_dev *dev = hwif->pci_dev; + + switch(dev->device) { case PCI_DEVICE_ID_ARTOP_ATP860: case PCI_DEVICE_ID_ARTOP_ATP860R: case PCI_DEVICE_ID_ARTOP_ATP865: @@ -168,6 +186,7 @@ static void aec62xx_dma_lost_irq (ide_drive_t *drive) default: break; } + return 0; } static unsigned int __devinit init_chipset_aec62xx(struct pci_dev *dev, const char *name) @@ -205,46 +224,64 @@ static unsigned int __devinit init_chipset_aec62xx(struct pci_dev *dev, const ch static void __devinit init_hwif_aec62xx(ide_hwif_t *hwif) { - struct pci_dev *dev = hwif->pci_dev; - u8 reg54 = 0, mask = hwif->channel ? 0xf0 : 0x0f; - unsigned long flags; + struct pci_dev *dev = hwif->pci_dev; + hwif->autodma = 0; hwif->tuneproc = &aec62xx_tune_drive; + hwif->speedproc = &aec62xx_tune_chipset; - if (dev->device == PCI_DEVICE_ID_ARTOP_ATP850UF) { - if(hwif->mate) - hwif->mate->serialized = hwif->serialized = 1; - hwif->speedproc = &aec6210_tune_chipset; - } else - hwif->speedproc = &aec6260_tune_chipset; + if (dev->device == PCI_DEVICE_ID_ARTOP_ATP850UF) + hwif->serialized = hwif->channel; + + if (hwif->mate) + hwif->mate->serialized = hwif->serialized; if (!hwif->dma_base) { - hwif->drives[0].autotune = hwif->drives[1].autotune = 1; + hwif->drives[0].autotune = 1; + hwif->drives[1].autotune = 1; return; } hwif->ultra_mask = hwif->cds->udma_mask; + + /* atp865 and atp865r */ + if (hwif->ultra_mask == 0x3f) { + /* check bit 0x10 of DMA status register */ + if (inb(pci_resource_start(dev, 4) + 2) & 0x10) + hwif->ultra_mask = 0x7f; /* udma0-6 */ + } + hwif->mwdma_mask = 0x07; hwif->ide_dma_check = &aec62xx_config_drive_xfer_rate; - hwif->dma_lost_irq = &aec62xx_dma_lost_irq; + hwif->ide_dma_lostirq = &aec62xx_irq_timeout; + + if (!noautodma) + hwif->autodma = 1; + hwif->drives[0].autodma = hwif->autodma; + hwif->drives[1].autodma = hwif->autodma; +} + +static void __devinit init_dma_aec62xx(ide_hwif_t *hwif, unsigned long dmabase) +{ + struct pci_dev *dev = hwif->pci_dev; if (dev->device == PCI_DEVICE_ID_ARTOP_ATP850UF) { + u8 reg54h = 0; + unsigned long flags; + spin_lock_irqsave(&ide_lock, flags); - pci_read_config_byte (dev, 0x54, ®54); - pci_write_config_byte(dev, 0x54, (reg54 & ~mask)); + pci_read_config_byte(dev, 0x54, ®54h); + pci_write_config_byte(dev, 0x54, reg54h & ~(hwif->channel ? 
0xF0 : 0x0F)); spin_unlock_irqrestore(&ide_lock, flags); - } else if (hwif->cbl != ATA_CBL_PATA40_SHORT) { - u8 ata66 = 0, mask = hwif->channel ? 0x02 : 0x01; - + } else { + u8 ata66 = 0; pci_read_config_byte(hwif->pci_dev, 0x49, &ata66); - - hwif->cbl = (ata66 & mask) ? ATA_CBL_PATA40 : ATA_CBL_PATA80; + if (!(hwif->udma_four)) + hwif->udma_four = (ata66&(hwif->channel?0x02:0x01))?0:1; } - if (!noautodma) - hwif->autodma = 1; - hwif->drives[0].autodma = hwif->drives[1].autodma = hwif->autodma; + ide_setup_dma(hwif, dmabase, 8); } static int __devinit init_setup_aec62xx(struct pci_dev *dev, ide_pci_device_t *d) @@ -254,12 +291,16 @@ static int __devinit init_setup_aec62xx(struct pci_dev *dev, ide_pci_device_t *d static int __devinit init_setup_aec6x80(struct pci_dev *dev, ide_pci_device_t *d) { - unsigned long dma_base = pci_resource_start(dev, 4); - - if (inb(dma_base + 2) & 0x10) { - d->name = (dev->device == PCI_DEVICE_ID_ARTOP_ATP865R) ? - "AEC6880R" : "AEC6880"; - d->udma_mask = 0x7f; /* udma0-6 */ + unsigned long bar4reg = pci_resource_start(dev, 4); + + if (inb(bar4reg+2) & 0x10) { + strcpy(d->name, "AEC6880"); + if (dev->device == PCI_DEVICE_ID_ARTOP_ATP865R) + strcpy(d->name, "AEC6880R"); + } else { + strcpy(d->name, "AEC6280"); + if (dev->device == PCI_DEVICE_ID_ARTOP_ATP865R) + strcpy(d->name, "AEC6280R"); } return ide_setup_pci_device(dev, d); @@ -271,6 +312,7 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = { .init_setup = init_setup_aec62xx, .init_chipset = init_chipset_aec62xx, .init_hwif = init_hwif_aec62xx, + .init_dma = init_dma_aec62xx, .channels = 2, .autodma = AUTODMA, .enablebits = {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}}, @@ -281,6 +323,7 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = { .init_setup = init_setup_aec62xx, .init_chipset = init_chipset_aec62xx, .init_hwif = init_hwif_aec62xx, + .init_dma = init_dma_aec62xx, .channels = 2, .autodma = NOAUTODMA, .bootable = OFF_BOARD, @@ -290,25 +333,28 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = { .init_setup = init_setup_aec62xx, .init_chipset = init_chipset_aec62xx, .init_hwif = init_hwif_aec62xx, + .init_dma = init_dma_aec62xx, .channels = 2, .autodma = AUTODMA, .enablebits = {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}}, .bootable = NEVER_BOARD, .udma_mask = 0x1f, /* udma0-4 */ },{ /* 3 */ - .name = "AEC6280", + .name = "AEC6X80", .init_setup = init_setup_aec6x80, .init_chipset = init_chipset_aec62xx, .init_hwif = init_hwif_aec62xx, + .init_dma = init_dma_aec62xx, .channels = 2, .autodma = AUTODMA, .bootable = OFF_BOARD, .udma_mask = 0x3f, /* udma0-5 */ },{ /* 4 */ - .name = "AEC6280R", + .name = "AEC6X80R", .init_setup = init_setup_aec6x80, .init_chipset = init_chipset_aec62xx, .init_hwif = init_hwif_aec62xx, + .init_dma = init_dma_aec62xx, .channels = 2, .autodma = AUTODMA, .enablebits = {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}}, @@ -324,16 +370,13 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = { * * Called when the PCI registration layer (or the IDE initialization) * finds a device matching our IDE device tables. - * - * NOTE: since we're going to modify the 'name' field for AEC-6[26]80[R] - * chips, pass a local copy of 'struct pci_device_id' down the call chain. 
*/ static int __devinit aec62xx_init_one(struct pci_dev *dev, const struct pci_device_id *id) { - ide_pci_device_t d = aec62xx_chipsets[id->driver_data]; + ide_pci_device_t *d = &aec62xx_chipsets[id->driver_data]; - return d.init_setup(dev, &d); + return d->init_setup(dev, d); } static struct pci_device_id aec62xx_pci_tbl[] = { diff --git a/trunk/drivers/ide/pci/alim15x3.c b/trunk/drivers/ide/pci/alim15x3.c index 8a6b27b3bcc3..27525ec2e19a 100644 --- a/trunk/drivers/ide/pci/alim15x3.c +++ b/trunk/drivers/ide/pci/alim15x3.c @@ -1,5 +1,5 @@ /* - * linux/drivers/ide/pci/alim15x3.c Version 0.25 Jun 9 2007 + * linux/drivers/ide/pci/alim15x3.c Version 0.21 2007/02/03 * * Copyright (C) 1998-2000 Michel Aubry, Maintainer * Copyright (C) 1998-2000 Andrzej Krzysztofowicz, Maintainer @@ -10,7 +10,6 @@ * Copyright (C) 2002 Alan Cox * ALi (now ULi M5228) support by Clear Zhang * Copyright (C) 2007 MontaVista Software, Inc. - * Copyright (C) 2007 Bartlomiej Zolnierkiewicz * * (U)DMA capable version of ali 1533/1543(C), 1535(D) * @@ -37,7 +36,6 @@ #include #include #include -#include #include @@ -585,35 +583,6 @@ static unsigned int __devinit init_chipset_ali15x3 (struct pci_dev *dev, const c return 0; } -/* - * Cable special cases - */ - -static struct dmi_system_id cable_dmi_table[] = { - { - .ident = "HP Pavilion N5430", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), - DMI_MATCH(DMI_BOARD_NAME, "OmniBook N32N-736"), - }, - }, - { } -}; - -static int ali_cable_override(struct pci_dev *pdev) -{ - /* Fujitsu P2000 */ - if (pdev->subsystem_vendor == 0x10CF && - pdev->subsystem_device == 0x10AF) - return 1; - - /* Systems by DMI */ - if (dmi_check_system(cable_dmi_table)) - return 1; - - return 0; -} - /** * ata66_ali15x3 - check for UDMA 66 support * @hwif: IDE interface @@ -625,31 +594,37 @@ static int ali_cable_override(struct pci_dev *pdev) * FIXME: frobs bits that are not defined on newer ALi devicea */ -static u8 __devinit ata66_ali15x3(ide_hwif_t *hwif) +static unsigned int __devinit ata66_ali15x3 (ide_hwif_t *hwif) { struct pci_dev *dev = hwif->pci_dev; + unsigned int ata66 = 0; + u8 cable_80_pin[2] = { 0, 0 }; + unsigned long flags; - u8 cbl = ATA_CBL_PATA40, tmpbyte; + u8 tmpbyte; local_irq_save(flags); if (m5229_revision >= 0xC2) { /* - * m5229 80-pin cable detection (from Host View) - * - * 0x4a bit0 is 0 => primary channel has 80-pin - * 0x4a bit1 is 0 => secondary channel has 80-pin - * - * Certain laptops use short but suitable cables - * and don't implement the detect logic. 
+ * Ultra66 cable detection (from Host View) + * m5229, 0x4a, bit0: primary, bit1: secondary 80 pin */ - if (ali_cable_override(dev)) - cbl = ATA_CBL_PATA40_SHORT; - else { - pci_read_config_byte(dev, 0x4a, &tmpbyte); - if ((tmpbyte & (1 << hwif->channel)) == 0) - cbl = ATA_CBL_PATA80; - } + pci_read_config_byte(dev, 0x4a, &tmpbyte); + /* + * 0x4a, bit0 is 0 => primary channel + * has 80-pin (from host view) + */ + if (!(tmpbyte & 0x01)) cable_80_pin[0] = 1; + /* + * 0x4a, bit1 is 0 => secondary channel + * has 80-pin (from host view) + */ + if (!(tmpbyte & 0x02)) cable_80_pin[1] = 1; + /* + * Allow ata66 if cable of current channel has 80 pins + */ + ata66 = (hwif->channel)?cable_80_pin[1]:cable_80_pin[0]; } else { /* * check m1533, 0x5e, bit 1~4 == 1001 => & 00011110 = 00010010 @@ -682,7 +657,7 @@ static u8 __devinit ata66_ali15x3(ide_hwif_t *hwif) local_irq_restore(flags); - return cbl; + return(ata66); } /** @@ -733,9 +708,8 @@ static void __devinit init_hwif_common_ali15x3 (ide_hwif_t *hwif) hwif->dma_setup = &ali15x3_dma_setup; if (!noautodma) hwif->autodma = 1; - - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = ata66_ali15x3(hwif); + if (!(hwif->udma_four)) + hwif->udma_four = ata66_ali15x3(hwif); } hwif->drives[0].autodma = hwif->autodma; hwif->drives[1].autodma = hwif->autodma; diff --git a/trunk/drivers/ide/pci/amd74xx.c b/trunk/drivers/ide/pci/amd74xx.c index 84ed30cdb324..a2be65fcf89c 100644 --- a/trunk/drivers/ide/pci/amd74xx.c +++ b/trunk/drivers/ide/pci/amd74xx.c @@ -1,11 +1,10 @@ /* - * Version 2.20 + * Version 2.16 * * AMD 755/756/766/8111 and nVidia nForce/2/2s/3/3s/CK804/MCP04 * IDE driver for Linux. * * Copyright (c) 2000-2002 Vojtech Pavlik - * Copyright (c) 2007 Bartlomiej Zolnierkiewicz * * Based on the work of: * Andre Hedrick @@ -38,6 +37,11 @@ #define AMD_ADDRESS_SETUP (0x0c + amd_config->base) #define AMD_UDMA_TIMING (0x10 + amd_config->base) +#define AMD_UDMA 0x07 +#define AMD_UDMA_33 0x01 +#define AMD_UDMA_66 0x02 +#define AMD_UDMA_100 0x03 +#define AMD_UDMA_133 0x04 #define AMD_CHECK_SWDMA 0x08 #define AMD_BAD_SWDMA 0x10 #define AMD_BAD_FIFO 0x20 @@ -49,33 +53,32 @@ static struct amd_ide_chip { unsigned short id; - u8 base; - u8 udma_mask; - u8 flags; + unsigned long base; + unsigned char flags; } amd_ide_chips[] = { - { PCI_DEVICE_ID_AMD_COBRA_7401, 0x40, ATA_UDMA2, AMD_BAD_SWDMA }, - { PCI_DEVICE_ID_AMD_VIPER_7409, 0x40, ATA_UDMA4, AMD_CHECK_SWDMA }, - { PCI_DEVICE_ID_AMD_VIPER_7411, 0x40, ATA_UDMA5, AMD_BAD_FIFO }, - { PCI_DEVICE_ID_AMD_OPUS_7441, 0x40, ATA_UDMA5, }, - { PCI_DEVICE_ID_AMD_8111_IDE, 0x40, ATA_UDMA6, AMD_CHECK_SERENADE }, - { PCI_DEVICE_ID_NVIDIA_NFORCE_IDE, 0x50, ATA_UDMA5, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE2_IDE, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE2S_IDE, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE2S_SATA, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE3_IDE, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE3S_IDE, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE3S_SATA, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE3S_SATA2, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE_CK804_IDE, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP04_IDE, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_IDE, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_IDE, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_IDE, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP65_IDE, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP67_IDE, 0x50, ATA_UDMA6, }, - { 
PCI_DEVICE_ID_NVIDIA_NFORCE_MCP73_IDE, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP77_IDE, 0x50, ATA_UDMA6, }, - { PCI_DEVICE_ID_AMD_CS5536_IDE, 0x40, ATA_UDMA5, }, + { PCI_DEVICE_ID_AMD_COBRA_7401, 0x40, AMD_UDMA_33 | AMD_BAD_SWDMA }, + { PCI_DEVICE_ID_AMD_VIPER_7409, 0x40, AMD_UDMA_66 | AMD_CHECK_SWDMA }, + { PCI_DEVICE_ID_AMD_VIPER_7411, 0x40, AMD_UDMA_100 | AMD_BAD_FIFO }, + { PCI_DEVICE_ID_AMD_OPUS_7441, 0x40, AMD_UDMA_100 }, + { PCI_DEVICE_ID_AMD_8111_IDE, 0x40, AMD_UDMA_133 | AMD_CHECK_SERENADE }, + { PCI_DEVICE_ID_NVIDIA_NFORCE_IDE, 0x50, AMD_UDMA_100 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE2_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE2S_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE2S_SATA, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE3_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE3S_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE3S_SATA, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE3S_SATA2, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE_CK804_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP04_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP65_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP67_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP73_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP77_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_AMD_CS5536_IDE, 0x40, AMD_UDMA_100 }, { 0 } }; @@ -84,7 +87,7 @@ static ide_pci_device_t *amd_chipset; static unsigned int amd_80w; static unsigned int amd_clock; -static char *amd_dma[] = { "16", "25", "33", "44", "66", "100", "133" }; +static char *amd_dma[] = { "MWDMA16", "UDMA33", "UDMA66", "UDMA100", "UDMA133" }; static unsigned char amd_cyc2udma[] = { 6, 6, 5, 4, 0, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 7 }; /* @@ -125,7 +128,7 @@ static int amd74xx_get_info(char *buffer, char **addr, off_t offset, int count) pci_read_config_byte(dev, PCI_REVISION_ID, &t); amd_print("Revision: IDE %#x", t); - amd_print("Highest DMA rate: UDMA%s", amd_dma[fls(amd_config->udma_mask) - 1]); + amd_print("Highest DMA rate: %s", amd_dma[amd_config->flags & AMD_UDMA]); amd_print("BM-DMA base: %#lx", amd_base); amd_print("PCI clock: %d.%dMHz", amd_clock / 1000, amd_clock / 100 % 10); @@ -218,12 +221,12 @@ static void amd_set_speed(struct pci_dev *dev, unsigned char dn, struct ide_timi pci_write_config_byte(dev, AMD_DRIVE_TIMING + (3 - dn), ((FIT(timing->active, 1, 16) - 1) << 4) | (FIT(timing->recover, 1, 16) - 1)); - switch (amd_config->udma_mask) { - case ATA_UDMA2: t = timing->udma ? (0xc0 | (FIT(timing->udma, 2, 5) - 2)) : 0x03; break; - case ATA_UDMA4: t = timing->udma ? (0xc0 | amd_cyc2udma[FIT(timing->udma, 2, 10)]) : 0x03; break; - case ATA_UDMA5: t = timing->udma ? (0xc0 | amd_cyc2udma[FIT(timing->udma, 1, 10)]) : 0x03; break; - case ATA_UDMA6: t = timing->udma ? (0xc0 | amd_cyc2udma[FIT(timing->udma, 1, 15)]) : 0x03; break; - default: return; + switch (amd_config->flags & AMD_UDMA) { + case AMD_UDMA_33: t = timing->udma ? (0xc0 | (FIT(timing->udma, 2, 5) - 2)) : 0x03; break; + case AMD_UDMA_66: t = timing->udma ? (0xc0 | amd_cyc2udma[FIT(timing->udma, 2, 10)]) : 0x03; break; + case AMD_UDMA_100: t = timing->udma ? (0xc0 | amd_cyc2udma[FIT(timing->udma, 1, 10)]) : 0x03; break; + case AMD_UDMA_133: t = timing->udma ? 
(0xc0 | amd_cyc2udma[FIT(timing->udma, 1, 15)]) : 0x03; break; + default: return; } pci_write_config_byte(dev, AMD_UDMA_TIMING + (3 - dn), t); @@ -245,7 +248,7 @@ static int amd_set_drive(ide_drive_t *drive, u8 speed) ide_config_drive_speed(drive, speed); T = 1000000000 / amd_clock; - UT = (amd_config->udma_mask == ATA_UDMA2) ? T : (T / 2); + UT = T / min_t(int, max_t(int, amd_config->flags & AMD_UDMA, 1), 2); ide_timing_compute(drive, speed, &t, T, UT); @@ -274,19 +277,29 @@ static int amd_set_drive(ide_drive_t *drive, u8 speed) static void amd74xx_tune_drive(ide_drive_t *drive, u8 pio) { if (pio == 255) { - amd_set_drive(drive, ide_find_best_pio_mode(drive)); + amd_set_drive(drive, ide_find_best_mode(drive, XFER_PIO | XFER_EPIO)); return; } amd_set_drive(drive, XFER_PIO_0 + min_t(byte, pio, 5)); } +/* + * amd74xx_dmaproc() is a callback from upper layers that can do + * a lot, but we use it for DMA/PIO tuning only, delegating everything + * else to the default ide_dmaproc(). + */ + static int amd74xx_ide_dma_check(ide_drive_t *drive) { - u8 speed = ide_max_dma_mode(drive); + int w80 = HWIF(drive)->udma_four; - if (speed == 0) - speed = ide_find_best_pio_mode(drive); + u8 speed = ide_find_best_mode(drive, + XFER_PIO | XFER_EPIO | XFER_MWDMA | XFER_UDMA | + ((amd_config->flags & AMD_BAD_SWDMA) ? 0 : XFER_SWDMA) | + (w80 && (amd_config->flags & AMD_UDMA) >= AMD_UDMA_66 ? XFER_UDMA_66 : 0) | + (w80 && (amd_config->flags & AMD_UDMA) >= AMD_UDMA_100 ? XFER_UDMA_100 : 0) | + (w80 && (amd_config->flags & AMD_UDMA) >= AMD_UDMA_133 ? XFER_UDMA_133 : 0)); amd_set_drive(drive, speed); @@ -321,10 +334,10 @@ static unsigned int __devinit init_chipset_amd74xx(struct pci_dev *dev, const ch * Check 80-wire cable presence. */ - switch (amd_config->udma_mask) { + switch (amd_config->flags & AMD_UDMA) { - case ATA_UDMA6: - case ATA_UDMA5: + case AMD_UDMA_133: + case AMD_UDMA_100: pci_read_config_byte(dev, AMD_CABLE_DETECT, &t); pci_read_config_dword(dev, AMD_UDMA_TIMING, &u); amd_80w = ((t & 0x3) ? 1 : 0) | ((t & 0xc) ? 2 : 0); @@ -336,7 +349,7 @@ static unsigned int __devinit init_chipset_amd74xx(struct pci_dev *dev, const ch } break; - case ATA_UDMA4: + case AMD_UDMA_66: /* no host side cable detection */ amd_80w = 0x03; break; @@ -357,7 +370,7 @@ static unsigned int __devinit init_chipset_amd74xx(struct pci_dev *dev, const ch if ((amd_config->flags & AMD_CHECK_SERENADE) && dev->subsystem_vendor == PCI_VENDOR_ID_AMD && dev->subsystem_device == PCI_DEVICE_ID_AMD_SERENADE) - amd_config->udma_mask = ATA_UDMA5; + amd_config->flags = AMD_UDMA_100; /* * Determine the system bus clock. 
@@ -382,9 +395,8 @@ static unsigned int __devinit init_chipset_amd74xx(struct pci_dev *dev, const ch */ pci_read_config_byte(dev, PCI_REVISION_ID, &t); - printk(KERN_INFO "%s: %s (rev %02x) UDMA%s controller\n", - amd_chipset->name, pci_name(dev), t, - amd_dma[fls(amd_config->udma_mask) - 1]); + printk(KERN_INFO "%s: %s (rev %02x) %s controller\n", + amd_chipset->name, pci_name(dev), t, amd_dma[amd_config->flags & AMD_UDMA]); /* * Register /proc/ide/amd74xx entry @@ -425,19 +437,12 @@ static void __devinit init_hwif_amd74xx(ide_hwif_t *hwif) return; hwif->atapi_dma = 1; + hwif->ultra_mask = 0x7f; + hwif->mwdma_mask = 0x07; + hwif->swdma_mask = 0x07; - hwif->ultra_mask = amd_config->udma_mask; - hwif->mwdma_mask = 0x07; - if ((amd_config->flags & AMD_BAD_SWDMA) == 0) - hwif->swdma_mask = 0x07; - - if (hwif->cbl != ATA_CBL_PATA40_SHORT) { - if ((amd_80w >> hwif->channel) & 1) - hwif->cbl = ATA_CBL_PATA80; - else - hwif->cbl = ATA_CBL_PATA40; - } - + if (!hwif->udma_four) + hwif->udma_four = (amd_80w >> hwif->channel) & 1; hwif->ide_dma_check = &amd74xx_ide_dma_check; if (!noautodma) hwif->autodma = 1; diff --git a/trunk/drivers/ide/pci/atiixp.c b/trunk/drivers/ide/pci/atiixp.c index 2761510309b3..8ab33faf6f76 100644 --- a/trunk/drivers/ide/pci/atiixp.c +++ b/trunk/drivers/ide/pci/atiixp.c @@ -264,11 +264,10 @@ static void __devinit init_hwif_atiixp(ide_hwif_t *hwif) hwif->swdma_mask = 0x04; pci_read_config_byte(pdev, ATIIXP_IDE_UDMA_MODE + ch, &udma_mode); - if ((udma_mode & 0x07) >= 0x04 || (udma_mode & 0x70) >= 0x40) - hwif->cbl = ATA_CBL_PATA80; + hwif->udma_four = 1; else - hwif->cbl = ATA_CBL_PATA40; + hwif->udma_four = 0; hwif->dma_host_on = &atiixp_dma_host_on; hwif->dma_host_off = &atiixp_dma_host_off; diff --git a/trunk/drivers/ide/pci/cmd64x.c b/trunk/drivers/ide/pci/cmd64x.c index 8631b6c8aa15..7c57dc696f52 100644 --- a/trunk/drivers/ide/pci/cmd64x.c +++ b/trunk/drivers/ide/pci/cmd64x.c @@ -1,5 +1,5 @@ /* - * linux/drivers/ide/pci/cmd64x.c Version 1.50 May 10, 2007 + * linux/drivers/ide/pci/cmd64x.c Version 1.47 Mar 19, 2007 * * cmd64x.c: Enable interrupts at initialization time on Ultra/PCI machines. 
* Due to massive hardware bugs, UltraDMA is only supported @@ -52,6 +52,9 @@ #define ARTTIM23_DIS_RA2 0x04 #define ARTTIM23_DIS_RA3 0x08 #define ARTTIM23_INTR_CH1 0x10 +#define ARTTIM2 0x57 +#define ARTTIM3 0x57 +#define DRWTIM23 0x58 #define DRWTIM2 0x58 #define BRST 0x59 #define DRWTIM3 0x5b @@ -466,43 +469,71 @@ static int cmd646_1_ide_dma_end (ide_drive_t *drive) static unsigned int __devinit init_chipset_cmd64x(struct pci_dev *dev, const char *name) { + u32 class_rev = 0; u8 mrdmode = 0; - if (dev->device == PCI_DEVICE_ID_CMD_646) { - u8 rev = 0; + pci_read_config_dword(dev, PCI_CLASS_REVISION, &class_rev); + class_rev &= 0xff; - pci_read_config_byte(dev, PCI_REVISION_ID, &rev); - - switch (rev) { - case 0x07: - case 0x05: - printk("%s: UltraDMA capable", name); + switch(dev->device) { + case PCI_DEVICE_ID_CMD_643: break; - case 0x03: - default: - printk("%s: MultiWord DMA force limited", name); + case PCI_DEVICE_ID_CMD_646: + printk(KERN_INFO "%s: chipset revision 0x%02X, ", name, class_rev); + switch(class_rev) { + case 0x07: + case 0x05: + printk("UltraDMA Capable"); + break; + case 0x03: + printk("MultiWord DMA Force Limited"); + break; + case 0x01: + default: + printk("MultiWord DMA Limited, IRQ workaround enabled"); + break; + } + printk("\n"); + break; + case PCI_DEVICE_ID_CMD_648: + case PCI_DEVICE_ID_CMD_649: break; - case 0x01: - printk("%s: MultiWord DMA limited, " - "IRQ workaround enabled\n", name); + default: break; - } } /* Set a good latency timer and cache line size value. */ (void) pci_write_config_byte(dev, PCI_LATENCY_TIMER, 64); /* FIXME: pci_set_master() to ensure a good latency timer value */ - /* - * Enable interrupts, select MEMORY READ LINE for reads. - * - * NOTE: although not mentioned in the PCI0646U specs, - * bits 0-1 are write only and won't be read back as - * set or not -- PCI0646U2 specs clarify this point. + /* Setup interrupts. */ + (void) pci_read_config_byte(dev, MRDMODE, &mrdmode); + mrdmode &= ~(0x30); + (void) pci_write_config_byte(dev, MRDMODE, mrdmode); + + /* Use MEMORY READ LINE for reads. + * NOTE: Although not mentioned in the PCI0646U specs, + * these bits are write only and won't be read + * back as set or not. The PCI0646U2 specs clarify + * this point. */ - (void) pci_read_config_byte (dev, MRDMODE, &mrdmode); - mrdmode &= ~0x30; - (void) pci_write_config_byte(dev, MRDMODE, (mrdmode | 0x02)); + (void) pci_write_config_byte(dev, MRDMODE, mrdmode | 0x02); + + /* Set reasonable active/recovery/address-setup values. */ + (void) pci_write_config_byte(dev, ARTTIM0, 0x40); + (void) pci_write_config_byte(dev, DRWTIM0, 0x3f); + (void) pci_write_config_byte(dev, ARTTIM1, 0x40); + (void) pci_write_config_byte(dev, DRWTIM1, 0x3f); +#ifdef __i386__ + (void) pci_write_config_byte(dev, ARTTIM23, 0x1c); +#else + (void) pci_write_config_byte(dev, ARTTIM23, 0x5c); +#endif + (void) pci_write_config_byte(dev, DRWTIM23, 0x3f); + (void) pci_write_config_byte(dev, DRWTIM3, 0x3f); +#ifdef CONFIG_PPC + (void) pci_write_config_byte(dev, UDIDETCR0, 0xf0); +#endif /* CONFIG_PPC */ #if defined(DISPLAY_CMD64X_TIMINGS) && defined(CONFIG_IDE_PROC_FS) @@ -517,27 +548,29 @@ static unsigned int __devinit init_chipset_cmd64x(struct pci_dev *dev, const cha return 0; } -static u8 __devinit ata66_cmd64x(ide_hwif_t *hwif) +static unsigned int __devinit ata66_cmd64x(ide_hwif_t *hwif) { - struct pci_dev *dev = hwif->pci_dev; - u8 bmidecsr = 0, mask = hwif->channel ? 0x02 : 0x01; + u8 ata66 = 0, mask = (hwif->channel) ? 
0x02 : 0x01; - switch (dev->device) { - case PCI_DEVICE_ID_CMD_648: - case PCI_DEVICE_ID_CMD_649: - pci_read_config_byte(dev, BMIDECSR, &bmidecsr); - return (bmidecsr & mask) ? ATA_CBL_PATA80 : ATA_CBL_PATA40; - default: - return ATA_CBL_PATA40; + switch(hwif->pci_dev->device) { + case PCI_DEVICE_ID_CMD_643: + case PCI_DEVICE_ID_CMD_646: + return ata66; + default: + break; } + pci_read_config_byte(hwif->pci_dev, BMIDECSR, &ata66); + return (ata66 & mask) ? 1 : 0; } static void __devinit init_hwif_cmd64x(ide_hwif_t *hwif) { struct pci_dev *dev = hwif->pci_dev; - u8 rev = 0; + unsigned int class_rev; - pci_read_config_byte(dev, PCI_REVISION_ID, &rev); + hwif->autodma = 0; + pci_read_config_dword(dev, PCI_CLASS_REVISION, &class_rev); + class_rev &= 0xff; hwif->tuneproc = &cmd64x_tune_drive; hwif->speedproc = &cmd64x_tune_chipset; @@ -547,8 +580,8 @@ static void __devinit init_hwif_cmd64x(ide_hwif_t *hwif) if (!hwif->dma_base) return; - hwif->atapi_dma = 1; - hwif->mwdma_mask = 0x07; + hwif->atapi_dma = 1; + hwif->ultra_mask = hwif->cds->udma_mask; /* @@ -563,15 +596,16 @@ static void __devinit init_hwif_cmd64x(ide_hwif_t *hwif) * * So we only do UltraDMA on revision 0x05 and 0x07 chipsets. */ - if (dev->device == PCI_DEVICE_ID_CMD_646 && rev < 5) + if (dev->device == PCI_DEVICE_ID_CMD_646 && class_rev < 5) hwif->ultra_mask = 0x00; - hwif->ide_dma_check = &cmd64x_config_drive_for_dma; + hwif->mwdma_mask = 0x07; - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = ata66_cmd64x(hwif); + hwif->ide_dma_check = &cmd64x_config_drive_for_dma; + if (!(hwif->udma_four)) + hwif->udma_four = ata66_cmd64x(hwif); - switch (dev->device) { + switch(dev->device) { case PCI_DEVICE_ID_CMD_648: case PCI_DEVICE_ID_CMD_649: alt_irq_bits: @@ -580,10 +614,10 @@ static void __devinit init_hwif_cmd64x(ide_hwif_t *hwif) break; case PCI_DEVICE_ID_CMD_646: hwif->chipset = ide_cmd646; - if (rev == 0x01) { + if (class_rev == 0x01) { hwif->ide_dma_end = &cmd646_1_ide_dma_end; break; - } else if (rev >= 0x03) + } else if (class_rev >= 0x03) goto alt_irq_bits; /* fall thru */ default: @@ -592,9 +626,11 @@ static void __devinit init_hwif_cmd64x(ide_hwif_t *hwif) break; } + if (!noautodma) hwif->autodma = 1; - hwif->drives[0].autodma = hwif->drives[1].autodma = hwif->autodma; + hwif->drives[0].autodma = hwif->autodma; + hwif->drives[1].autodma = hwif->autodma; } static int __devinit init_setup_cmd64x(struct pci_dev *dev, ide_pci_device_t *d) diff --git a/trunk/drivers/ide/pci/cs5535.c b/trunk/drivers/ide/pci/cs5535.c index 10f61f38243c..41925c47ef05 100644 --- a/trunk/drivers/ide/pci/cs5535.c +++ b/trunk/drivers/ide/pci/cs5535.c @@ -187,8 +187,7 @@ static u8 __devinit cs5535_cable_detect(struct pci_dev *dev) /* if a 80 wire cable was detected */ pci_read_config_byte(dev, CS5535_CABLE_DETECT, &bit); - - return (bit & 1) ? 
ATA_CBL_PATA80 : ATA_CBL_PATA40; + return (bit & 1); } /**** @@ -213,7 +212,8 @@ static void __devinit init_hwif_cs5535(ide_hwif_t *hwif) hwif->ultra_mask = 0x1F; hwif->mwdma_mask = 0x07; - hwif->cbl = cs5535_cable_detect(hwif->pci_dev); + + hwif->udma_four = cs5535_cable_detect(hwif->pci_dev); if (!noautodma) hwif->autodma = 1; diff --git a/trunk/drivers/ide/pci/hpt366.c b/trunk/drivers/ide/pci/hpt366.c index 4b6bae8eee82..c33d0b0f11c9 100644 --- a/trunk/drivers/ide/pci/hpt366.c +++ b/trunk/drivers/ide/pci/hpt366.c @@ -1,5 +1,5 @@ /* - * linux/drivers/ide/pci/hpt366.c Version 1.10 Jun 29, 2007 + * linux/drivers/ide/pci/hpt366.c Version 1.06 Jun 27, 2007 * * Copyright (C) 1999-2003 Andre Hedrick * Portions Copyright (C) 2001 Sun Microsystems, Inc. @@ -77,7 +77,7 @@ * since they may tamper with its fields * - prefix the driver startup messages with the real chip name * - claim the extra 240 bytes of I/O space for all chips - * - optimize the UltraDMA filtering and the drive list lookup code + * - optimize the rate masking/filtering and the drive list lookup code * - use pci_get_slot() to get to the function 1 of HPT36x/374 * - cache offset of the channel's misc. control registers (MCRs) being used * throughout the driver @@ -99,9 +99,9 @@ * stop duplicating it for each channel by storing the pointer in the pci_dev * structure: first, at the init_setup stage, point it to a static "template" * with only the chip type and its specific base DPLL frequency, the highest - * UltraDMA mode, and the chip settings table pointer filled, then, at the - * init_chipset stage, allocate per-chip instance and fill it with the rest - * of the necessary information + * supported DMA mode, and the chip settings table pointer filled, then, at + * the init_chipset stage, allocate per-chip instance and fill it with the + * rest of the necessary information * - get rid of the constant thresholds in the HPT37x PCI clock detection code, * switch to calculating PCI clock frequency based on the chip's base DPLL * frequency @@ -112,7 +112,6 @@ * also fixing the interchanged 25/40 MHz PCI clock cases for HPT36x chips; * unify HPT36x/37x timing setup code and the speedproc handlers by joining * the register setting lists into the table indexed by the clock selected - * - set the correct hwif->ultra_mask for each individual chip * Sergei Shtylyov, or */ @@ -392,7 +391,7 @@ enum ata_clock { struct hpt_info { u8 chip_type; /* Chip type */ - u8 max_ultra; /* Max. UltraDMA mode allowed */ + u8 max_mode; /* Speeds allowed */ u8 dpll_clk; /* DPLL clock in MHz */ u8 pci_clk; /* PCI clock in MHz */ u32 **settings; /* Chipset settings table */ @@ -431,77 +430,77 @@ static u32 *hpt37x_settings[NUM_ATA_CLOCKS] = { static struct hpt_info hpt36x __devinitdata = { .chip_type = HPT36x, - .max_ultra = HPT366_ALLOW_ATA66_3 ? (HPT366_ALLOW_ATA66_4 ? 4 : 3) : 2, + .max_mode = (HPT366_ALLOW_ATA66_4 || HPT366_ALLOW_ATA66_3) ? 2 : 1, .dpll_clk = 0, /* no DPLL */ .settings = hpt36x_settings }; static struct hpt_info hpt370 __devinitdata = { .chip_type = HPT370, - .max_ultra = HPT370_ALLOW_ATA100_5 ? 5 : 4, + .max_mode = HPT370_ALLOW_ATA100_5 ? 3 : 2, .dpll_clk = 48, .settings = hpt37x_settings }; static struct hpt_info hpt370a __devinitdata = { .chip_type = HPT370A, - .max_ultra = HPT370_ALLOW_ATA100_5 ? 5 : 4, + .max_mode = HPT370_ALLOW_ATA100_5 ? 
3 : 2, .dpll_clk = 48, .settings = hpt37x_settings }; static struct hpt_info hpt374 __devinitdata = { .chip_type = HPT374, - .max_ultra = 5, + .max_mode = 3, .dpll_clk = 48, .settings = hpt37x_settings }; static struct hpt_info hpt372 __devinitdata = { .chip_type = HPT372, - .max_ultra = HPT372_ALLOW_ATA133_6 ? 6 : 5, + .max_mode = HPT372_ALLOW_ATA133_6 ? 4 : 3, .dpll_clk = 55, .settings = hpt37x_settings }; static struct hpt_info hpt372a __devinitdata = { .chip_type = HPT372A, - .max_ultra = HPT372_ALLOW_ATA133_6 ? 6 : 5, + .max_mode = HPT372_ALLOW_ATA133_6 ? 4 : 3, .dpll_clk = 66, .settings = hpt37x_settings }; static struct hpt_info hpt302 __devinitdata = { .chip_type = HPT302, - .max_ultra = HPT372_ALLOW_ATA133_6 ? 6 : 5, + .max_mode = HPT302_ALLOW_ATA133_6 ? 4 : 3, .dpll_clk = 66, .settings = hpt37x_settings }; static struct hpt_info hpt371 __devinitdata = { .chip_type = HPT371, - .max_ultra = HPT371_ALLOW_ATA133_6 ? 6 : 5, + .max_mode = HPT371_ALLOW_ATA133_6 ? 4 : 3, .dpll_clk = 66, .settings = hpt37x_settings }; static struct hpt_info hpt372n __devinitdata = { .chip_type = HPT372N, - .max_ultra = HPT372_ALLOW_ATA133_6 ? 6 : 5, + .max_mode = HPT372_ALLOW_ATA133_6 ? 4 : 3, .dpll_clk = 77, .settings = hpt37x_settings }; static struct hpt_info hpt302n __devinitdata = { .chip_type = HPT302N, - .max_ultra = HPT302_ALLOW_ATA133_6 ? 6 : 5, + .max_mode = HPT302_ALLOW_ATA133_6 ? 4 : 3, .dpll_clk = 77, .settings = hpt37x_settings }; static struct hpt_info hpt371n __devinitdata = { .chip_type = HPT371N, - .max_ultra = HPT371_ALLOW_ATA133_6 ? 6 : 5, + .max_mode = HPT371_ALLOW_ATA133_6 ? 4 : 3, .dpll_clk = 77, .settings = hpt37x_settings }; @@ -524,38 +523,53 @@ static int check_in_drive_list(ide_drive_t *drive, const char **list) static u8 hpt3xx_udma_filter(ide_drive_t *drive) { struct hpt_info *info = pci_get_drvdata(HWIF(drive)->pci_dev); + u8 chip_type = info->chip_type; + u8 mode = info->max_mode; u8 mask; - switch (info->chip_type) { - case HPT370A: - if (!HPT370_ALLOW_ATA100_5 || - check_in_drive_list(drive, bad_ata100_5)) - return 0x1f; - else - return 0x3f; - case HPT370: - if (!HPT370_ALLOW_ATA100_5 || - check_in_drive_list(drive, bad_ata100_5)) - mask = 0x1f; - else + switch (mode) { + case 0x04: + mask = 0x7f; + break; + case 0x03: mask = 0x3f; - break; - case HPT36x: - if (!HPT366_ALLOW_ATA66_4 || - check_in_drive_list(drive, bad_ata66_4)) - mask = 0x0f; - else + if (chip_type >= HPT374) + break; + if (!check_in_drive_list(drive, bad_ata100_5)) + goto check_bad_ata33; + /* fall thru */ + case 0x02: mask = 0x1f; - if (!HPT366_ALLOW_ATA66_3 || - check_in_drive_list(drive, bad_ata66_3)) + /* + * CHECK ME, Does this need to be changed to HPT374 ?? + */ + if (chip_type >= HPT370) + goto check_bad_ata33; + if (HPT366_ALLOW_ATA66_4 && + !check_in_drive_list(drive, bad_ata66_4)) + goto check_bad_ata33; + + mask = 0x0f; + if (HPT366_ALLOW_ATA66_3 && + !check_in_drive_list(drive, bad_ata66_3)) + goto check_bad_ata33; + /* fall thru */ + case 0x01: mask = 0x07; - break; - default: - return 0x7f; - } - return check_in_drive_list(drive, bad_ata33) ? 0x00 : mask; + check_bad_ata33: + if (chip_type >= HPT370A) + break; + if (!check_in_drive_list(drive, bad_ata33)) + break; + /* fall thru */ + case 0x00: + default: + mask = 0x00; + break; + } + return mask; } static u32 get_speed_setting(u8 speed, struct hpt_info *info) @@ -723,7 +737,7 @@ static int hpt366_config_drive_xfer_rate(ide_drive_t *drive) * This is specific to the HPT366 UDMA chipset * by HighPoint|Triones Technologies, Inc. 
*/ -static void hpt366_dma_lost_irq(ide_drive_t *drive) +static int hpt366_ide_dma_lostirq(ide_drive_t *drive) { struct pci_dev *dev = HWIF(drive)->pci_dev; u8 mcr1 = 0, mcr3 = 0, scr1 = 0; @@ -735,7 +749,7 @@ static void hpt366_dma_lost_irq(ide_drive_t *drive) drive->name, __FUNCTION__, mcr1, mcr3, scr1); if (scr1 & 0x10) pci_write_config_byte(dev, 0x5a, scr1 & ~0x10); - ide_dma_lost_irq(drive); + return __ide_dma_lostirq(drive); } static void hpt370_clear_engine(ide_drive_t *drive) @@ -785,10 +799,10 @@ static int hpt370_ide_dma_end(ide_drive_t *drive) return __ide_dma_end(drive); } -static void hpt370_dma_timeout(ide_drive_t *drive) +static int hpt370_ide_dma_timeout(ide_drive_t *drive) { hpt370_irq_timeout(drive); - ide_dma_timeout(drive); + return __ide_dma_timeout(drive); } /* returns 1 if DMA IRQ issued, 0 otherwise */ @@ -1136,7 +1150,7 @@ static unsigned int __devinit init_chipset_hpt366(struct pci_dev *dev, const cha * Select 66 MHz DPLL clock only if UltraATA/133 mode is * supported/enabled, use 50 MHz DPLL clock otherwise... */ - if (info->max_ultra == 6) { + if (info->max_mode == 0x04) { dpll_clk = 66; clock = ATA_CLOCK_66MHZ; } else if (dpll_clk) { /* HPT36x chips don't have DPLL */ @@ -1229,7 +1243,7 @@ static void __devinit init_hwif_hpt366(ide_hwif_t *hwif) struct pci_dev *dev = hwif->pci_dev; struct hpt_info *info = pci_get_drvdata(dev); int serialize = HPT_SERIALIZE_IO; - u8 scr1 = 0, ata66 = hwif->channel ? 0x01 : 0x02; + u8 scr1 = 0, ata66 = (hwif->channel) ? 0x01 : 0x02; u8 chip_type = info->chip_type; u8 new_mcr, old_mcr = 0; @@ -1242,9 +1256,7 @@ static void __devinit init_hwif_hpt366(ide_hwif_t *hwif) hwif->intrproc = &hpt3xx_intrproc; hwif->maskproc = &hpt3xx_maskproc; hwif->busproc = &hpt3xx_busproc; - - if (chip_type <= HPT370A) - hwif->udma_filter = &hpt3xx_udma_filter; + hwif->udma_filter = &hpt3xx_udma_filter; /* * HPT3xxN chips have some complications: @@ -1293,7 +1305,7 @@ static void __devinit init_hwif_hpt366(ide_hwif_t *hwif) return; } - hwif->ultra_mask = hwif->cds->udma_mask; + hwif->ultra_mask = 0x7f; hwif->mwdma_mask = 0x07; /* @@ -1330,8 +1342,8 @@ static void __devinit init_hwif_hpt366(ide_hwif_t *hwif) } else pci_read_config_byte (dev, 0x5a, &scr1); - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = (scr1 & ata66) ? ATA_CBL_PATA40 : ATA_CBL_PATA80; + if (!hwif->udma_four) + hwif->udma_four = (scr1 & ata66) ? 0 : 1; hwif->ide_dma_check = &hpt366_config_drive_xfer_rate; @@ -1341,9 +1353,9 @@ static void __devinit init_hwif_hpt366(ide_hwif_t *hwif) } else if (chip_type >= HPT370) { hwif->dma_start = &hpt370_ide_dma_start; hwif->ide_dma_end = &hpt370_ide_dma_end; - hwif->dma_timeout = &hpt370_dma_timeout; + hwif->ide_dma_timeout = &hpt370_ide_dma_timeout; } else - hwif->dma_lost_irq = &hpt366_dma_lost_irq; + hwif->ide_dma_lostirq = &hpt366_ide_dma_lostirq; if (!noautodma) hwif->autodma = 1; @@ -1491,35 +1503,9 @@ static int __devinit init_setup_hpt366(struct pci_dev *dev, ide_pci_device_t *d) pci_read_config_byte(dev, PCI_REVISION_ID, &rev); - switch (rev) { - case 0: - case 1: - case 2: - /* - * HPT36x chips have one channel per function and have - * both channel enable bits located differently and visible - * to both functions -- really stupid design decision... :-( - * Bit 4 is for the primary channel, bit 5 for the secondary. - */ - d->channels = 1; - d->enablebits[0].mask = d->enablebits[0].val = 0x10; - - d->udma_mask = HPT366_ALLOW_ATA66_3 ? - (HPT366_ALLOW_ATA66_4 ? 
0x1f : 0x0f) : 0x07; - break; - case 3: - case 4: - d->udma_mask = HPT370_ALLOW_ATA100_5 ? 0x3f : 0x1f; - break; - default: + if (rev > 6) rev = 6; - /* fall thru */ - case 5: - case 6: - d->udma_mask = HPT372_ALLOW_ATA133_6 ? 0x7f : 0x3f; - break; - } - + d->name = chipset_names[rev]; pci_set_drvdata(dev, info[rev]); @@ -1527,6 +1513,15 @@ static int __devinit init_setup_hpt366(struct pci_dev *dev, ide_pci_device_t *d) if (rev > 2) goto init_single; + /* + * HPT36x chips have one channel per function and have + * both channel enable bits located differently and visible + * to both functions -- really stupid design decision... :-( + * Bit 4 is for the primary channel, bit 5 for the secondary. + */ + d->channels = 1; + d->enablebits[0].mask = d->enablebits[0].val = 0x10; + if ((dev2 = pci_get_slot(dev->bus, dev->devfn + 1)) != NULL) { u8 mcr1 = 0, pin1 = 0, pin2 = 0; int ret; @@ -1578,7 +1573,6 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = { .channels = 2, .autodma = AUTODMA, .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}}, - .udma_mask = HPT372_ALLOW_ATA133_6 ? 0x7f : 0x3f, .bootable = OFF_BOARD, .extra = 240 },{ /* 2 */ @@ -1590,7 +1584,6 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = { .channels = 2, .autodma = AUTODMA, .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}}, - .udma_mask = HPT302_ALLOW_ATA133_6 ? 0x7f : 0x3f, .bootable = OFF_BOARD, .extra = 240 },{ /* 3 */ @@ -1602,7 +1595,6 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = { .channels = 2, .autodma = AUTODMA, .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}}, - .udma_mask = HPT371_ALLOW_ATA133_6 ? 0x7f : 0x3f, .bootable = OFF_BOARD, .extra = 240 },{ /* 4 */ @@ -1614,7 +1606,6 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = { .channels = 2, /* 4 */ .autodma = AUTODMA, .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}}, - .udma_mask = 0x3f, .bootable = OFF_BOARD, .extra = 240 },{ /* 5 */ @@ -1626,7 +1617,6 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = { .channels = 2, /* 4 */ .autodma = AUTODMA, .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}}, - .udma_mask = HPT372_ALLOW_ATA133_6 ? 0x7f : 0x3f, .bootable = OFF_BOARD, .extra = 240 } diff --git a/trunk/drivers/ide/pci/it8213.c b/trunk/drivers/ide/pci/it8213.c index ff48c23e571e..c04a02687b95 100644 --- a/trunk/drivers/ide/pci/it8213.c +++ b/trunk/drivers/ide/pci/it8213.c @@ -231,7 +231,7 @@ static int it8213_config_drive_for_dma (ide_drive_t *drive) static void __devinit init_hwif_it8213(ide_hwif_t *hwif) { - u8 reg42h = 0; + u8 reg42h = 0, ata66 = 0; hwif->speedproc = &it8213_tune_chipset; hwif->tuneproc = &it8213_tuneproc; @@ -250,11 +250,11 @@ static void __devinit init_hwif_it8213(ide_hwif_t *hwif) hwif->swdma_mask = 0x04; pci_read_config_byte(hwif->pci_dev, 0x42, ®42h); + ata66 = (reg42h & 0x02) ? 0 : 1; hwif->ide_dma_check = &it8213_config_drive_for_dma; - - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = (reg42h & 0x02) ? ATA_CBL_PATA40 : ATA_CBL_PATA80; + if (!(hwif->udma_four)) + hwif->udma_four = ata66; /* * The BIOS often doesn't set up DMA on this controller diff --git a/trunk/drivers/ide/pci/it821x.c b/trunk/drivers/ide/pci/it821x.c index 8197b653ba1e..3aeb7f1b7916 100644 --- a/trunk/drivers/ide/pci/it821x.c +++ b/trunk/drivers/ide/pci/it821x.c @@ -491,10 +491,10 @@ static int it821x_config_drive_for_dma (ide_drive_t *drive) * the needed logic onboard. 
*/ -static u8 __devinit ata66_it821x(ide_hwif_t *hwif) +static unsigned int __devinit ata66_it821x(ide_hwif_t *hwif) { /* The reference driver also only does disk side */ - return ATA_CBL_PATA80; + return 1; } /** @@ -662,9 +662,8 @@ static void __devinit init_hwif_it821x(ide_hwif_t *hwif) hwif->mwdma_mask = 0x07; hwif->ide_dma_check = &it821x_config_drive_for_dma; - - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = ata66_it821x(hwif); + if (!(hwif->udma_four)) + hwif->udma_four = ata66_it821x(hwif); /* * The BIOS often doesn't set up DMA on this controller diff --git a/trunk/drivers/ide/pci/jmicron.c b/trunk/drivers/ide/pci/jmicron.c index a6008f63e71e..76ed25147229 100644 --- a/trunk/drivers/ide/pci/jmicron.c +++ b/trunk/drivers/ide/pci/jmicron.c @@ -25,10 +25,10 @@ typedef enum { * ata66_jmicron - Cable check * @hwif: IDE port * - * Returns the cable type. + * Return 1 if the cable is 80pin */ -static u8 __devinit ata66_jmicron(ide_hwif_t *hwif) +static int __devinit ata66_jmicron(ide_hwif_t *hwif) { struct pci_dev *pdev = hwif->pci_dev; @@ -70,17 +70,16 @@ static u8 __devinit ata66_jmicron(ide_hwif_t *hwif) { case PORT_PATA0: if (control & (1 << 3)) /* 40/80 pin primary */ - return ATA_CBL_PATA40; - return ATA_CBL_PATA80; + return 0; + return 1; case PORT_PATA1: if (control5 & (1 << 19)) /* 40/80 pin secondary */ - return ATA_CBL_PATA40; - return ATA_CBL_PATA80; + return 0; + return 1; case PORT_SATA: break; } - /* Avoid bogus "control reaches end of non-void function" */ - return ATA_CBL_PATA80; + return 1; /* Avoid bogus "control reaches end of non-void function" */ } static void jmicron_tuneproc (ide_drive_t *drive, byte mode_wanted) @@ -160,9 +159,8 @@ static void __devinit init_hwif_jmicron(ide_hwif_t *hwif) hwif->mwdma_mask = 0x07; hwif->ide_dma_check = &jmicron_config_drive_for_dma; - - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = ata66_jmicron(hwif); + if (!(hwif->udma_four)) + hwif->udma_four = ata66_jmicron(hwif); hwif->autodma = 1; hwif->drives[0].autodma = hwif->autodma; diff --git a/trunk/drivers/ide/pci/pdc202xx_new.c b/trunk/drivers/ide/pci/pdc202xx_new.c index ee5020df005d..0765dce6948e 100644 --- a/trunk/drivers/ide/pci/pdc202xx_new.c +++ b/trunk/drivers/ide/pci/pdc202xx_new.c @@ -225,10 +225,7 @@ static void pdcnew_tune_drive(ide_drive_t *drive, u8 pio) static u8 pdcnew_cable_detect(ide_hwif_t *hwif) { - if (get_indexed_reg(hwif, 0x0b) & 0x04) - return ATA_CBL_PATA40; - else - return ATA_CBL_PATA80; + return get_indexed_reg(hwif, 0x0b) & 0x04; } static int pdcnew_config_drive_xfer_rate(ide_drive_t *drive) @@ -512,8 +509,8 @@ static void __devinit init_hwif_pdc202new(ide_hwif_t *hwif) hwif->ide_dma_check = &pdcnew_config_drive_xfer_rate; - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = pdcnew_cable_detect(hwif); + if (!hwif->udma_four) + hwif->udma_four = pdcnew_cable_detect(hwif) ? 0 : 1; if (!noautodma) hwif->autodma = 1; diff --git a/trunk/drivers/ide/pci/pdc202xx_old.c b/trunk/drivers/ide/pci/pdc202xx_old.c index 41ac4a94959f..23844687deea 100644 --- a/trunk/drivers/ide/pci/pdc202xx_old.c +++ b/trunk/drivers/ide/pci/pdc202xx_old.c @@ -152,10 +152,8 @@ static void pdc202xx_tune_drive(ide_drive_t *drive, u8 pio) static u8 pdc202xx_old_cable_detect (ide_hwif_t *hwif) { u16 CIS = 0, mask = (hwif->channel) ? (1<<11) : (1<<10); - pci_read_config_word(hwif->pci_dev, 0x50, &CIS); - - return (CIS & mask) ? ATA_CBL_PATA40 : ATA_CBL_PATA80; + return (CIS & mask) ? 
1 : 0; } /* @@ -269,24 +267,18 @@ static int pdc202xx_old_ide_dma_test_irq(ide_drive_t *drive) return (dma_stat & 4) == 4; /* return 1 if INTR asserted */ } -static void pdc202xx_dma_lost_irq(ide_drive_t *drive) +static int pdc202xx_ide_dma_lostirq(ide_drive_t *drive) { - ide_hwif_t *hwif = HWIF(drive); - - if (hwif->resetproc != NULL) - hwif->resetproc(drive); - - ide_dma_lost_irq(drive); + if (HWIF(drive)->resetproc != NULL) + HWIF(drive)->resetproc(drive); + return __ide_dma_lostirq(drive); } -static void pdc202xx_dma_timeout(ide_drive_t *drive) +static int pdc202xx_ide_dma_timeout(ide_drive_t *drive) { - ide_hwif_t *hwif = HWIF(drive); - - if (hwif->resetproc != NULL) - hwif->resetproc(drive); - - ide_dma_timeout(drive); + if (HWIF(drive)->resetproc != NULL) + HWIF(drive)->resetproc(drive); + return __ide_dma_timeout(drive); } static void pdc202xx_reset_host (ide_hwif_t *hwif) @@ -355,13 +347,12 @@ static void __devinit init_hwif_pdc202xx(ide_hwif_t *hwif) hwif->err_stops_fifo = 1; hwif->ide_dma_check = &pdc202xx_config_drive_xfer_rate; - hwif->dma_lost_irq = &pdc202xx_dma_lost_irq; - hwif->dma_timeout = &pdc202xx_dma_timeout; + hwif->ide_dma_lostirq = &pdc202xx_ide_dma_lostirq; + hwif->ide_dma_timeout = &pdc202xx_ide_dma_timeout; if (hwif->pci_dev->device != PCI_DEVICE_ID_PROMISE_20246) { - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = pdc202xx_old_cable_detect(hwif); - + if (!(hwif->udma_four)) + hwif->udma_four = (pdc202xx_old_cable_detect(hwif)) ? 0 : 1; hwif->dma_start = &pdc202xx_old_ide_dma_start; hwif->ide_dma_end = &pdc202xx_old_ide_dma_end; } diff --git a/trunk/drivers/ide/pci/piix.c b/trunk/drivers/ide/pci/piix.c index 2e0b29ef596a..8b219dd63024 100644 --- a/trunk/drivers/ide/pci/piix.c +++ b/trunk/drivers/ide/pci/piix.c @@ -1,5 +1,5 @@ /* - * linux/drivers/ide/pci/piix.c Version 0.50 Jun 10, 2007 + * linux/drivers/ide/pci/piix.c Version 0.47 February 8, 2007 * * Copyright (C) 1998-1999 Andrzej Krzysztofowicz, Author and Maintainer * Copyright (C) 1998-2000 Andre Hedrick @@ -394,45 +394,14 @@ static void piix_dma_clear_irq(ide_drive_t *drive) hwif->OUTB(dma_stat, hwif->dma_status); } -struct ich_laptop { - u16 device; - u16 subvendor; - u16 subdevice; -}; - -/* - * List of laptops that use short cables rather than 80 wire - */ - -static const struct ich_laptop ich_laptop[] = { - /* devid, subvendor, subdev */ - { 0x27DF, 0x0005, 0x0280 }, /* ICH7 on Acer 5602WLMi */ - { 0x27DF, 0x1025, 0x0110 }, /* ICH7 on Acer 3682WLMi */ - { 0x27DF, 0x1043, 0x1267 }, /* ICH7 on Asus W5F */ - { 0x24CA, 0x1025, 0x0061 }, /* ICH4 on Acer Aspire 2023WLMi */ - /* end marker */ - { 0, } -}; - -static u8 __devinit piix_cable_detect(ide_hwif_t *hwif) +static int __devinit piix_cable_detect(ide_hwif_t *hwif) { - struct pci_dev *pdev = hwif->pci_dev; - const struct ich_laptop *lap = &ich_laptop[0]; + struct pci_dev *dev = hwif->pci_dev; u8 reg54h = 0, mask = hwif->channel ? 0xc0 : 0x30; - /* check for specials */ - while (lap->device) { - if (lap->device == pdev->device && - lap->subvendor == pdev->subsystem_vendor && - lap->subdevice == pdev->subsystem_device) { - return ATA_CBL_PATA40_SHORT; - } - lap++; - } - - pci_read_config_byte(pdev, 0x54, ®54h); + pci_read_config_byte(dev, 0x54, ®54h); - return (reg54h & mask) ? ATA_CBL_PATA80 : ATA_CBL_PATA40; + return (reg54h & mask) ? 
1 : 0; } /** @@ -475,8 +444,8 @@ static void __devinit init_hwif_piix(ide_hwif_t *hwif) hwif->swdma_mask = 0x04; if (hwif->ultra_mask & 0x78) { - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = piix_cable_detect(hwif); + if (!hwif->udma_four) + hwif->udma_four = piix_cable_detect(hwif); } if (no_piix_dma) diff --git a/trunk/drivers/ide/pci/scc_pata.c b/trunk/drivers/ide/pci/scc_pata.c index 7b87488e3daa..55bc0a32e34f 100644 --- a/trunk/drivers/ide/pci/scc_pata.c +++ b/trunk/drivers/ide/pci/scc_pata.c @@ -716,7 +716,7 @@ static void __devinit init_hwif_scc(ide_hwif_t *hwif) hwif->atapi_dma = 1; /* we support 80c cable only. */ - hwif->cbl = ATA_CBL_PATA80; + hwif->udma_four = 1; hwif->autodma = 0; if (!noautodma) diff --git a/trunk/drivers/ide/pci/serverworks.c b/trunk/drivers/ide/pci/serverworks.c index 1371b5bf6bf0..d9c4fd1ae996 100644 --- a/trunk/drivers/ide/pci/serverworks.c +++ b/trunk/drivers/ide/pci/serverworks.c @@ -1,5 +1,5 @@ /* - * linux/drivers/ide/pci/serverworks.c Version 0.20 Jun 3 2007 + * linux/drivers/ide/pci/serverworks.c Version 0.11 Jun 2 2007 * * Copyright (C) 1998-2000 Michel Aubry * Copyright (C) 1998-2000 Andrzej Krzysztofowicz @@ -151,11 +151,84 @@ static int svwks_tune_chipset (ide_drive_t *drive, u8 xferspeed) if(dev->device == PCI_DEVICE_ID_SERVERWORKS_OSB4 && drive->media == ide_disk && speed >= XFER_UDMA_0) BUG(); - + + pci_read_config_byte(dev, drive_pci[drive->dn], &pio_timing); + pci_read_config_byte(dev, drive_pci2[drive->dn], &dma_timing); pci_read_config_byte(dev, (0x56|hwif->channel), &ultra_timing); pci_read_config_word(dev, 0x4A, &csb5_pio); pci_read_config_byte(dev, 0x54, &ultra_enable); + /* If we are in RAID mode (eg AMI MegaIDE) then we can't it + turns out trust the firmware configuration */ + + if ((dev->class >> 8) != PCI_CLASS_STORAGE_IDE) + goto oem_setup_failed; + + /* Per Specified Design by OEM, and ASIC Architect */ + if ((dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE) || + (dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE2)) { + if (!drive->init_speed) { + u8 dma_stat = inb(hwif->dma_status); + + if (((ultra_enable << (7-drive->dn) & 0x80) == 0x80) && + ((dma_stat & (1<<(5+unit))) == (1<<(5+unit)))) { + drive->current_speed = drive->init_speed = XFER_UDMA_0 + udma_modes[(ultra_timing >> (4*unit)) & ~(0xF0)]; + return 0; + } else if ((dma_timing) && + ((dma_stat&(1<<(5+unit)))==(1<<(5+unit)))) { + u8 dmaspeed; + + switch (dma_timing & 0x77) { + case 0x20: + dmaspeed = XFER_MW_DMA_2; + break; + case 0x21: + dmaspeed = XFER_MW_DMA_1; + break; + case 0x77: + dmaspeed = XFER_MW_DMA_0; + break; + default: + goto dma_pio; + } + + drive->current_speed = drive->init_speed = dmaspeed; + return 0; + } +dma_pio: + if (pio_timing) { + u8 piospeed; + + switch (pio_timing & 0x7f) { + case 0x20: + piospeed = XFER_PIO_4; + break; + case 0x22: + piospeed = XFER_PIO_3; + break; + case 0x34: + piospeed = XFER_PIO_2; + break; + case 0x47: + piospeed = XFER_PIO_1; + break; + case 0x5d: + piospeed = XFER_PIO_0; + break; + default: + goto oem_setup_failed; + } + + drive->current_speed = drive->init_speed = piospeed; + return 0; + } + } + } + +oem_setup_failed: + + pio_timing = 0; + dma_timing = 0; ultra_timing &= ~(0x0F << (4*unit)); ultra_enable &= ~(0x01 << drive->dn); csb5_pio &= ~(0x0F << (4*drive->dn)); @@ -329,9 +402,9 @@ static unsigned int __devinit init_chipset_svwks (struct pci_dev *dev, const cha return dev->irq; } -static u8 __devinit ata66_svwks_svwks(ide_hwif_t *hwif) +static unsigned int __devinit ata66_svwks_svwks (ide_hwif_t *hwif) { - 
return ATA_CBL_PATA80; + return 1; } /* On Dell PowerEdge servers with a CSB5/CSB6, the top two bits @@ -341,7 +414,7 @@ static u8 __devinit ata66_svwks_svwks(ide_hwif_t *hwif) * Bit 14 clear = primary IDE channel does not have 80-pin cable. * Bit 14 set = primary IDE channel has 80-pin cable. */ -static u8 __devinit ata66_svwks_dell(ide_hwif_t *hwif) +static unsigned int __devinit ata66_svwks_dell (ide_hwif_t *hwif) { struct pci_dev *dev = hwif->pci_dev; if (dev->subsystem_vendor == PCI_VENDOR_ID_DELL && @@ -349,8 +422,8 @@ static u8 __devinit ata66_svwks_dell(ide_hwif_t *hwif) (dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB5IDE || dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE)) return ((1 << (hwif->channel + 14)) & - dev->subsystem_device) ? ATA_CBL_PATA80 : ATA_CBL_PATA40; - return ATA_CBL_PATA40; + dev->subsystem_device) ? 1 : 0; + return 0; } /* Sun Cobalt Alpine hardware avoids the 80-pin cable @@ -359,18 +432,18 @@ static u8 __devinit ata66_svwks_dell(ide_hwif_t *hwif) * * WARNING: this only works on Alpine hardware! */ -static u8 __devinit ata66_svwks_cobalt(ide_hwif_t *hwif) +static unsigned int __devinit ata66_svwks_cobalt (ide_hwif_t *hwif) { struct pci_dev *dev = hwif->pci_dev; if (dev->subsystem_vendor == PCI_VENDOR_ID_SUN && dev->vendor == PCI_VENDOR_ID_SERVERWORKS && dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB5IDE) return ((1 << (hwif->channel + 14)) & - dev->subsystem_device) ? ATA_CBL_PATA80 : ATA_CBL_PATA40; - return ATA_CBL_PATA40; + dev->subsystem_device) ? 1 : 0; + return 0; } -static u8 __devinit ata66_svwks(ide_hwif_t *hwif) +static unsigned int __devinit ata66_svwks (ide_hwif_t *hwif) { struct pci_dev *dev = hwif->pci_dev; @@ -389,9 +462,9 @@ static u8 __devinit ata66_svwks(ide_hwif_t *hwif) /* Per Specified Design by OEM, and ASIC Architect */ if ((dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE) || (dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE2)) - return ATA_CBL_PATA80; + return 1; - return ATA_CBL_PATA40; + return 0; } static void __devinit init_hwif_svwks (ide_hwif_t *hwif) @@ -422,8 +495,8 @@ static void __devinit init_hwif_svwks (ide_hwif_t *hwif) hwif->ide_dma_check = &svwks_config_drive_xfer_rate; if (hwif->pci_dev->device != PCI_DEVICE_ID_SERVERWORKS_OSB4IDE) { - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = ata66_svwks(hwif); + if (!hwif->udma_four) + hwif->udma_four = ata66_svwks(hwif); } if (!noautodma) hwif->autodma = 1; diff --git a/trunk/drivers/ide/pci/sgiioc4.c b/trunk/drivers/ide/pci/sgiioc4.c index d396b2929ed8..d3185e29a38e 100644 --- a/trunk/drivers/ide/pci/sgiioc4.c +++ b/trunk/drivers/ide/pci/sgiioc4.c @@ -316,19 +316,19 @@ static void sgiioc4_dma_host_off(ide_drive_t * drive) sgiioc4_clearirq(drive); } -static void -sgiioc4_resetproc(ide_drive_t * drive) +static int +sgiioc4_ide_dma_lostirq(ide_drive_t * drive) { - sgiioc4_ide_dma_end(drive); - sgiioc4_clearirq(drive); + HWIF(drive)->resetproc(drive); + + return __ide_dma_lostirq(drive); } static void -sgiioc4_dma_lost_irq(ide_drive_t * drive) +sgiioc4_resetproc(ide_drive_t * drive) { - sgiioc4_resetproc(drive); - - ide_dma_lost_irq(drive); + sgiioc4_ide_dma_end(drive); + sgiioc4_clearirq(drive); } static u8 @@ -607,8 +607,8 @@ ide_init_sgiioc4(ide_hwif_t * hwif) hwif->ide_dma_test_irq = &sgiioc4_ide_dma_test_irq; hwif->dma_host_on = &sgiioc4_dma_host_on; hwif->dma_host_off = &sgiioc4_dma_host_off; - hwif->dma_lost_irq = &sgiioc4_dma_lost_irq; - hwif->dma_timeout = &ide_dma_timeout; + hwif->ide_dma_lostirq = &sgiioc4_ide_dma_lostirq; + hwif->ide_dma_timeout = 
&__ide_dma_timeout; hwif->INB = &sgiioc4_INB; } diff --git a/trunk/drivers/ide/pci/siimage.c b/trunk/drivers/ide/pci/siimage.c index 1c3e35487893..1a4444e7226a 100644 --- a/trunk/drivers/ide/pci/siimage.c +++ b/trunk/drivers/ide/pci/siimage.c @@ -933,17 +933,16 @@ static void __devinit init_iops_siimage(ide_hwif_t *hwif) * interface. */ -static u8 __devinit ata66_siimage(ide_hwif_t *hwif) +static unsigned int __devinit ata66_siimage(ide_hwif_t *hwif) { unsigned long addr = siimage_selreg(hwif, 0); - u8 ata66 = 0; - - if (pci_get_drvdata(hwif->pci_dev) == NULL) + if (pci_get_drvdata(hwif->pci_dev) == NULL) { + u8 ata66 = 0; pci_read_config_byte(hwif->pci_dev, addr, &ata66); - else - ata66 = hwif->INB(addr); + return (ata66 & 0x01) ? 1 : 0; + } - return (ata66 & 0x01) ? ATA_CBL_PATA80 : ATA_CBL_PATA40; + return (hwif->INB(addr) & 0x01) ? 1 : 0; } /** @@ -989,9 +988,8 @@ static void __devinit init_hwif_siimage(ide_hwif_t *hwif) hwif->atapi_dma = 1; hwif->ide_dma_check = &siimage_config_drive_for_dma; - - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = ata66_siimage(hwif); + if (!(hwif->udma_four)) + hwif->udma_four = ata66_siimage(hwif); if (hwif->mmio) { hwif->ide_dma_test_irq = &siimage_mmio_ide_dma_test_irq; diff --git a/trunk/drivers/ide/pci/sis5513.c b/trunk/drivers/ide/pci/sis5513.c index f875183ac8d9..ec0adad9ef61 100644 --- a/trunk/drivers/ide/pci/sis5513.c +++ b/trunk/drivers/ide/pci/sis5513.c @@ -1,5 +1,5 @@ /* - * linux/drivers/ide/pci/sis5513.c Version 0.25 Jun 10, 2007 + * linux/drivers/ide/pci/sis5513.c Version 0.20 Mar 4, 2007 * * Copyright (C) 1999-2000 Andre Hedrick * Copyright (C) 2002 Lionel Bouton , Maintainer @@ -796,33 +796,10 @@ static unsigned int __devinit init_chipset_sis5513 (struct pci_dev *dev, const c return 0; } -struct sis_laptop { - u16 device; - u16 subvendor; - u16 subdevice; -}; - -static const struct sis_laptop sis_laptop[] = { - /* devid, subvendor, subdev */ - { 0x5513, 0x1043, 0x1107 }, /* ASUS A6K */ - /* end marker */ - { 0, } -}; - -static u8 __devinit ata66_sis5513(ide_hwif_t *hwif) +static unsigned int __devinit ata66_sis5513 (ide_hwif_t *hwif) { - struct pci_dev *pdev = hwif->pci_dev; - const struct sis_laptop *lap = &sis_laptop[0]; u8 ata66 = 0; - while (lap->device) { - if (lap->device == pdev->device && - lap->subvendor == pdev->subsystem_vendor && - lap->subdevice == pdev->subsystem_device) - return ATA_CBL_PATA40_SHORT; - lap++; - } - if (chipset_family >= ATA_133) { u16 regw = 0; u16 reg_addr = hwif->channel ? 0x52: 0x50; @@ -834,8 +811,7 @@ static u8 __devinit ata66_sis5513(ide_hwif_t *hwif) pci_read_config_byte(hwif->pci_dev, 0x48, ®48h); ata66 = (reg48h & mask) ? 0 : 1; } - - return ata66 ? ATA_CBL_PATA80 : ATA_CBL_PATA40; + return ata66; } static void __devinit init_hwif_sis5513 (ide_hwif_t *hwif) @@ -865,8 +841,8 @@ static void __devinit init_hwif_sis5513 (ide_hwif_t *hwif) if (!chipset_family) return; - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = ata66_sis5513(hwif); + if (!(hwif->udma_four)) + hwif->udma_four = ata66_sis5513(hwif); if (chipset_family > ATA_16) { hwif->ide_dma_check = &sis5513_config_xfer_rate; diff --git a/trunk/drivers/ide/pci/sl82c105.c b/trunk/drivers/ide/pci/sl82c105.c index 487879842af4..7c383d9cc472 100644 --- a/trunk/drivers/ide/pci/sl82c105.c +++ b/trunk/drivers/ide/pci/sl82c105.c @@ -195,7 +195,7 @@ static inline void sl82c105_reset_host(struct pci_dev *dev) * This function is called when the IDE timer expires, the drive * indicates that it is READY, and we were waiting for DMA to complete. 
*/ -static void sl82c105_dma_lost_irq(ide_drive_t *drive) +static int sl82c105_ide_dma_lostirq(ide_drive_t *drive) { ide_hwif_t *hwif = HWIF(drive); struct pci_dev *dev = hwif->pci_dev; @@ -222,6 +222,9 @@ static void sl82c105_dma_lost_irq(ide_drive_t *drive) } sl82c105_reset_host(dev); + + /* __ide_dma_lostirq would return 1, so we do as well */ + return 1; } /* @@ -241,12 +244,15 @@ static void sl82c105_dma_start(ide_drive_t *drive) ide_dma_start(drive); } -static void sl82c105_dma_timeout(ide_drive_t *drive) +static int sl82c105_ide_dma_timeout(ide_drive_t *drive) { - DBG(("sl82c105_dma_timeout(drive:%s)\n", drive->name)); + ide_hwif_t *hwif = HWIF(drive); + struct pci_dev *dev = hwif->pci_dev; - sl82c105_reset_host(HWIF(drive)->pci_dev); - ide_dma_timeout(drive); + DBG(("sl82c105_ide_dma_timeout(drive:%s)\n", drive->name)); + + sl82c105_reset_host(dev); + return __ide_dma_timeout(drive); } static int sl82c105_ide_dma_on(ide_drive_t *drive) @@ -435,9 +441,9 @@ static void __devinit init_hwif_sl82c105(ide_hwif_t *hwif) hwif->ide_dma_check = &sl82c105_ide_dma_check; hwif->ide_dma_on = &sl82c105_ide_dma_on; hwif->dma_off_quietly = &sl82c105_dma_off_quietly; - hwif->dma_lost_irq = &sl82c105_dma_lost_irq; + hwif->ide_dma_lostirq = &sl82c105_ide_dma_lostirq; hwif->dma_start = &sl82c105_dma_start; - hwif->dma_timeout = &sl82c105_dma_timeout; + hwif->ide_dma_timeout = &sl82c105_ide_dma_timeout; if (!noautodma) hwif->autodma = 1; diff --git a/trunk/drivers/ide/pci/slc90e66.c b/trunk/drivers/ide/pci/slc90e66.c index 575dbbd8b482..c40f291f91e0 100644 --- a/trunk/drivers/ide/pci/slc90e66.c +++ b/trunk/drivers/ide/pci/slc90e66.c @@ -199,9 +199,10 @@ static void __devinit init_hwif_slc90e66 (ide_hwif_t *hwif) hwif->mwdma_mask = 0x06; hwif->swdma_mask = 0x04; - if (hwif->cbl != ATA_CBL_PATA40_SHORT) + if (!hwif->udma_four) { /* bit[0(1)]: 0:80, 1:40 */ - hwif->cbl = (reg47 & mask) ? ATA_CBL_PATA40 : ATA_CBL_PATA80; + hwif->udma_four = (reg47 & mask) ? 0 : 1; + } hwif->ide_dma_check = &slc90e66_config_drive_xfer_rate; diff --git a/trunk/drivers/ide/pci/tc86c001.c b/trunk/drivers/ide/pci/tc86c001.c index 8de1f8e22494..cee619bb2eaf 100644 --- a/trunk/drivers/ide/pci/tc86c001.c +++ b/trunk/drivers/ide/pci/tc86c001.c @@ -220,13 +220,13 @@ static void __devinit init_hwif_tc86c001(ide_hwif_t *hwif) hwif->ide_dma_check = &tc86c001_config_drive_xfer_rate; hwif->dma_start = &tc86c001_dma_start; - if (hwif->cbl != ATA_CBL_PATA40_SHORT) { + if (!hwif->udma_four) { /* * System Control 1 Register bit 13 (PDIAGN): * 0=80-pin cable, 1=40-pin cable */ scr1 = hwif->INW(sc_base + 0x00); - hwif->cbl = (scr1 & 0x2000) ? ATA_CBL_PATA40 : ATA_CBL_PATA80; + hwif->udma_four = (scr1 & 0x2000) ? 0 : 1; } if (!noautodma) diff --git a/trunk/drivers/ide/pci/via82cxxx.c b/trunk/drivers/ide/pci/via82cxxx.c index d21dd2e7eeb3..a508550c4095 100644 --- a/trunk/drivers/ide/pci/via82cxxx.c +++ b/trunk/drivers/ide/pci/via82cxxx.c @@ -1,6 +1,6 @@ /* * - * Version 3.45 + * Version 3.38 * * VIA IDE driver for Linux. 
Supported southbridges: * @@ -9,7 +9,6 @@ * vt8235, vt8237, vt8237a * * Copyright (c) 2000-2002 Vojtech Pavlik - * Copyright (c) 2007 Bartlomiej Zolnierkiewicz * * Based on the work of: * Michel Aubry @@ -34,8 +33,6 @@ #include #include #include -#include - #include #ifdef CONFIG_PPC_CHRP @@ -44,6 +41,8 @@ #include "ide-timing.h" +#define DISPLAY_VIA_TIMINGS + #define VIA_IDE_ENABLE 0x40 #define VIA_IDE_CONFIG 0x41 #define VIA_FIFO_CONFIG 0x43 @@ -55,12 +54,18 @@ #define VIA_ADDRESS_SETUP 0x4c #define VIA_UDMA_TIMING 0x50 -#define VIA_BAD_PREQ 0x01 /* Crashes if PREQ# till DDACK# set */ -#define VIA_BAD_CLK66 0x02 /* 66 MHz clock doesn't work correctly */ -#define VIA_SET_FIFO 0x04 /* Needs to have FIFO split set */ -#define VIA_NO_UNMASK 0x08 /* Doesn't work with IRQ unmasking on */ -#define VIA_BAD_ID 0x10 /* Has wrong vendor ID (0x1107) */ -#define VIA_BAD_AST 0x20 /* Don't touch Address Setup Timing */ +#define VIA_UDMA 0x007 +#define VIA_UDMA_NONE 0x000 +#define VIA_UDMA_33 0x001 +#define VIA_UDMA_66 0x002 +#define VIA_UDMA_100 0x003 +#define VIA_UDMA_133 0x004 +#define VIA_BAD_PREQ 0x010 /* Crashes if PREQ# till DDACK# set */ +#define VIA_BAD_CLK66 0x020 /* 66 MHz clock doesn't work correctly */ +#define VIA_SET_FIFO 0x040 /* Needs to have FIFO split set */ +#define VIA_NO_UNMASK 0x080 /* Doesn't work with IRQ unmasking on */ +#define VIA_BAD_ID 0x100 /* Has wrong vendor ID (0x1107) */ +#define VIA_BAD_AST 0x200 /* Don't touch Address Setup Timing */ /* * VIA SouthBridge chips. @@ -71,37 +76,36 @@ static struct via_isa_bridge { u16 id; u8 rev_min; u8 rev_max; - u8 udma_mask; - u8 flags; + u16 flags; } via_isa_bridges[] = { - { "cx700", PCI_DEVICE_ID_VIA_CX700, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, - { "vt8237s", PCI_DEVICE_ID_VIA_8237S, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, - { "vt6410", PCI_DEVICE_ID_VIA_6410, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, - { "vt8251", PCI_DEVICE_ID_VIA_8251, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, - { "vt8237", PCI_DEVICE_ID_VIA_8237, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, - { "vt8237a", PCI_DEVICE_ID_VIA_8237A, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, - { "vt8235", PCI_DEVICE_ID_VIA_8235, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, - { "vt8233a", PCI_DEVICE_ID_VIA_8233A, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, - { "vt8233c", PCI_DEVICE_ID_VIA_8233C_0, 0x00, 0x2f, ATA_UDMA5, }, - { "vt8233", PCI_DEVICE_ID_VIA_8233_0, 0x00, 0x2f, ATA_UDMA5, }, - { "vt8231", PCI_DEVICE_ID_VIA_8231, 0x00, 0x2f, ATA_UDMA5, }, - { "vt82c686b", PCI_DEVICE_ID_VIA_82C686, 0x40, 0x4f, ATA_UDMA5, }, - { "vt82c686a", PCI_DEVICE_ID_VIA_82C686, 0x10, 0x2f, ATA_UDMA4, }, - { "vt82c686", PCI_DEVICE_ID_VIA_82C686, 0x00, 0x0f, ATA_UDMA2, VIA_BAD_CLK66 }, - { "vt82c596b", PCI_DEVICE_ID_VIA_82C596, 0x10, 0x2f, ATA_UDMA4, }, - { "vt82c596a", PCI_DEVICE_ID_VIA_82C596, 0x00, 0x0f, ATA_UDMA2, VIA_BAD_CLK66 }, - { "vt82c586b", PCI_DEVICE_ID_VIA_82C586_0, 0x47, 0x4f, ATA_UDMA2, VIA_SET_FIFO }, - { "vt82c586b", PCI_DEVICE_ID_VIA_82C586_0, 0x40, 0x46, ATA_UDMA2, VIA_SET_FIFO | VIA_BAD_PREQ }, - { "vt82c586b", PCI_DEVICE_ID_VIA_82C586_0, 0x30, 0x3f, ATA_UDMA2, VIA_SET_FIFO }, - { "vt82c586a", PCI_DEVICE_ID_VIA_82C586_0, 0x20, 0x2f, ATA_UDMA2, VIA_SET_FIFO }, - { "vt82c586", PCI_DEVICE_ID_VIA_82C586_0, 0x00, 0x0f, 0x00, VIA_SET_FIFO }, - { "vt82c576", PCI_DEVICE_ID_VIA_82C576, 0x00, 0x2f, 0x00, VIA_SET_FIFO | VIA_NO_UNMASK }, - { "vt82c576", PCI_DEVICE_ID_VIA_82C576, 0x00, 0x2f, 0x00, VIA_SET_FIFO | VIA_NO_UNMASK | VIA_BAD_ID }, + { "cx700", PCI_DEVICE_ID_VIA_CX700, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, + { 
"vt8237s", PCI_DEVICE_ID_VIA_8237S, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, + { "vt6410", PCI_DEVICE_ID_VIA_6410, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, + { "vt8251", PCI_DEVICE_ID_VIA_8251, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, + { "vt8237", PCI_DEVICE_ID_VIA_8237, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, + { "vt8237a", PCI_DEVICE_ID_VIA_8237A, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, + { "vt8235", PCI_DEVICE_ID_VIA_8235, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, + { "vt8233a", PCI_DEVICE_ID_VIA_8233A, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, + { "vt8233c", PCI_DEVICE_ID_VIA_8233C_0, 0x00, 0x2f, VIA_UDMA_100 }, + { "vt8233", PCI_DEVICE_ID_VIA_8233_0, 0x00, 0x2f, VIA_UDMA_100 }, + { "vt8231", PCI_DEVICE_ID_VIA_8231, 0x00, 0x2f, VIA_UDMA_100 }, + { "vt82c686b", PCI_DEVICE_ID_VIA_82C686, 0x40, 0x4f, VIA_UDMA_100 }, + { "vt82c686a", PCI_DEVICE_ID_VIA_82C686, 0x10, 0x2f, VIA_UDMA_66 }, + { "vt82c686", PCI_DEVICE_ID_VIA_82C686, 0x00, 0x0f, VIA_UDMA_33 | VIA_BAD_CLK66 }, + { "vt82c596b", PCI_DEVICE_ID_VIA_82C596, 0x10, 0x2f, VIA_UDMA_66 }, + { "vt82c596a", PCI_DEVICE_ID_VIA_82C596, 0x00, 0x0f, VIA_UDMA_33 | VIA_BAD_CLK66 }, + { "vt82c586b", PCI_DEVICE_ID_VIA_82C586_0, 0x47, 0x4f, VIA_UDMA_33 | VIA_SET_FIFO }, + { "vt82c586b", PCI_DEVICE_ID_VIA_82C586_0, 0x40, 0x46, VIA_UDMA_33 | VIA_SET_FIFO | VIA_BAD_PREQ }, + { "vt82c586b", PCI_DEVICE_ID_VIA_82C586_0, 0x30, 0x3f, VIA_UDMA_33 | VIA_SET_FIFO }, + { "vt82c586a", PCI_DEVICE_ID_VIA_82C586_0, 0x20, 0x2f, VIA_UDMA_33 | VIA_SET_FIFO }, + { "vt82c586", PCI_DEVICE_ID_VIA_82C586_0, 0x00, 0x0f, VIA_UDMA_NONE | VIA_SET_FIFO }, + { "vt82c576", PCI_DEVICE_ID_VIA_82C576, 0x00, 0x2f, VIA_UDMA_NONE | VIA_SET_FIFO | VIA_NO_UNMASK }, + { "vt82c576", PCI_DEVICE_ID_VIA_82C576, 0x00, 0x2f, VIA_UDMA_NONE | VIA_SET_FIFO | VIA_NO_UNMASK | VIA_BAD_ID }, { NULL } }; static unsigned int via_clock; -static char *via_dma[] = { "16", "25", "33", "44", "66", "100", "133" }; +static char *via_dma[] = { "MWDMA16", "UDMA33", "UDMA66", "UDMA100", "UDMA133" }; struct via82cxxx_dev { @@ -136,12 +140,12 @@ static void via_set_speed(ide_hwif_t *hwif, u8 dn, struct ide_timing *timing) pci_write_config_byte(dev, VIA_DRIVE_TIMING + (3 - dn), ((FIT(timing->active, 1, 16) - 1) << 4) | (FIT(timing->recover, 1, 16) - 1)); - switch (vdev->via_config->udma_mask) { - case ATA_UDMA2: t = timing->udma ? (0xe0 | (FIT(timing->udma, 2, 5) - 2)) : 0x03; break; - case ATA_UDMA4: t = timing->udma ? (0xe8 | (FIT(timing->udma, 2, 9) - 2)) : 0x0f; break; - case ATA_UDMA5: t = timing->udma ? (0xe0 | (FIT(timing->udma, 2, 9) - 2)) : 0x07; break; - case ATA_UDMA6: t = timing->udma ? (0xe0 | (FIT(timing->udma, 2, 9) - 2)) : 0x07; break; - default: return; + switch (vdev->via_config->flags & VIA_UDMA) { + case VIA_UDMA_33: t = timing->udma ? (0xe0 | (FIT(timing->udma, 2, 5) - 2)) : 0x03; break; + case VIA_UDMA_66: t = timing->udma ? (0xe8 | (FIT(timing->udma, 2, 9) - 2)) : 0x0f; break; + case VIA_UDMA_100: t = timing->udma ? (0xe0 | (FIT(timing->udma, 2, 9) - 2)) : 0x07; break; + case VIA_UDMA_133: t = timing->udma ? 
(0xe0 | (FIT(timing->udma, 2, 9) - 2)) : 0x07; break; + default: return; } pci_write_config_byte(dev, VIA_UDMA_TIMING + (3 - dn), t); @@ -169,12 +173,12 @@ static int via_set_drive(ide_drive_t *drive, u8 speed) T = 1000000000 / via_clock; - switch (vdev->via_config->udma_mask) { - case ATA_UDMA2: UT = T; break; - case ATA_UDMA4: UT = T/2; break; - case ATA_UDMA5: UT = T/3; break; - case ATA_UDMA6: UT = T/4; break; - default: UT = T; + switch (vdev->via_config->flags & VIA_UDMA) { + case VIA_UDMA_33: UT = T; break; + case VIA_UDMA_66: UT = T/2; break; + case VIA_UDMA_100: UT = T/3; break; + case VIA_UDMA_133: UT = T/4; break; + default: UT = T; } ide_timing_compute(drive, speed, &t, T, UT); @@ -204,7 +208,8 @@ static int via_set_drive(ide_drive_t *drive, u8 speed) static void via82cxxx_tune_drive(ide_drive_t *drive, u8 pio) { if (pio == 255) { - via_set_drive(drive, ide_find_best_pio_mode(drive)); + via_set_drive(drive, + ide_find_best_mode(drive, XFER_PIO | XFER_EPIO)); return; } @@ -221,10 +226,16 @@ static void via82cxxx_tune_drive(ide_drive_t *drive, u8 pio) static int via82cxxx_ide_dma_check (ide_drive_t *drive) { - u8 speed = ide_max_dma_mode(drive); + ide_hwif_t *hwif = HWIF(drive); + struct via82cxxx_dev *vdev = pci_get_drvdata(hwif->pci_dev); + u16 w80 = hwif->udma_four; - if (speed == 0) - speed = ide_find_best_pio_mode(drive); + u16 speed = ide_find_best_mode(drive, + XFER_PIO | XFER_EPIO | XFER_SWDMA | XFER_MWDMA | + (vdev->via_config->flags & VIA_UDMA ? XFER_UDMA : 0) | + (w80 && (vdev->via_config->flags & VIA_UDMA) >= VIA_UDMA_66 ? XFER_UDMA_66 : 0) | + (w80 && (vdev->via_config->flags & VIA_UDMA) >= VIA_UDMA_100 ? XFER_UDMA_100 : 0) | + (w80 && (vdev->via_config->flags & VIA_UDMA) >= VIA_UDMA_133 ? XFER_UDMA_133 : 0)); via_set_drive(drive, speed); @@ -261,8 +272,8 @@ static void __devinit via_cable_detect(struct via82cxxx_dev *vdev, u32 u) { int i; - switch (vdev->via_config->udma_mask) { - case ATA_UDMA4: + switch (vdev->via_config->flags & VIA_UDMA) { + case VIA_UDMA_66: for (i = 24; i >= 0; i -= 8) if (((u >> (i & 16)) & 8) && ((u >> i) & 0x20) && @@ -275,7 +286,7 @@ static void __devinit via_cable_detect(struct via82cxxx_dev *vdev, u32 u) } break; - case ATA_UDMA5: + case VIA_UDMA_100: for (i = 24; i >= 0; i -= 8) if (((u >> i) & 0x10) || (((u >> i) & 0x20) && @@ -287,7 +298,7 @@ static void __devinit via_cable_detect(struct via82cxxx_dev *vdev, u32 u) } break; - case ATA_UDMA6: + case VIA_UDMA_133: for (i = 24; i >= 0; i -= 8) if (((u >> i) & 0x10) || (((u >> i) & 0x20) && @@ -342,7 +353,7 @@ static unsigned int __devinit init_chipset_via82cxxx(struct pci_dev *dev, const via_cable_detect(vdev, u); - if (via_config->udma_mask == ATA_UDMA4) { + if ((via_config->flags & VIA_UDMA) == VIA_UDMA_66) { /* Enable Clk66 */ pci_write_config_dword(dev, VIA_UDMA_TIMING, u|0x80008); } else if (via_config->flags & VIA_BAD_CLK66) { @@ -405,54 +416,16 @@ static unsigned int __devinit init_chipset_via82cxxx(struct pci_dev *dev, const */ pci_read_config_byte(isa, PCI_REVISION_ID, &t); - printk(KERN_INFO "VP_IDE: VIA %s (rev %02x) IDE %sDMA%s " + printk(KERN_INFO "VP_IDE: VIA %s (rev %02x) IDE %s " "controller on pci%s\n", via_config->name, t, - via_config->udma_mask ? "U" : "MW", - via_dma[via_config->udma_mask ? 
- (fls(via_config->udma_mask) - 1) : 0], + via_dma[via_config->flags & VIA_UDMA], pci_name(dev)); pci_dev_put(isa); return 0; } -/* - * Cable special cases - */ - -static struct dmi_system_id cable_dmi_table[] = { - { - .ident = "Acer Ferrari 3400", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Acer,Inc."), - DMI_MATCH(DMI_BOARD_NAME, "Ferrari 3400"), - }, - }, - { } -}; - -static int via_cable_override(void) -{ - /* Systems by DMI */ - if (dmi_check_system(cable_dmi_table)) - return 1; - return 0; -} - -static u8 __devinit via82cxxx_cable_detect(ide_hwif_t *hwif) -{ - struct via82cxxx_dev *vdev = pci_get_drvdata(hwif->pci_dev); - - if (via_cable_override()) - return ATA_CBL_PATA40_SHORT; - - if ((vdev->via_80w >> hwif->channel) & 1) - return ATA_CBL_PATA80; - else - return ATA_CBL_PATA40; -} - static void __devinit init_hwif_via82cxxx(ide_hwif_t *hwif) { struct via82cxxx_dev *vdev = pci_get_drvdata(hwif->pci_dev); @@ -481,14 +454,12 @@ static void __devinit init_hwif_via82cxxx(ide_hwif_t *hwif) return; hwif->atapi_dma = 1; - - hwif->ultra_mask = vdev->via_config->udma_mask; + hwif->ultra_mask = 0x7f; hwif->mwdma_mask = 0x07; hwif->swdma_mask = 0x07; - if (hwif->cbl != ATA_CBL_PATA40_SHORT) - hwif->cbl = via82cxxx_cable_detect(hwif); - + if (!hwif->udma_four) + hwif->udma_four = (vdev->via_80w >> hwif->channel) & 1; hwif->ide_dma_check = &via82cxxx_ide_dma_check; if (!noautodma) hwif->autodma = 1; diff --git a/trunk/drivers/ide/ppc/pmac.c b/trunk/drivers/ide/ppc/pmac.c index e46f47206542..45fc36f0f219 100644 --- a/trunk/drivers/ide/ppc/pmac.c +++ b/trunk/drivers/ide/ppc/pmac.c @@ -942,8 +942,8 @@ pmac_ide_tune_chipset (ide_drive_t *drive, byte speed) return 1; case XFER_UDMA_4: case XFER_UDMA_3: - if (drive->hwif->cbl != ATA_CBL_PATA80) - return 1; + if (HWIF(drive)->udma_four == 0) + return 1; case XFER_UDMA_2: case XFER_UDMA_1: case XFER_UDMA_0: @@ -1244,7 +1244,7 @@ pmac_ide_setup_device(pmac_ide_hwif_t *pmif, ide_hwif_t *hwif) hwif->chipset = ide_pmac; hwif->noprobe = !hwif->io_ports[IDE_DATA_OFFSET] || pmif->mediabay; hwif->hold = pmif->mediabay; - hwif->cbl = pmif->cable_80 ? 
ATA_CBL_PATA80 : ATA_CBL_PATA40; + hwif->udma_four = pmif->cable_80; hwif->drives[0].unmask = 1; hwif->drives[1].unmask = 1; hwif->tuneproc = pmac_ide_tuneproc; @@ -1821,11 +1821,28 @@ pmac_ide_dma_check(ide_drive_t *drive) enable = 0; if (enable) { - u8 mode = ide_max_dma_mode(drive); - - if (mode >= XFER_UDMA_0) + short mode; + + map = XFER_MWDMA; + if (pmif->kind == controller_kl_ata4 + || pmif->kind == controller_un_ata6 + || pmif->kind == controller_k2_ata6 + || pmif->kind == controller_sh_ata6) { + map |= XFER_UDMA; + if (pmif->cable_80) { + map |= XFER_UDMA_66; + if (pmif->kind == controller_un_ata6 || + pmif->kind == controller_k2_ata6 || + pmif->kind == controller_sh_ata6) + map |= XFER_UDMA_100; + if (pmif->kind == controller_sh_ata6) + map |= XFER_UDMA_133; + } + } + mode = ide_find_best_mode(drive, map); + if (mode & XFER_UDMA) drive->using_dma = pmac_ide_udma_enable(drive, mode); - else if (mode >= XFER_MW_DMA_0) + else if (mode & XFER_MWDMA) drive->using_dma = pmac_ide_mdma_enable(drive, mode); hwif->OUTB(0, IDE_CONTROL_REG); /* Apply settings to controller */ @@ -1987,19 +2004,20 @@ static void pmac_ide_dma_host_on(ide_drive_t *drive) { } -static void -pmac_ide_dma_lost_irq (ide_drive_t *drive) +static int +pmac_ide_dma_lostirq (ide_drive_t *drive) { pmac_ide_hwif_t* pmif = (pmac_ide_hwif_t *)HWIF(drive)->hwif_data; volatile struct dbdma_regs __iomem *dma; unsigned long status; if (pmif == NULL) - return; + return 0; dma = pmif->dma_regs; status = readl(&dma->status); printk(KERN_ERR "ide-pmac lost interrupt, dma status: %lx\n", status); + return 0; } /* @@ -2039,8 +2057,8 @@ pmac_ide_setup_dma(pmac_ide_hwif_t *pmif, ide_hwif_t *hwif) hwif->ide_dma_test_irq = &pmac_ide_dma_test_irq; hwif->dma_host_off = &pmac_ide_dma_host_off; hwif->dma_host_on = &pmac_ide_dma_host_on; - hwif->dma_timeout = &ide_dma_timeout; - hwif->dma_lost_irq = &pmac_ide_dma_lost_irq; + hwif->ide_dma_timeout = &__ide_dma_timeout; + hwif->ide_dma_lostirq = &pmac_ide_dma_lostirq; hwif->atapi_dma = 1; switch(pmif->kind) { diff --git a/trunk/drivers/misc/Kconfig b/trunk/drivers/misc/Kconfig index 616eee9c04f1..bd601efa7bd1 100644 --- a/trunk/drivers/misc/Kconfig +++ b/trunk/drivers/misc/Kconfig @@ -34,6 +34,11 @@ config PHANTOM If you choose to build module, its name will be phantom. If unsure, say N here. +config EEPROM_93CX6 + tristate "EEPROM 93CX6 support" + ---help--- + This is a driver for the EEPROM chipsets 93c46 and 93c66. + The driver supports both read as well as write commands. If unsure, say N. @@ -187,5 +192,4 @@ config THINKPAD_ACPI_BAY If you are not sure, say Y here. - endmenu diff --git a/trunk/drivers/misc/Makefile b/trunk/drivers/misc/Makefile index 8abbf2f07a65..b5ce0e3dba86 100644 --- a/trunk/drivers/misc/Makefile +++ b/trunk/drivers/misc/Makefile @@ -14,3 +14,4 @@ obj-$(CONFIG_PHANTOM) += phantom.o obj-$(CONFIG_SGI_IOC4) += ioc4.o obj-$(CONFIG_SONY_LAPTOP) += sony-laptop.o obj-$(CONFIG_THINKPAD_ACPI) += thinkpad_acpi.o +obj-$(CONFIG_EEPROM_93CX6) += eeprom_93cx6.o diff --git a/trunk/drivers/misc/eeprom_93cx6.c b/trunk/drivers/misc/eeprom_93cx6.c new file mode 100644 index 000000000000..bfcb43424dcd --- /dev/null +++ b/trunk/drivers/misc/eeprom_93cx6.c @@ -0,0 +1,229 @@ +/* + Copyright (C) 2004 - 2006 rt2x00 SourceForge Project + + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the + Free Software Foundation, Inc., + 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +/* + Module: eeprom_93cx6 + Abstract: EEPROM reader routines for 93cx6 chipsets. + Supported chipsets: 93c46 & 93c66. + */ + +#include +#include +#include +#include +#include + +MODULE_AUTHOR("http://rt2x00.serialmonkey.com"); +MODULE_VERSION("1.0"); +MODULE_DESCRIPTION("EEPROM 93cx6 chip driver"); +MODULE_LICENSE("GPL"); + +static inline void eeprom_93cx6_pulse_high(struct eeprom_93cx6 *eeprom) +{ + eeprom->reg_data_clock = 1; + eeprom->register_write(eeprom); + udelay(1); +} + +static inline void eeprom_93cx6_pulse_low(struct eeprom_93cx6 *eeprom) +{ + eeprom->reg_data_clock = 0; + eeprom->register_write(eeprom); + udelay(1); +} + +static void eeprom_93cx6_startup(struct eeprom_93cx6 *eeprom) +{ + /* + * Clear all flags, and enable chip select. + */ + eeprom->register_read(eeprom); + eeprom->reg_data_in = 0; + eeprom->reg_data_out = 0; + eeprom->reg_data_clock = 0; + eeprom->reg_chip_select = 1; + eeprom->register_write(eeprom); + + /* + * kick a pulse. + */ + eeprom_93cx6_pulse_high(eeprom); + eeprom_93cx6_pulse_low(eeprom); +} + +static void eeprom_93cx6_cleanup(struct eeprom_93cx6 *eeprom) +{ + /* + * Clear chip_select and data_in flags. + */ + eeprom->register_read(eeprom); + eeprom->reg_data_in = 0; + eeprom->reg_chip_select = 0; + eeprom->register_write(eeprom); + + /* + * kick a pulse. + */ + eeprom_93cx6_pulse_high(eeprom); + eeprom_93cx6_pulse_low(eeprom); +} + +static void eeprom_93cx6_write_bits(struct eeprom_93cx6 *eeprom, + const u16 data, const u16 count) +{ + unsigned int i; + + eeprom->register_read(eeprom); + + /* + * Clear data flags. + */ + eeprom->reg_data_in = 0; + eeprom->reg_data_out = 0; + + /* + * Start writing all bits. + */ + for (i = count; i > 0; i--) { + /* + * Check if this bit needs to be set. + */ + eeprom->reg_data_in = !!(data & (1 << (i - 1))); + + /* + * Write the bit to the eeprom register. + */ + eeprom->register_write(eeprom); + + /* + * Kick a pulse. + */ + eeprom_93cx6_pulse_high(eeprom); + eeprom_93cx6_pulse_low(eeprom); + } + + eeprom->reg_data_in = 0; + eeprom->register_write(eeprom); +} + +static void eeprom_93cx6_read_bits(struct eeprom_93cx6 *eeprom, + u16 *data, const u16 count) +{ + unsigned int i; + u16 buf = 0; + + eeprom->register_read(eeprom); + + /* + * Clear data flags. + */ + eeprom->reg_data_in = 0; + eeprom->reg_data_out = 0; + + /* + * Start reading all bits. + */ + for (i = count; i > 0; i--) { + eeprom_93cx6_pulse_high(eeprom); + + eeprom->register_read(eeprom); + + /* + * Clear data_in flag. + */ + eeprom->reg_data_in = 0; + + /* + * Read if the bit has been set. + */ + if (eeprom->reg_data_out) + buf |= (1 << (i - 1)); + + eeprom_93cx6_pulse_low(eeprom); + } + + *data = buf; +} + +/** + * eeprom_93cx6_read - Read multiple words from eeprom + * @eeprom: Pointer to eeprom structure + * @word: Word index from where we should start reading + * @data: target pointer where the information will have to be stored + * + * This function will read the eeprom data as host-endian word + * into the given data pointer. 
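As a usage illustration for the 93cx6 read helpers added in this file: the core never touches hardware itself, it only toggles the reg_* flag fields and calls back into its owner through register_read()/register_write(), so a consumer just maps those flags onto its own EEPROM control register and then calls eeprom_93cx6_read()/eeprom_93cx6_multiread(). The sketch below is hypothetical: the foo_dev device, its csr register and the FOO_EEPROM_* bit masks are made up for illustration, the data pointer used to carry the owner is assumed to be provided by the accompanying eeprom_93cx6.h header, and a width of 6 address bits assumes a 93c46 part (8 would be used for a 93c66).

#include <linux/types.h>
#include <linux/io.h>
#include <linux/eeprom_93cx6.h>

/* Hypothetical consumer device; only its EEPROM control register matters. */
struct foo_dev {
	void __iomem *csr;		/* assumed EEPROM control/status register */
};

/* Assumed bit layout of the hypothetical control register. */
#define FOO_EEPROM_DATA_IN	0x01
#define FOO_EEPROM_DATA_OUT	0x02
#define FOO_EEPROM_DATA_CLOCK	0x04
#define FOO_EEPROM_CHIP_SELECT	0x08

static void foo_eeprom_register_read(struct eeprom_93cx6 *eeprom)
{
	struct foo_dev *dev = eeprom->data;	/* owner pointer (assumed field) */
	u32 reg = readl(dev->csr);

	/* Mirror the hardware pins into the library's flag fields. */
	eeprom->reg_data_in = !!(reg & FOO_EEPROM_DATA_IN);
	eeprom->reg_data_out = !!(reg & FOO_EEPROM_DATA_OUT);
	eeprom->reg_data_clock = !!(reg & FOO_EEPROM_DATA_CLOCK);
	eeprom->reg_chip_select = !!(reg & FOO_EEPROM_CHIP_SELECT);
}

static void foo_eeprom_register_write(struct eeprom_93cx6 *eeprom)
{
	struct foo_dev *dev = eeprom->data;
	u32 reg = 0;

	/* Drive the pins from the flag fields set by the library. */
	if (eeprom->reg_data_in)
		reg |= FOO_EEPROM_DATA_IN;
	if (eeprom->reg_data_out)
		reg |= FOO_EEPROM_DATA_OUT;
	if (eeprom->reg_data_clock)
		reg |= FOO_EEPROM_DATA_CLOCK;
	if (eeprom->reg_chip_select)
		reg |= FOO_EEPROM_CHIP_SELECT;

	writel(reg, dev->csr);
}

static void foo_read_eeprom(struct foo_dev *dev, __le16 *buf, u16 words)
{
	struct eeprom_93cx6 eeprom;

	eeprom.data = dev;
	eeprom.register_read = foo_eeprom_register_read;
	eeprom.register_write = foo_eeprom_register_write;
	eeprom.width = 6;	/* address bits: 6 for a 93c46, 8 for a 93c66 */

	/* Words are returned little-endian, as documented for multiread. */
	eeprom_93cx6_multiread(&eeprom, 0, buf, words);
}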
+ */ +void eeprom_93cx6_read(struct eeprom_93cx6 *eeprom, const u8 word, + u16 *data) +{ + u16 command; + + /* + * Initialize the eeprom register + */ + eeprom_93cx6_startup(eeprom); + + /* + * Select the read opcode and the word to be read. + */ + command = (PCI_EEPROM_READ_OPCODE << eeprom->width) | word; + eeprom_93cx6_write_bits(eeprom, command, + PCI_EEPROM_WIDTH_OPCODE + eeprom->width); + + /* + * Read the requested 16 bits. + */ + eeprom_93cx6_read_bits(eeprom, data, 16); + + /* + * Cleanup eeprom register. + */ + eeprom_93cx6_cleanup(eeprom); +} +EXPORT_SYMBOL_GPL(eeprom_93cx6_read); + +/** + * eeprom_93cx6_multiread - Read multiple words from eeprom + * @eeprom: Pointer to eeprom structure + * @word: Word index from where we should start reading + * @data: target pointer where the information will have to be stored + * @words: Number of words that should be read. + * + * This function will read all requested words from the eeprom, + * this is done by calling eeprom_93cx6_read() multiple times. + * But with the additional change that while the eeprom_93cx6_read + * will return host ordered bytes, this method will return little + * endian words. + */ +void eeprom_93cx6_multiread(struct eeprom_93cx6 *eeprom, const u8 word, + __le16 *data, const u16 words) +{ + unsigned int i; + u16 tmp; + + for (i = 0; i < words; i++) { + tmp = 0; + eeprom_93cx6_read(eeprom, word + i, &tmp); + data[i] = cpu_to_le16(tmp); + } +} +EXPORT_SYMBOL_GPL(eeprom_93cx6_multiread); + diff --git a/trunk/fs/jfs/endian24.h b/trunk/fs/jfs/endian24.h index fa92f7f1d0d0..79494c4f2b10 100644 --- a/trunk/fs/jfs/endian24.h +++ b/trunk/fs/jfs/endian24.h @@ -29,7 +29,7 @@ __u32 __x = (x); \ ((__u32)( \ ((__x & (__u32)0x000000ffUL) << 16) | \ - (__x & (__u32)0x0000ff00UL) | \ + (__x & (__u32)0x0000ff00UL) | \ ((__x & (__u32)0x00ff0000UL) >> 16) )); \ }) diff --git a/trunk/fs/jfs/jfs_debug.c b/trunk/fs/jfs/jfs_debug.c index 887f5759e536..9c5d59632aac 100644 --- a/trunk/fs/jfs/jfs_debug.c +++ b/trunk/fs/jfs/jfs_debug.c @@ -26,6 +26,34 @@ #include "jfs_filsys.h" #include "jfs_debug.h" +#ifdef CONFIG_JFS_DEBUG +void dump_mem(char *label, void *data, int length) +{ + int i, j; + int *intptr = data; + char *charptr = data; + char buf[10], line[80]; + + printk("%s: dump of %d bytes of data at 0x%p\n\n", label, length, + data); + for (i = 0; i < length; i += 16) { + line[0] = 0; + for (j = 0; (j < 4) && (i + j * 4 < length); j++) { + sprintf(buf, " %08x", intptr[i / 4 + j]); + strcat(line, buf); + } + buf[0] = ' '; + buf[2] = 0; + for (j = 0; (j < 16) && (i + j < length); j++) { + buf[1] = + isprint(charptr[i + j]) ? charptr[i + j] : '.'; + strcat(line, buf); + } + printk("%s\n", line); + } +} +#endif + #ifdef PROC_FS_JFS /* see jfs_debug.h */ static struct proc_dir_entry *base; diff --git a/trunk/fs/jfs/jfs_debug.h b/trunk/fs/jfs/jfs_debug.h index 044c1e654cc0..7378798f0b21 100644 --- a/trunk/fs/jfs/jfs_debug.h +++ b/trunk/fs/jfs/jfs_debug.h @@ -62,6 +62,7 @@ extern void jfs_proc_clean(void); extern int jfsloglevel; +extern void dump_mem(char *label, void *data, int length); extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *); /* information message: e.g., configuration, major event */ @@ -93,6 +94,7 @@ extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *); * --------- */ #else /* CONFIG_JFS_DEBUG */ +#define dump_mem(label,data,length) do {} while (0) #define ASSERT(p) do {} while (0) #define jfs_info(fmt, arg...) do {} while (0) #define jfs_debug(fmt, arg...) 
do {} while (0) diff --git a/trunk/fs/jfs/jfs_dinode.h b/trunk/fs/jfs/jfs_dinode.h index c387540d3425..40b20111383c 100644 --- a/trunk/fs/jfs/jfs_dinode.h +++ b/trunk/fs/jfs/jfs_dinode.h @@ -19,23 +19,23 @@ #define _H_JFS_DINODE /* - * jfs_dinode.h: on-disk inode manager + * jfs_dinode.h: on-disk inode manager */ -#define INODESLOTSIZE 128 -#define L2INODESLOTSIZE 7 -#define log2INODESIZE 9 /* log2(bytes per dinode) */ +#define INODESLOTSIZE 128 +#define L2INODESLOTSIZE 7 +#define log2INODESIZE 9 /* log2(bytes per dinode) */ /* - * on-disk inode : 512 bytes + * on-disk inode : 512 bytes * * note: align 64-bit fields on 8-byte boundary. */ struct dinode { /* - * I. base area (128 bytes) - * ------------------------ + * I. base area (128 bytes) + * ------------------------ * * define generic/POSIX attributes */ @@ -70,16 +70,16 @@ struct dinode { __le32 di_acltype; /* 4: Type of ACL */ /* - * Extension Areas. + * Extension Areas. * - * Historically, the inode was partitioned into 4 128-byte areas, - * the last 3 being defined as unions which could have multiple - * uses. The first 96 bytes had been completely unused until - * an index table was added to the directory. It is now more - * useful to describe the last 3/4 of the inode as a single - * union. We would probably be better off redesigning the - * entire structure from scratch, but we don't want to break - * commonality with OS/2's JFS at this time. + * Historically, the inode was partitioned into 4 128-byte areas, + * the last 3 being defined as unions which could have multiple + * uses. The first 96 bytes had been completely unused until + * an index table was added to the directory. It is now more + * useful to describe the last 3/4 of the inode as a single + * union. We would probably be better off redesigning the + * entire structure from scratch, but we don't want to break + * commonality with OS/2's JFS at this time. */ union { struct { @@ -95,7 +95,7 @@ struct dinode { } _dir; /* (384) */ #define di_dirtable u._dir._table #define di_dtroot u._dir._dtroot -#define di_parent di_dtroot.header.idotdot +#define di_parent di_dtroot.header.idotdot #define di_DASD di_dtroot.header.DASD struct { @@ -127,14 +127,14 @@ struct dinode { #define di_inlinedata u._file._u2._special._u #define di_rdev u._file._u2._special._u._rdev #define di_fastsymlink u._file._u2._special._u._fastsymlink -#define di_inlineea u._file._u2._special._inlineea +#define di_inlineea u._file._u2._special._inlineea } u; }; /* extended mode bits (on-disk inode di_mode) */ -#define IFJOURNAL 0x00010000 /* journalled file */ -#define ISPARSE 0x00020000 /* sparse file enabled */ -#define INLINEEA 0x00040000 /* inline EA area free */ +#define IFJOURNAL 0x00010000 /* journalled file */ +#define ISPARSE 0x00020000 /* sparse file enabled */ +#define INLINEEA 0x00040000 /* inline EA area free */ #define ISWAPFILE 0x00800000 /* file open for pager swap space */ /* more extended mode bits: attributes for OS/2 */ diff --git a/trunk/fs/jfs/jfs_dmap.c b/trunk/fs/jfs/jfs_dmap.c index e1985066b1c6..f3b1ebb22280 100644 --- a/trunk/fs/jfs/jfs_dmap.c +++ b/trunk/fs/jfs/jfs_dmap.c @@ -154,12 +154,12 @@ static const s8 budtab[256] = { * the in-core descriptor is initialized from disk. * * PARAMETERS: - * ipbmap - pointer to in-core inode for the block map. + * ipbmap - pointer to in-core inode for the block map. 
* * RETURN VALUES: - * 0 - success - * -ENOMEM - insufficient memory - * -EIO - i/o error + * 0 - success + * -ENOMEM - insufficient memory + * -EIO - i/o error */ int dbMount(struct inode *ipbmap) { @@ -232,11 +232,11 @@ int dbMount(struct inode *ipbmap) * the memory for this descriptor is freed. * * PARAMETERS: - * ipbmap - pointer to in-core inode for the block map. + * ipbmap - pointer to in-core inode for the block map. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * 0 - success + * -EIO - i/o error */ int dbUnmount(struct inode *ipbmap, int mounterror) { @@ -320,13 +320,13 @@ int dbSync(struct inode *ipbmap) * at a time. * * PARAMETERS: - * ip - pointer to in-core inode; - * blkno - starting block number to be freed. - * nblocks - number of blocks to be freed. + * ip - pointer to in-core inode; + * blkno - starting block number to be freed. + * nblocks - number of blocks to be freed. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * 0 - success + * -EIO - i/o error */ int dbFree(struct inode *ip, s64 blkno, s64 nblocks) { @@ -395,23 +395,23 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks) /* * NAME: dbUpdatePMap() * - * FUNCTION: update the allocation state (free or allocate) of the + * FUNCTION: update the allocation state (free or allocate) of the * specified block range in the persistent block allocation map. * * the blocks will be updated in the persistent map one * dmap at a time. * * PARAMETERS: - * ipbmap - pointer to in-core inode for the block map. - * free - 'true' if block range is to be freed from the persistent - * map; 'false' if it is to be allocated. - * blkno - starting block number of the range. - * nblocks - number of contiguous blocks in the range. - * tblk - transaction block; + * ipbmap - pointer to in-core inode for the block map. + * free - 'true' if block range is to be freed from the persistent + * map; 'false' if it is to be allocated. + * blkno - starting block number of the range. + * nblocks - number of contiguous blocks in the range. + * tblk - transaction block; * * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * 0 - success + * -EIO - i/o error */ int dbUpdatePMap(struct inode *ipbmap, @@ -573,7 +573,7 @@ dbUpdatePMap(struct inode *ipbmap, /* * NAME: dbNextAG() * - * FUNCTION: find the preferred allocation group for new allocations. + * FUNCTION: find the preferred allocation group for new allocations. * * Within the allocation groups, we maintain a preferred * allocation group which consists of a group with at least @@ -589,10 +589,10 @@ dbUpdatePMap(struct inode *ipbmap, * empty ags around for large allocations. * * PARAMETERS: - * ipbmap - pointer to in-core inode for the block map. + * ipbmap - pointer to in-core inode for the block map. * * RETURN VALUES: - * the preferred allocation group number. + * the preferred allocation group number. */ int dbNextAG(struct inode *ipbmap) { @@ -656,7 +656,7 @@ int dbNextAG(struct inode *ipbmap) /* * NAME: dbAlloc() * - * FUNCTION: attempt to allocate a specified number of contiguous free + * FUNCTION: attempt to allocate a specified number of contiguous free * blocks from the working allocation block map. * * the block allocation policy uses hints and a multi-step @@ -680,16 +680,16 @@ int dbNextAG(struct inode *ipbmap) * size or requests that specify no hint value. * * PARAMETERS: - * ip - pointer to in-core inode; - * hint - allocation hint. - * nblocks - number of contiguous blocks in the range. 
- * results - on successful return, set to the starting block number + * ip - pointer to in-core inode; + * hint - allocation hint. + * nblocks - number of contiguous blocks in the range. + * results - on successful return, set to the starting block number * of the newly allocated contiguous range. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error */ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) { @@ -706,6 +706,12 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) /* assert that nblocks is valid */ assert(nblocks > 0); +#ifdef _STILL_TO_PORT + /* DASD limit check F226941 */ + if (OVER_LIMIT(ip, nblocks)) + return -ENOSPC; +#endif /* _STILL_TO_PORT */ + /* get the log2 number of blocks to be allocated. * if the number of blocks is not a log2 multiple, * it will be rounded up to the next log2 multiple. @@ -714,6 +720,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) bmp = JFS_SBI(ip->i_sb)->bmap; +//retry: /* serialize w.r.t.extendfs() */ mapSize = bmp->db_mapsize; /* the hint should be within the map */ @@ -872,17 +879,17 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) /* * NAME: dbAllocExact() * - * FUNCTION: try to allocate the requested extent; + * FUNCTION: try to allocate the requested extent; * * PARAMETERS: - * ip - pointer to in-core inode; - * blkno - extent address; - * nblocks - extent length; + * ip - pointer to in-core inode; + * blkno - extent address; + * nblocks - extent length; * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error */ int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) { @@ -939,7 +946,7 @@ int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) /* * NAME: dbReAlloc() * - * FUNCTION: attempt to extend a current allocation by a specified + * FUNCTION: attempt to extend a current allocation by a specified * number of blocks. * * this routine attempts to satisfy the allocation request @@ -952,21 +959,21 @@ int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) * number of blocks required. * * PARAMETERS: - * ip - pointer to in-core inode requiring allocation. - * blkno - starting block of the current allocation. - * nblocks - number of contiguous blocks within the current + * ip - pointer to in-core inode requiring allocation. + * blkno - starting block of the current allocation. + * nblocks - number of contiguous blocks within the current * allocation. - * addnblocks - number of blocks to add to the allocation. - * results - on successful return, set to the starting block number + * addnblocks - number of blocks to add to the allocation. + * results - on successful return, set to the starting block number * of the existing allocation if the existing allocation * was extended in place or to a newly allocated contiguous * range if the existing allocation could not be extended * in place. 
* * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error */ int dbReAlloc(struct inode *ip, @@ -997,7 +1004,7 @@ dbReAlloc(struct inode *ip, /* * NAME: dbExtend() * - * FUNCTION: attempt to extend a current allocation by a specified + * FUNCTION: attempt to extend a current allocation by a specified * number of blocks. * * this routine attempts to satisfy the allocation request @@ -1006,16 +1013,16 @@ dbReAlloc(struct inode *ip, * immediately following the current allocation. * * PARAMETERS: - * ip - pointer to in-core inode requiring allocation. - * blkno - starting block of the current allocation. - * nblocks - number of contiguous blocks within the current + * ip - pointer to in-core inode requiring allocation. + * blkno - starting block of the current allocation. + * nblocks - number of contiguous blocks within the current * allocation. - * addnblocks - number of blocks to add to the allocation. + * addnblocks - number of blocks to add to the allocation. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error */ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks) { @@ -1102,19 +1109,19 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks) /* * NAME: dbAllocNext() * - * FUNCTION: attempt to allocate the blocks of the specified block + * FUNCTION: attempt to allocate the blocks of the specified block * range within a dmap. * * PARAMETERS: - * bmp - pointer to bmap descriptor - * dp - pointer to dmap. - * blkno - starting block number of the range. - * nblocks - number of contiguous free blocks of the range. + * bmp - pointer to bmap descriptor + * dp - pointer to dmap. + * blkno - starting block number of the range. + * nblocks - number of contiguous free blocks of the range. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error * * serialization: IREAD_LOCK(ipbmap) held on entry/exit; */ @@ -1226,7 +1233,7 @@ static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno, /* * NAME: dbAllocNear() * - * FUNCTION: attempt to allocate a number of contiguous free blocks near + * FUNCTION: attempt to allocate a number of contiguous free blocks near * a specified block (hint) within a dmap. * * starting with the dmap leaf that covers the hint, we'll @@ -1235,18 +1242,18 @@ static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno, * the desired free space. * * PARAMETERS: - * bmp - pointer to bmap descriptor - * dp - pointer to dmap. - * blkno - block number to allocate near. - * nblocks - actual number of contiguous free blocks desired. - * l2nb - log2 number of contiguous free blocks desired. - * results - on successful return, set to the starting block number + * bmp - pointer to bmap descriptor + * dp - pointer to dmap. + * blkno - block number to allocate near. + * nblocks - actual number of contiguous free blocks desired. + * l2nb - log2 number of contiguous free blocks desired. + * results - on successful return, set to the starting block number * of the newly allocated range. 
* * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error * * serialization: IREAD_LOCK(ipbmap) held on entry/exit; */ @@ -1309,7 +1316,7 @@ dbAllocNear(struct bmap * bmp, /* * NAME: dbAllocAG() * - * FUNCTION: attempt to allocate the specified number of contiguous + * FUNCTION: attempt to allocate the specified number of contiguous * free blocks within the specified allocation group. * * unless the allocation group size is equal to the number @@ -1346,17 +1353,17 @@ dbAllocNear(struct bmap * bmp, * the allocation group. * * PARAMETERS: - * bmp - pointer to bmap descriptor + * bmp - pointer to bmap descriptor * agno - allocation group number. - * nblocks - actual number of contiguous free blocks desired. - * l2nb - log2 number of contiguous free blocks desired. - * results - on successful return, set to the starting block number + * nblocks - actual number of contiguous free blocks desired. + * l2nb - log2 number of contiguous free blocks desired. + * results - on successful return, set to the starting block number * of the newly allocated range. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error * * note: IWRITE_LOCK(ipmap) held on entry/exit; */ @@ -1539,7 +1546,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) /* * NAME: dbAllocAny() * - * FUNCTION: attempt to allocate the specified number of contiguous + * FUNCTION: attempt to allocate the specified number of contiguous * free blocks anywhere in the file system. * * dbAllocAny() attempts to find the sufficient free space by @@ -1549,16 +1556,16 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) * desired free space is allocated. * * PARAMETERS: - * bmp - pointer to bmap descriptor - * nblocks - actual number of contiguous free blocks desired. - * l2nb - log2 number of contiguous free blocks desired. - * results - on successful return, set to the starting block number + * bmp - pointer to bmap descriptor + * nblocks - actual number of contiguous free blocks desired. + * l2nb - log2 number of contiguous free blocks desired. + * results - on successful return, set to the starting block number * of the newly allocated range. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error * * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; */ @@ -1591,9 +1598,9 @@ static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results) /* * NAME: dbFindCtl() * - * FUNCTION: starting at a specified dmap control page level and block + * FUNCTION: starting at a specified dmap control page level and block * number, search down the dmap control levels for a range of - * contiguous free blocks large enough to satisfy an allocation + * contiguous free blocks large enough to satisfy an allocation * request for the specified number of free blocks. * * if sufficient contiguous free blocks are found, this routine @@ -1602,17 +1609,17 @@ static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results) * is sufficient in size. * * PARAMETERS: - * bmp - pointer to bmap descriptor - * level - starting dmap control page level. - * l2nb - log2 number of contiguous free blocks desired. 
- * *blkno - on entry, starting block number for conducting the search. + * bmp - pointer to bmap descriptor + * level - starting dmap control page level. + * l2nb - log2 number of contiguous free blocks desired. + * *blkno - on entry, starting block number for conducting the search. * on successful return, the first block within a dmap page * that contains or starts a range of contiguous free blocks. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error * * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; */ @@ -1692,7 +1699,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno) /* * NAME: dbAllocCtl() * - * FUNCTION: attempt to allocate a specified number of contiguous + * FUNCTION: attempt to allocate a specified number of contiguous * blocks starting within a specific dmap. * * this routine is called by higher level routines that search @@ -1719,18 +1726,18 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno) * first dmap (i.e. blkno). * * PARAMETERS: - * bmp - pointer to bmap descriptor - * nblocks - actual number of contiguous free blocks to allocate. - * l2nb - log2 number of contiguous free blocks to allocate. - * blkno - starting block number of the dmap to start the allocation + * bmp - pointer to bmap descriptor + * nblocks - actual number of contiguous free blocks to allocate. + * l2nb - log2 number of contiguous free blocks to allocate. + * blkno - starting block number of the dmap to start the allocation * from. - * results - on successful return, set to the starting block number + * results - on successful return, set to the starting block number * of the newly allocated range. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error * * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; */ @@ -1863,7 +1870,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results) /* * NAME: dbAllocDmapLev() * - * FUNCTION: attempt to allocate a specified number of contiguous blocks + * FUNCTION: attempt to allocate a specified number of contiguous blocks * from a specified dmap. * * this routine checks if the contiguous blocks are available. @@ -1871,17 +1878,17 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results) * returned. * * PARAMETERS: - * mp - pointer to bmap descriptor - * dp - pointer to dmap to attempt to allocate blocks from. - * l2nb - log2 number of contiguous block desired. - * nblocks - actual number of contiguous block desired. - * results - on successful return, set to the starting block number + * mp - pointer to bmap descriptor + * dp - pointer to dmap to attempt to allocate blocks from. + * l2nb - log2 number of contiguous block desired. + * nblocks - actual number of contiguous block desired. + * results - on successful return, set to the starting block number * of the newly allocated range. 
* * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error * * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or * IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit; @@ -1926,7 +1933,7 @@ dbAllocDmapLev(struct bmap * bmp, /* * NAME: dbAllocDmap() * - * FUNCTION: adjust the disk allocation map to reflect the allocation + * FUNCTION: adjust the disk allocation map to reflect the allocation * of a specified block range within a dmap. * * this routine allocates the specified blocks from the dmap @@ -1939,14 +1946,14 @@ dbAllocDmapLev(struct bmap * bmp, * covers this dmap. * * PARAMETERS: - * bmp - pointer to bmap descriptor - * dp - pointer to dmap to allocate the block range from. - * blkno - starting block number of the block to be allocated. - * nblocks - number of blocks to be allocated. + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to allocate the block range from. + * blkno - starting block number of the block to be allocated. + * nblocks - number of blocks to be allocated. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * 0 - success + * -EIO - i/o error * * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; */ @@ -1982,7 +1989,7 @@ static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, /* * NAME: dbFreeDmap() * - * FUNCTION: adjust the disk allocation map to reflect the allocation + * FUNCTION: adjust the disk allocation map to reflect the allocation * of a specified block range within a dmap. * * this routine frees the specified blocks from the dmap through @@ -1990,18 +1997,18 @@ static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, * causes the maximum string of free blocks within the dmap to * change (i.e. the value of the root of the dmap's dmtree), this * routine will cause this change to be reflected up through the - * appropriate levels of the dmap control pages by a call to + * appropriate levels of the dmap control pages by a call to * dbAdjCtl() for the L0 dmap control page that covers this dmap. * * PARAMETERS: - * bmp - pointer to bmap descriptor - * dp - pointer to dmap to free the block range from. - * blkno - starting block number of the block to be freed. - * nblocks - number of blocks to be freed. + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to free the block range from. + * blkno - starting block number of the block to be freed. + * nblocks - number of blocks to be freed. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * 0 - success + * -EIO - i/o error * * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; */ @@ -2048,7 +2055,7 @@ static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, /* * NAME: dbAllocBits() * - * FUNCTION: allocate a specified block range from a dmap. + * FUNCTION: allocate a specified block range from a dmap. * * this routine updates the dmap to reflect the working * state allocation of the specified block range. it directly @@ -2058,10 +2065,10 @@ static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, * dmap's dmtree, as a whole, to reflect the allocated range. * * PARAMETERS: - * bmp - pointer to bmap descriptor - * dp - pointer to dmap to allocate bits from. - * blkno - starting block number of the bits to be allocated. - * nblocks - number of bits to be allocated. 
+ * bmp - pointer to bmap descriptor + * dp - pointer to dmap to allocate bits from. + * blkno - starting block number of the bits to be allocated. + * nblocks - number of bits to be allocated. * * RETURN VALUES: none * @@ -2142,7 +2149,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, * the allocated words. */ for (; nwords > 0; nwords -= nw) { - if (leaf[word] < BUDMIN) { + if (leaf[word] < BUDMIN) { jfs_error(bmp->db_ipbmap->i_sb, "dbAllocBits: leaf page " "corrupt"); @@ -2195,7 +2202,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, /* * NAME: dbFreeBits() * - * FUNCTION: free a specified block range from a dmap. + * FUNCTION: free a specified block range from a dmap. * * this routine updates the dmap to reflect the working * state allocation of the specified block range. it directly @@ -2205,10 +2212,10 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, * dmtree, as a whole, to reflect the deallocated range. * * PARAMETERS: - * bmp - pointer to bmap descriptor - * dp - pointer to dmap to free bits from. - * blkno - starting block number of the bits to be freed. - * nblocks - number of bits to be freed. + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to free bits from. + * blkno - starting block number of the bits to be freed. + * nblocks - number of bits to be freed. * * RETURN VALUES: 0 for success * @@ -2381,19 +2388,19 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, * the new root value and the next dmap control page level to * be adjusted. * PARAMETERS: - * bmp - pointer to bmap descriptor - * blkno - the first block of a block range within a dmap. it is + * bmp - pointer to bmap descriptor + * blkno - the first block of a block range within a dmap. it is * the allocation or deallocation of this block range that * requires the dmap control page to be adjusted. - * newval - the new value of the lower level dmap or dmap control + * newval - the new value of the lower level dmap or dmap control * page root. - * alloc - 'true' if adjustment is due to an allocation. - * level - current level of dmap control page (i.e. L0, L1, L2) to + * alloc - 'true' if adjustment is due to an allocation. + * level - current level of dmap control page (i.e. L0, L1, L2) to * be adjusted. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * 0 - success + * -EIO - i/o error * * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; */ @@ -2537,16 +2544,16 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level) /* * NAME: dbSplit() * - * FUNCTION: update the leaf of a dmtree with a new value, splitting + * FUNCTION: update the leaf of a dmtree with a new value, splitting * the leaf from the binary buddy system of the dmtree's * leaves, as required. * * PARAMETERS: - * tp - pointer to the tree containing the leaf. - * leafno - the number of the leaf to be updated. - * splitsz - the size the binary buddy system starting at the leaf + * tp - pointer to the tree containing the leaf. + * leafno - the number of the leaf to be updated. + * splitsz - the size the binary buddy system starting at the leaf * must be split to, specified as the log2 number of blocks. - * newval - the new value for the leaf. + * newval - the new value for the leaf. 
* * RETURN VALUES: none * @@ -2593,7 +2600,7 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval) /* * NAME: dbBackSplit() * - * FUNCTION: back split the binary buddy system of dmtree leaves + * FUNCTION: back split the binary buddy system of dmtree leaves * that hold a specified leaf until the specified leaf * starts its own binary buddy system. * @@ -2610,8 +2617,8 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval) * in which a previous join operation must be backed out. * * PARAMETERS: - * tp - pointer to the tree containing the leaf. - * leafno - the number of the leaf to be updated. + * tp - pointer to the tree containing the leaf. + * leafno - the number of the leaf to be updated. * * RETURN VALUES: none * @@ -2685,14 +2692,14 @@ static int dbBackSplit(dmtree_t * tp, int leafno) /* * NAME: dbJoin() * - * FUNCTION: update the leaf of a dmtree with a new value, joining + * FUNCTION: update the leaf of a dmtree with a new value, joining * the leaf with other leaves of the dmtree into a multi-leaf * binary buddy system, as required. * * PARAMETERS: - * tp - pointer to the tree containing the leaf. - * leafno - the number of the leaf to be updated. - * newval - the new value for the leaf. + * tp - pointer to the tree containing the leaf. + * leafno - the number of the leaf to be updated. + * newval - the new value for the leaf. * * RETURN VALUES: none */ @@ -2778,15 +2785,15 @@ static int dbJoin(dmtree_t * tp, int leafno, int newval) /* * NAME: dbAdjTree() * - * FUNCTION: update a leaf of a dmtree with a new value, adjusting + * FUNCTION: update a leaf of a dmtree with a new value, adjusting * the dmtree, as required, to reflect the new leaf value. * the combination of any buddies must already be done before * this is called. * * PARAMETERS: - * tp - pointer to the tree to be adjusted. - * leafno - the number of the leaf to be updated. - * newval - the new value for the leaf. + * tp - pointer to the tree to be adjusted. + * leafno - the number of the leaf to be updated. + * newval - the new value for the leaf. * * RETURN VALUES: none */ @@ -2845,7 +2852,7 @@ static void dbAdjTree(dmtree_t * tp, int leafno, int newval) /* * NAME: dbFindLeaf() * - * FUNCTION: search a dmtree_t for sufficient free blocks, returning + * FUNCTION: search a dmtree_t for sufficient free blocks, returning * the index of a leaf describing the free blocks if * sufficient free blocks are found. * @@ -2854,15 +2861,15 @@ static void dbAdjTree(dmtree_t * tp, int leafno, int newval) * free space. * * PARAMETERS: - * tp - pointer to the tree to be searched. - * l2nb - log2 number of free blocks to search for. + * tp - pointer to the tree to be searched. + * l2nb - log2 number of free blocks to search for. * leafidx - return pointer to be set to the index of the leaf * describing at least l2nb free blocks if sufficient * free blocks are found. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient free blocks. + * 0 - success + * -ENOSPC - insufficient free blocks. */ static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx) { @@ -2909,18 +2916,18 @@ static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx) /* * NAME: dbFindBits() * - * FUNCTION: find a specified number of binary buddy free bits within a + * FUNCTION: find a specified number of binary buddy free bits within a * dmap bitmap word value. * * this routine searches the bitmap value for (1 << l2nb) free * bits at (1 << l2nb) alignments within the value. 
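To make the dbFindBits() contract concrete: for a request of (1 << l2nb) blocks the search only ever considers naturally aligned runs of that length inside one 32-bit map word, stepping from one aligned candidate position to the next rather than sliding bit by bit. The stand-alone sketch below is illustrative only and, unlike the on-disk dmap words, treats bit 0 as the least-significant bit and a set bit as a free block.

#include <stdint.h>

/*
 * Illustration of the aligned-run search: look for (1 << l2nb) free bits
 * starting at a (1 << l2nb)-aligned bit position within one 32-bit word.
 * Here a set bit means "free" and bit 0 is the least-significant bit;
 * the real dmap words use the opposite conventions, but the walk over
 * aligned candidate positions is the same idea.
 */
static int find_aligned_free_run(uint32_t free_bits, int l2nb)
{
	int nb = 1 << l2nb;				/* run length wanted, 1..32 */
	uint32_t run = (nb >= 32) ? ~0u : ((1u << nb) - 1u);
	int bitno;

	for (bitno = 0; bitno + nb <= 32; bitno += nb)
		if ((free_bits & (run << bitno)) == (run << bitno))
			return bitno;			/* whole aligned run is free */

	return -1;					/* no aligned run of that size here */
}

/* e.g. find_aligned_free_run(0x00ff0000, 3) returns 16: bits 16..23 are free. */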
* * PARAMETERS: - * word - dmap bitmap word value. - * l2nb - number of free bits specified as a log2 number. + * word - dmap bitmap word value. + * l2nb - number of free bits specified as a log2 number. * * RETURN VALUES: - * starting bit number of free bits. + * starting bit number of free bits. */ static int dbFindBits(u32 word, int l2nb) { @@ -2956,14 +2963,14 @@ static int dbFindBits(u32 word, int l2nb) /* * NAME: dbMaxBud(u8 *cp) * - * FUNCTION: determine the largest binary buddy string of free + * FUNCTION: determine the largest binary buddy string of free * bits within 32-bits of the map. * * PARAMETERS: - * cp - pointer to the 32-bit value. + * cp - pointer to the 32-bit value. * * RETURN VALUES: - * largest binary buddy of free bits within a dmap word. + * largest binary buddy of free bits within a dmap word. */ static int dbMaxBud(u8 * cp) { @@ -2993,14 +3000,14 @@ static int dbMaxBud(u8 * cp) /* * NAME: cnttz(uint word) * - * FUNCTION: determine the number of trailing zeros within a 32-bit + * FUNCTION: determine the number of trailing zeros within a 32-bit * value. * * PARAMETERS: - * value - 32-bit value to be examined. + * value - 32-bit value to be examined. * * RETURN VALUES: - * count of trailing zeros + * count of trailing zeros */ static int cnttz(u32 word) { @@ -3018,14 +3025,14 @@ static int cnttz(u32 word) /* * NAME: cntlz(u32 value) * - * FUNCTION: determine the number of leading zeros within a 32-bit + * FUNCTION: determine the number of leading zeros within a 32-bit * value. * * PARAMETERS: - * value - 32-bit value to be examined. + * value - 32-bit value to be examined. * * RETURN VALUES: - * count of leading zeros + * count of leading zeros */ static int cntlz(u32 value) { @@ -3043,14 +3050,14 @@ static int cntlz(u32 value) * NAME: blkstol2(s64 nb) * * FUNCTION: convert a block count to its log2 value. if the block - * count is not a l2 multiple, it is rounded up to the next + * count is not a l2 multiple, it is rounded up to the next * larger l2 multiple. * * PARAMETERS: - * nb - number of blocks + * nb - number of blocks * * RETURN VALUES: - * log2 number of blocks + * log2 number of blocks */ static int blkstol2(s64 nb) { @@ -3092,13 +3099,13 @@ static int blkstol2(s64 nb) * at a time. * * PARAMETERS: - * ip - pointer to in-core inode; - * blkno - starting block number to be freed. - * nblocks - number of blocks to be freed. + * ip - pointer to in-core inode; + * blkno - starting block number to be freed. + * nblocks - number of blocks to be freed. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * 0 - success + * -EIO - i/o error */ int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks) { @@ -3271,10 +3278,10 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, * L2 * | * L1---------------------------------L1 - * | | - * L0---------L0---------L0 L0---------L0---------L0 - * | | | | | | - * d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,.,dm; + * | | + * L0---------L0---------L0 L0---------L0---------L0 + * | | | | | | + * d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,.,dm; * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm * * <---old---><----------------------------extend-----------------------> @@ -3300,7 +3307,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) (long long) blkno, (long long) nblocks, (long long) newsize); /* - * initialize bmap control page. + * initialize bmap control page. 
* * all the data in bmap control page should exclude * the mkfs hidden dmap page. @@ -3323,7 +3330,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0; /* - * reconfigure db_agfree[] + * reconfigure db_agfree[] * from old AG configuration to new AG configuration; * * coalesce contiguous k (newAGSize/oldAGSize) AGs; @@ -3355,7 +3362,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) bmp->db_maxag = bmp->db_maxag / k; /* - * extend bmap + * extend bmap * * update bit maps and corresponding level control pages; * global control page db_nfree, db_agfree[agno], db_maxfreebud; @@ -3403,7 +3410,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) /* compute start L0 */ j = 0; l1leaf = l1dcp->stree + CTLLEAFIND; - p += nbperpage; /* 1st L0 of L1.k */ + p += nbperpage; /* 1st L0 of L1.k */ } /* @@ -3541,7 +3548,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) return -EIO; /* - * finalize bmap control page + * finalize bmap control page */ finalize: @@ -3560,7 +3567,7 @@ void dbFinalizeBmap(struct inode *ipbmap) int i, n; /* - * finalize bmap control page + * finalize bmap control page */ //finalize: /* @@ -3946,8 +3953,8 @@ static int dbGetL2AGSize(s64 nblocks) * convert number of map pages to the zero origin top dmapctl level */ #define BMAPPGTOLEV(npages) \ - (((npages) <= 3 + MAXL0PAGES) ? 0 : \ - ((npages) <= 2 + MAXL1PAGES) ? 1 : 2) + (((npages) <= 3 + MAXL0PAGES) ? 0 \ + : ((npages) <= 2 + MAXL1PAGES) ? 1 : 2) s64 dbMapFileSizeToMapSize(struct inode * ipbmap) { @@ -3974,8 +3981,8 @@ s64 dbMapFileSizeToMapSize(struct inode * ipbmap) factor = (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1); complete = (u32) npages / factor; - ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL : - ((i == 1) ? LPERCTL : 1)); + ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL + : ((i == 1) ? LPERCTL : 1)); /* pages in last/incomplete child */ npages = (u32) npages % factor; diff --git a/trunk/fs/jfs/jfs_dmap.h b/trunk/fs/jfs/jfs_dmap.h index 11e6d471b364..45ea454c74bd 100644 --- a/trunk/fs/jfs/jfs_dmap.h +++ b/trunk/fs/jfs/jfs_dmap.h @@ -83,7 +83,7 @@ static __inline signed char TREEMAX(signed char *cp) * - 1 is added to account for the control page of the map. */ #define BLKTODMAP(b,s) \ - ((((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) << (s)) + ((((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) << (s)) /* * convert disk block number to the logical block number of the LEVEL 0 @@ -98,7 +98,7 @@ static __inline signed char TREEMAX(signed char *cp) * - 1 is added to account for the control page of the map. */ #define BLKTOL0(b,s) \ - (((((b) >> 23) << 10) + ((b) >> 23) + ((b) >> 33) + 2 + 1) << (s)) + (((((b) >> 23) << 10) + ((b) >> 23) + ((b) >> 33) + 2 + 1) << (s)) /* * convert disk block number to the logical block number of the LEVEL 1 @@ -120,7 +120,7 @@ static __inline signed char TREEMAX(signed char *cp) * at the specified level which describes the disk block. */ #define BLKTOCTL(b,s,l) \ - (((l) == 2) ? 1 : ((l) == 1) ? BLKTOL1((b),(s)) : BLKTOL0((b),(s))) + (((l) == 2) ? 1 : ((l) == 1) ? BLKTOL1((b),(s)) : BLKTOL0((b),(s))) /* * convert aggregate map size to the zero origin dmapctl level of the @@ -145,27 +145,27 @@ static __inline signed char TREEMAX(signed char *cp) * dmaptree must be consistent with dmapctl. 
*/ struct dmaptree { - __le32 nleafs; /* 4: number of tree leafs */ - __le32 l2nleafs; /* 4: l2 number of tree leafs */ - __le32 leafidx; /* 4: index of first tree leaf */ - __le32 height; /* 4: height of the tree */ + __le32 nleafs; /* 4: number of tree leafs */ + __le32 l2nleafs; /* 4: l2 number of tree leafs */ + __le32 leafidx; /* 4: index of first tree leaf */ + __le32 height; /* 4: height of the tree */ s8 budmin; /* 1: min l2 tree leaf value to combine */ - s8 stree[TREESIZE]; /* TREESIZE: tree */ - u8 pad[2]; /* 2: pad to word boundary */ -}; /* - 360 - */ + s8 stree[TREESIZE]; /* TREESIZE: tree */ + u8 pad[2]; /* 2: pad to word boundary */ +}; /* - 360 - */ /* * dmap page per 8K blocks bitmap */ struct dmap { - __le32 nblocks; /* 4: num blks covered by this dmap */ - __le32 nfree; /* 4: num of free blks in this dmap */ - __le64 start; /* 8: starting blkno for this dmap */ - struct dmaptree tree; /* 360: dmap tree */ - u8 pad[1672]; /* 1672: pad to 2048 bytes */ - __le32 wmap[LPERDMAP]; /* 1024: bits of the working map */ - __le32 pmap[LPERDMAP]; /* 1024: bits of the persistent map */ -}; /* - 4096 - */ + __le32 nblocks; /* 4: num blks covered by this dmap */ + __le32 nfree; /* 4: num of free blks in this dmap */ + __le64 start; /* 8: starting blkno for this dmap */ + struct dmaptree tree; /* 360: dmap tree */ + u8 pad[1672]; /* 1672: pad to 2048 bytes */ + __le32 wmap[LPERDMAP]; /* 1024: bits of the working map */ + __le32 pmap[LPERDMAP]; /* 1024: bits of the persistent map */ +}; /* - 4096 - */ /* * disk map control page per level. @@ -173,14 +173,14 @@ struct dmap { * dmapctl must be consistent with dmaptree. */ struct dmapctl { - __le32 nleafs; /* 4: number of tree leafs */ - __le32 l2nleafs; /* 4: l2 number of tree leafs */ - __le32 leafidx; /* 4: index of the first tree leaf */ - __le32 height; /* 4: height of tree */ - s8 budmin; /* 1: minimum l2 tree leaf value */ - s8 stree[CTLTREESIZE]; /* CTLTREESIZE: dmapctl tree */ - u8 pad[2714]; /* 2714: pad to 4096 */ -}; /* - 4096 - */ + __le32 nleafs; /* 4: number of tree leafs */ + __le32 l2nleafs; /* 4: l2 number of tree leafs */ + __le32 leafidx; /* 4: index of the first tree leaf */ + __le32 height; /* 4: height of tree */ + s8 budmin; /* 1: minimum l2 tree leaf value */ + s8 stree[CTLTREESIZE]; /* CTLTREESIZE: dmapctl tree */ + u8 pad[2714]; /* 2714: pad to 4096 */ +}; /* - 4096 - */ /* * common definition for dmaptree within dmap and dmapctl @@ -202,41 +202,41 @@ typedef union dmtree { * on-disk aggregate disk allocation map descriptor. 
*/ struct dbmap_disk { - __le64 dn_mapsize; /* 8: number of blocks in aggregate */ - __le64 dn_nfree; /* 8: num free blks in aggregate map */ - __le32 dn_l2nbperpage; /* 4: number of blks per page */ - __le32 dn_numag; /* 4: total number of ags */ - __le32 dn_maxlevel; /* 4: number of active ags */ - __le32 dn_maxag; /* 4: max active alloc group number */ - __le32 dn_agpref; /* 4: preferred alloc group (hint) */ - __le32 dn_aglevel; /* 4: dmapctl level holding the AG */ - __le32 dn_agheigth; /* 4: height in dmapctl of the AG */ - __le32 dn_agwidth; /* 4: width in dmapctl of the AG */ - __le32 dn_agstart; /* 4: start tree index at AG height */ - __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */ - __le64 dn_agfree[MAXAG];/* 8*MAXAG: per AG free count */ - __le64 dn_agsize; /* 8: num of blks per alloc group */ - s8 dn_maxfreebud; /* 1: max free buddy system */ - u8 pad[3007]; /* 3007: pad to 4096 */ -}; /* - 4096 - */ + __le64 dn_mapsize; /* 8: number of blocks in aggregate */ + __le64 dn_nfree; /* 8: num free blks in aggregate map */ + __le32 dn_l2nbperpage; /* 4: number of blks per page */ + __le32 dn_numag; /* 4: total number of ags */ + __le32 dn_maxlevel; /* 4: number of active ags */ + __le32 dn_maxag; /* 4: max active alloc group number */ + __le32 dn_agpref; /* 4: preferred alloc group (hint) */ + __le32 dn_aglevel; /* 4: dmapctl level holding the AG */ + __le32 dn_agheigth; /* 4: height in dmapctl of the AG */ + __le32 dn_agwidth; /* 4: width in dmapctl of the AG */ + __le32 dn_agstart; /* 4: start tree index at AG height */ + __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */ + __le64 dn_agfree[MAXAG];/* 8*MAXAG: per AG free count */ + __le64 dn_agsize; /* 8: num of blks per alloc group */ + s8 dn_maxfreebud; /* 1: max free buddy system */ + u8 pad[3007]; /* 3007: pad to 4096 */ +}; /* - 4096 - */ struct dbmap { - s64 dn_mapsize; /* number of blocks in aggregate */ - s64 dn_nfree; /* num free blks in aggregate map */ - int dn_l2nbperpage; /* number of blks per page */ - int dn_numag; /* total number of ags */ - int dn_maxlevel; /* number of active ags */ - int dn_maxag; /* max active alloc group number */ - int dn_agpref; /* preferred alloc group (hint) */ - int dn_aglevel; /* dmapctl level holding the AG */ - int dn_agheigth; /* height in dmapctl of the AG */ - int dn_agwidth; /* width in dmapctl of the AG */ - int dn_agstart; /* start tree index at AG height */ - int dn_agl2size; /* l2 num of blks per alloc group */ - s64 dn_agfree[MAXAG]; /* per AG free count */ - s64 dn_agsize; /* num of blks per alloc group */ - signed char dn_maxfreebud; /* max free buddy system */ -}; /* - 4096 - */ + s64 dn_mapsize; /* number of blocks in aggregate */ + s64 dn_nfree; /* num free blks in aggregate map */ + int dn_l2nbperpage; /* number of blks per page */ + int dn_numag; /* total number of ags */ + int dn_maxlevel; /* number of active ags */ + int dn_maxag; /* max active alloc group number */ + int dn_agpref; /* preferred alloc group (hint) */ + int dn_aglevel; /* dmapctl level holding the AG */ + int dn_agheigth; /* height in dmapctl of the AG */ + int dn_agwidth; /* width in dmapctl of the AG */ + int dn_agstart; /* start tree index at AG height */ + int dn_agl2size; /* l2 num of blks per alloc group */ + s64 dn_agfree[MAXAG]; /* per AG free count */ + s64 dn_agsize; /* num of blks per alloc group */ + signed char dn_maxfreebud; /* max free buddy system */ +}; /* - 4096 - */ /* * in-memory aggregate disk allocation map descriptor. 
*/ diff --git a/trunk/fs/jfs/jfs_dtree.c b/trunk/fs/jfs/jfs_dtree.c index c14ba3cfa818..6d62f3222892 100644 --- a/trunk/fs/jfs/jfs_dtree.c +++ b/trunk/fs/jfs/jfs_dtree.c @@ -315,8 +315,8 @@ static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp, lv = &llck->lv[llck->index]; /* - * Linelock slot size is twice the size of directory table - * slot size. 512 entries per page. + * Linelock slot size is twice the size of directory table + * slot size. 512 entries per page. */ lv->offset = ((index - 2) & 511) >> 1; lv->length = 1; @@ -615,7 +615,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data, btstack->nsplit = 1; /* - * search down tree from root: + * search down tree from root: * * between two consecutive entries of and of * internal page, child page Pi contains entry with k, Ki <= K < Kj. @@ -659,7 +659,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data, } if (cmp == 0) { /* - * search hit + * search hit */ /* search hit - leaf page: * return the entry found @@ -723,7 +723,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data, } /* - * search miss + * search miss * * base is the smallest index with key (Kj) greater than * search key (K) and may be zero or (maxindex + 1) index. @@ -834,7 +834,7 @@ int dtInsert(tid_t tid, struct inode *ip, struct lv *lv; /* - * retrieve search result + * retrieve search result * * dtSearch() returns (leaf page pinned, index at which to insert). * n.b. dtSearch() may return index of (maxindex + 1) of @@ -843,7 +843,7 @@ int dtInsert(tid_t tid, struct inode *ip, DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); /* - * insert entry for new key + * insert entry for new key */ if (DO_INDEX(ip)) { if (JFS_IP(ip)->next_index == DIREND) { @@ -860,9 +860,9 @@ int dtInsert(tid_t tid, struct inode *ip, data.leaf.ino = *fsn; /* - * leaf page does not have enough room for new entry: + * leaf page does not have enough room for new entry: * - * extend/split the leaf page; + * extend/split the leaf page; * * dtSplitUp() will insert the entry and unpin the leaf page. */ @@ -877,9 +877,9 @@ int dtInsert(tid_t tid, struct inode *ip, } /* - * leaf page does have enough room for new entry: + * leaf page does have enough room for new entry: * - * insert the new data entry into the leaf page; + * insert the new data entry into the leaf page; */ BT_MARK_DIRTY(mp, ip); /* @@ -967,13 +967,13 @@ static int dtSplitUp(tid_t tid, } /* - * split leaf page + * split leaf page * * The split routines insert the new entry, and * acquire txLock as appropriate. */ /* - * split root leaf page: + * split root leaf page: */ if (sp->header.flag & BT_ROOT) { /* @@ -1012,7 +1012,7 @@ static int dtSplitUp(tid_t tid, } /* - * extend first leaf page + * extend first leaf page * * extend the 1st extent if less than buffer page size * (dtExtendPage() reurns leaf page unpinned) @@ -1068,7 +1068,7 @@ static int dtSplitUp(tid_t tid, } /* - * split leaf page into and a new right page . + * split leaf page into and a new right page . 
* * return pinned and its extent descriptor */ @@ -1433,7 +1433,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, rp->header.freecnt = rp->header.maxslot - fsi; /* - * sequential append at tail: append without split + * sequential append at tail: append without split * * If splitting the last page on a level because of appending * a entry to it (skip is maxentry), it's likely that the access is @@ -1467,7 +1467,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, } /* - * non-sequential insert (at possibly middle page) + * non-sequential insert (at possibly middle page) */ /* @@ -1508,7 +1508,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, left = 0; /* - * compute fill factor for split pages + * compute fill factor for split pages * * traces the next entry to move to rp * traces the next entry to stay in sp @@ -1551,7 +1551,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, /* poins to the 1st entry to move */ /* - * move entries to right page + * move entries to right page * * dtMoveEntry() initializes rp and reserves entry for insertion * @@ -1677,7 +1677,7 @@ static int dtExtendPage(tid_t tid, return (rc); /* - * extend the extent + * extend the extent */ pxdlist = split->pxdlist; pxd = &pxdlist->pxd[pxdlist->npxd]; @@ -1722,7 +1722,7 @@ static int dtExtendPage(tid_t tid, } /* - * extend the page + * extend the page */ sp->header.self = *pxd; @@ -1739,6 +1739,9 @@ static int dtExtendPage(tid_t tid, /* update buffer extent descriptor of extended page */ xlen = lengthPXD(pxd); xsize = xlen << JFS_SBI(sb)->l2bsize; +#ifdef _STILL_TO_PORT + bmSetXD(smp, xaddr, xsize); +#endif /* _STILL_TO_PORT */ /* * copy old stbl to new stbl at start of extended area @@ -1833,7 +1836,7 @@ static int dtExtendPage(tid_t tid, } /* - * update parent entry on the parent/root page + * update parent entry on the parent/root page */ /* * acquire a transaction lock on the parent/root page @@ -1901,7 +1904,7 @@ static int dtSplitRoot(tid_t tid, sp = &JFS_IP(ip)->i_dtroot; /* - * allocate/initialize a single (right) child page + * allocate/initialize a single (right) child page * * N.B. at first split, a one (or two) block to fit new entry * is allocated; at subsequent split, a full page is allocated; @@ -1940,7 +1943,7 @@ static int dtSplitRoot(tid_t tid, rp->header.prev = 0; /* - * move in-line root page into new right page extent + * move in-line root page into new right page extent */ /* linelock header + copied entries + new stbl (1st slot) in new page */ ASSERT(dtlck->index == 0); @@ -2013,7 +2016,7 @@ static int dtSplitRoot(tid_t tid, dtInsertEntry(rp, split->index, split->key, split->data, &dtlck); /* - * reset parent/root page + * reset parent/root page * * set the 1st entry offset to 0, which force the left-most key * at any level of the tree to be less than any search key. @@ -2099,7 +2102,7 @@ int dtDelete(tid_t tid, dtpage_t *np; /* - * search for the entry to delete: + * search for the entry to delete: * * dtSearch() returns (leaf page pinned, index at which to delete). 
*/ @@ -2250,7 +2253,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip, int i; /* - * keep the root leaf page which has become empty + * keep the root leaf page which has become empty */ if (BT_IS_ROOT(fmp)) { /* @@ -2266,7 +2269,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip, } /* - * free the non-root leaf page + * free the non-root leaf page */ /* * acquire a transaction lock on the page @@ -2296,7 +2299,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip, discard_metapage(fmp); /* - * propagate page deletion up the directory tree + * propagate page deletion up the directory tree * * If the delete from the parent page makes it empty, * continue all the way up the tree. @@ -2437,10 +2440,10 @@ static int dtDeleteUp(tid_t tid, struct inode *ip, #ifdef _NOTYET /* - * NAME: dtRelocate() + * NAME: dtRelocate() * - * FUNCTION: relocate dtpage (internal or leaf) of directory; - * This function is mainly used by defragfs utility. + * FUNCTION: relocate dtpage (internal or leaf) of directory; + * This function is mainly used by defragfs utility. */ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, s64 nxaddr) @@ -2468,8 +2471,8 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, xlen); /* - * 1. get the internal parent dtpage covering - * router entry for the tartget page to be relocated; + * 1. get the internal parent dtpage covering + * router entry for the tartget page to be relocated; */ rc = dtSearchNode(ip, lmxaddr, opxd, &btstack); if (rc) @@ -2480,7 +2483,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, jfs_info("dtRelocate: parent router entry validated."); /* - * 2. relocate the target dtpage + * 2. relocate the target dtpage */ /* read in the target page from src extent */ DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc); @@ -2578,7 +2581,9 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, /* update the buffer extent descriptor of the dtpage */ xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize; - +#ifdef _STILL_TO_PORT + bmSetXD(mp, nxaddr, xsize); +#endif /* _STILL_TO_PORT */ /* unpin the relocated page */ DT_PUTPAGE(mp); jfs_info("dtRelocate: target dtpage relocated."); @@ -2589,7 +2594,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, */ /* - * 3. acquire maplock for the source extent to be freed; + * 3. acquire maplock for the source extent to be freed; */ /* for dtpage relocation, write a LOG_NOREDOPAGE record * for the source dtpage (logredo() will init NoRedoPage @@ -2604,7 +2609,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, pxdlock->index = 1; /* - * 4. update the parent router entry for relocation; + * 4. update the parent router entry for relocation; * * acquire tlck for the parent entry covering the target dtpage; * write LOG_REDOPAGE to apply after image only; @@ -2632,7 +2637,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, * NAME: dtSearchNode() * * FUNCTION: Search for an dtpage containing a specified address - * This function is mainly used by defragfs utility. + * This function is mainly used by defragfs utility. * * NOTE: Search result on stack, the found page is pinned at exit. * The result page must be an internal dtpage. 
@@ -2655,7 +2660,7 @@ static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd, BT_CLR(btstack); /* reset stack */ /* - * descend tree to the level with specified leftmost page + * descend tree to the level with specified leftmost page * * by convention, root bn = 0. */ @@ -2694,7 +2699,7 @@ static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd, } /* - * search each page at the current levevl + * search each page at the current levevl */ loop: stbl = DT_GETSTBL(p); @@ -3039,9 +3044,9 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (DO_INDEX(ip)) { /* * persistent index is stored in directory entries. - * Special cases: 0 = . - * 1 = .. - * -1 = End of directory + * Special cases: 0 = . + * 1 = .. + * -1 = End of directory */ do_index = 1; @@ -3123,10 +3128,10 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) /* * Legacy filesystem - OS/2 & Linux JFS < 0.3.6 * - * pn = index = 0: First entry "." - * pn = 0; index = 1: Second entry ".." - * pn > 0: Real entries, pn=1 -> leftmost page - * pn = index = -1: No more entries + * pn = index = 0: First entry "." + * pn = 0; index = 1: Second entry ".." + * pn > 0: Real entries, pn=1 -> leftmost page + * pn = index = -1: No more entries */ dtpos = filp->f_pos; if (dtpos == 0) { @@ -3346,7 +3351,7 @@ static int dtReadFirst(struct inode *ip, struct btstack * btstack) BT_CLR(btstack); /* reset stack */ /* - * descend leftmost path of the tree + * descend leftmost path of the tree * * by convention, root bn = 0. */ @@ -4526,7 +4531,7 @@ int dtModify(tid_t tid, struct inode *ip, struct ldtentry *entry; /* - * search for the entry to modify: + * search for the entry to modify: * * dtSearch() returns (leaf page pinned, index at which to modify). */ diff --git a/trunk/fs/jfs/jfs_dtree.h b/trunk/fs/jfs/jfs_dtree.h index 8561c6ecece0..af8513f78648 100644 --- a/trunk/fs/jfs/jfs_dtree.h +++ b/trunk/fs/jfs/jfs_dtree.h @@ -35,7 +35,7 @@ typedef union { /* - * entry segment/slot + * entry segment/slot * * an entry consists of type dependent head/only segment/slot and * additional segments/slots linked vi next field; diff --git a/trunk/fs/jfs/jfs_extent.c b/trunk/fs/jfs/jfs_extent.c index 7ae1e3281de9..a35bdca6a805 100644 --- a/trunk/fs/jfs/jfs_extent.c +++ b/trunk/fs/jfs/jfs_extent.c @@ -34,8 +34,8 @@ static int extBrealloc(struct inode *, s64, s64, s64 *, s64 *); #endif static s64 extRoundDown(s64 nb); -#define DPD(a) (printk("(a): %d\n",(a))) -#define DPC(a) (printk("(a): %c\n",(a))) +#define DPD(a) (printk("(a): %d\n",(a))) +#define DPC(a) (printk("(a): %c\n",(a))) #define DPL1(a) \ { \ if ((a) >> 32) \ @@ -51,19 +51,19 @@ static s64 extRoundDown(s64 nb); printk("(a): %x\n",(a) << 32); \ } -#define DPD1(a) (printk("(a): %d ",(a))) -#define DPX(a) (printk("(a): %08x\n",(a))) -#define DPX1(a) (printk("(a): %08x ",(a))) -#define DPS(a) (printk("%s\n",(a))) -#define DPE(a) (printk("\nENTERING: %s\n",(a))) -#define DPE1(a) (printk("\nENTERING: %s",(a))) -#define DPS1(a) (printk(" %s ",(a))) +#define DPD1(a) (printk("(a): %d ",(a))) +#define DPX(a) (printk("(a): %08x\n",(a))) +#define DPX1(a) (printk("(a): %08x ",(a))) +#define DPS(a) (printk("%s\n",(a))) +#define DPE(a) (printk("\nENTERING: %s\n",(a))) +#define DPE1(a) (printk("\nENTERING: %s",(a))) +#define DPS1(a) (printk(" %s ",(a))) /* * NAME: extAlloc() * - * FUNCTION: allocate an extent for a specified page range within a + * FUNCTION: allocate an extent for a specified page range within a * file. 
* * PARAMETERS: @@ -78,9 +78,9 @@ static s64 extRoundDown(s64 nb); * should be marked as allocated but not recorded. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOSPC - insufficient disk resources. + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. */ int extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) @@ -192,9 +192,9 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) #ifdef _NOTYET /* - * NAME: extRealloc() + * NAME: extRealloc() * - * FUNCTION: extend the allocation of a file extent containing a + * FUNCTION: extend the allocation of a file extent containing a * partial back last page. * * PARAMETERS: @@ -207,9 +207,9 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) * should be marked as allocated but not recorded. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOSPC - insufficient disk resources. + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. */ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) { @@ -345,9 +345,9 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) /* - * NAME: extHint() + * NAME: extHint() * - * FUNCTION: produce an extent allocation hint for a file offset. + * FUNCTION: produce an extent allocation hint for a file offset. * * PARAMETERS: * ip - the inode of the file. @@ -356,8 +356,8 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) * the hint. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. + * 0 - success + * -EIO - i/o error. */ int extHint(struct inode *ip, s64 offset, xad_t * xp) { @@ -387,7 +387,7 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp) lxdl.nlxd = 1; lxdl.lxd = &lxd; LXDoffset(&lxd, prev) - LXDlength(&lxd, nbperpage); + LXDlength(&lxd, nbperpage); xadl.maxnxad = 1; xadl.nxad = 0; @@ -397,11 +397,11 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp) if ((rc = xtLookupList(ip, &lxdl, &xadl, 0))) return (rc); - /* check if no extent exists for the previous page. + /* check if not extent exists for the previous page. * this is possible for sparse files. */ if (xadl.nxad == 0) { -// assert(ISSPARSE(ip)); +// assert(ISSPARSE(ip)); return (0); } @@ -410,28 +410,28 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp) */ xp->flag &= XAD_NOTRECORDED; - if(xadl.nxad != 1 || lengthXAD(xp) != nbperpage) { + if(xadl.nxad != 1 || lengthXAD(xp) != nbperpage) { jfs_error(ip->i_sb, "extHint: corrupt xtree"); return -EIO; - } + } return (0); } /* - * NAME: extRecord() + * NAME: extRecord() * - * FUNCTION: change a page with a file from not recorded to recorded. + * FUNCTION: change a page with a file from not recorded to recorded. * * PARAMETERS: * ip - inode of the file. * cp - cbuf of the file page. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOSPC - insufficient disk resources. + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. */ int extRecord(struct inode *ip, xad_t * xp) { @@ -451,9 +451,9 @@ int extRecord(struct inode *ip, xad_t * xp) #ifdef _NOTYET /* - * NAME: extFill() + * NAME: extFill() * - * FUNCTION: allocate disk space for a file page that represents + * FUNCTION: allocate disk space for a file page that represents * a file hole. * * PARAMETERS: @@ -461,16 +461,16 @@ int extRecord(struct inode *ip, xad_t * xp) * cp - cbuf of the file page represent the hole. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOSPC - insufficient disk resources. 
+ * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. */ int extFill(struct inode *ip, xad_t * xp) { int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage; s64 blkno = offsetXAD(xp) >> ip->i_blkbits; -// assert(ISSPARSE(ip)); +// assert(ISSPARSE(ip)); /* initialize the extent allocation hint */ XADaddress(xp, 0); @@ -489,7 +489,7 @@ int extFill(struct inode *ip, xad_t * xp) /* * NAME: extBalloc() * - * FUNCTION: allocate disk blocks to form an extent. + * FUNCTION: allocate disk blocks to form an extent. * * initially, we will try to allocate disk blocks for the * requested size (nblocks). if this fails (nblocks @@ -513,9 +513,9 @@ int extFill(struct inode *ip, xad_t * xp) * allocated block range. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOSPC - insufficient disk resources. + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. */ static int extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) @@ -580,7 +580,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) /* * NAME: extBrealloc() * - * FUNCTION: attempt to extend an extent's allocation. + * FUNCTION: attempt to extend an extent's allocation. * * Initially, we will try to extend the extent's allocation * in place. If this fails, we'll try to move the extent @@ -597,8 +597,8 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) * * PARAMETERS: * ip - the inode of the file. - * blkno - starting block number of the extents current allocation. - * nblks - number of blocks within the extents current allocation. + * blkno - starting block number of the extents current allocation. + * nblks - number of blocks within the extents current allocation. * newnblks - pointer to a s64 value. on entry, this value is the * the new desired extent size (number of blocks). on * successful exit, this value is set to the extent's actual @@ -606,9 +606,9 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) * newblkno - the starting block number of the extents new allocation. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOSPC - insufficient disk resources. + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. */ static int extBrealloc(struct inode *ip, @@ -634,16 +634,16 @@ extBrealloc(struct inode *ip, /* - * NAME: extRoundDown() + * NAME: extRoundDown() * - * FUNCTION: round down a specified number of blocks to the next + * FUNCTION: round down a specified number of blocks to the next * smallest power of 2 number. * * PARAMETERS: * nb - the inode of the file. * * RETURN VALUES: - * next smallest power of 2 number. + * next smallest power of 2 number. 
*/ static s64 extRoundDown(s64 nb) { diff --git a/trunk/fs/jfs/jfs_filsys.h b/trunk/fs/jfs/jfs_filsys.h index b3f5463fbe52..38f70ac03bec 100644 --- a/trunk/fs/jfs/jfs_filsys.h +++ b/trunk/fs/jfs/jfs_filsys.h @@ -34,9 +34,9 @@ #define JFS_UNICODE 0x00000001 /* unicode name */ /* mount time flags for error handling */ -#define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */ -#define JFS_ERR_CONTINUE 0x00000004 /* continue */ -#define JFS_ERR_PANIC 0x00000008 /* panic */ +#define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */ +#define JFS_ERR_CONTINUE 0x00000004 /* continue */ +#define JFS_ERR_PANIC 0x00000008 /* panic */ /* Quota support */ #define JFS_USRQUOTA 0x00000010 @@ -83,6 +83,7 @@ /* case-insensitive name/directory support */ #define JFS_AIX 0x80000000 /* AIX support */ +/* POSIX name/directory support - Never implemented*/ /* * buffer cache configuration @@ -112,10 +113,10 @@ #define IDATASIZE 256 /* inode inline data size */ #define IXATTRSIZE 128 /* inode inline extended attribute size */ -#define XTPAGE_SIZE 4096 -#define log2_PAGESIZE 12 +#define XTPAGE_SIZE 4096 +#define log2_PAGESIZE 12 -#define IAG_SIZE 4096 +#define IAG_SIZE 4096 #define IAG_EXTENT_SIZE 4096 #define INOSPERIAG 4096 /* number of disk inodes per iag */ #define L2INOSPERIAG 12 /* l2 number of disk inodes per iag */ diff --git a/trunk/fs/jfs/jfs_imap.c b/trunk/fs/jfs/jfs_imap.c index 3870ba8b9086..c6530227cda6 100644 --- a/trunk/fs/jfs/jfs_imap.c +++ b/trunk/fs/jfs/jfs_imap.c @@ -93,21 +93,21 @@ static int copy_from_dinode(struct dinode *, struct inode *); static void copy_to_dinode(struct dinode *, struct inode *); /* - * NAME: diMount() + * NAME: diMount() * - * FUNCTION: initialize the incore inode map control structures for + * FUNCTION: initialize the incore inode map control structures for * a fileset or aggregate init time. * - * the inode map's control structure (dinomap) is - * brought in from disk and placed in virtual memory. + * the inode map's control structure (dinomap) is + * brought in from disk and placed in virtual memory. * * PARAMETERS: - * ipimap - pointer to inode map inode for the aggregate or fileset. + * ipimap - pointer to inode map inode for the aggregate or fileset. * * RETURN VALUES: - * 0 - success - * -ENOMEM - insufficient free virtual memory. - * -EIO - i/o error. + * 0 - success + * -ENOMEM - insufficient free virtual memory. + * -EIO - i/o error. */ int diMount(struct inode *ipimap) { @@ -180,18 +180,18 @@ int diMount(struct inode *ipimap) /* - * NAME: diUnmount() + * NAME: diUnmount() * - * FUNCTION: write to disk the incore inode map control structures for + * FUNCTION: write to disk the incore inode map control structures for * a fileset or aggregate at unmount time. * * PARAMETERS: - * ipimap - pointer to inode map inode for the aggregate or fileset. + * ipimap - pointer to inode map inode for the aggregate or fileset. * * RETURN VALUES: - * 0 - success - * -ENOMEM - insufficient free virtual memory. - * -EIO - i/o error. + * 0 - success + * -ENOMEM - insufficient free virtual memory. + * -EIO - i/o error. */ int diUnmount(struct inode *ipimap, int mounterror) { @@ -274,9 +274,9 @@ int diSync(struct inode *ipimap) /* - * NAME: diRead() + * NAME: diRead() * - * FUNCTION: initialize an incore inode from disk. + * FUNCTION: initialize an incore inode from disk. 
* * on entry, the specifed incore inode should itself * specify the disk inode number corresponding to the @@ -285,7 +285,7 @@ int diSync(struct inode *ipimap) * this routine handles incore inode initialization for * both "special" and "regular" inodes. special inodes * are those required early in the mount process and - * require special handling since much of the file system + * require special handling since much of the file system * is not yet initialized. these "special" inodes are * identified by a NULL inode map inode pointer and are * actually initialized by a call to diReadSpecial(). @@ -298,12 +298,12 @@ int diSync(struct inode *ipimap) * incore inode. * * PARAMETERS: - * ip - pointer to incore inode to be initialized from disk. + * ip - pointer to incore inode to be initialized from disk. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOMEM - insufficient memory + * 0 - success + * -EIO - i/o error. + * -ENOMEM - insufficient memory * */ int diRead(struct inode *ip) @@ -410,26 +410,26 @@ int diRead(struct inode *ip) /* - * NAME: diReadSpecial() + * NAME: diReadSpecial() * - * FUNCTION: initialize a 'special' inode from disk. + * FUNCTION: initialize a 'special' inode from disk. * * this routines handles aggregate level inodes. The * inode cache cannot differentiate between the * aggregate inodes and the filesystem inodes, so we * handle these here. We don't actually use the aggregate - * inode map, since these inodes are at a fixed location + * inode map, since these inodes are at a fixed location * and in some cases the aggregate inode map isn't initialized * yet. * * PARAMETERS: - * sb - filesystem superblock + * sb - filesystem superblock * inum - aggregate inode number * secondary - 1 if secondary aggregate inode table * * RETURN VALUES: - * new inode - success - * NULL - i/o error. + * new inode - success + * NULL - i/o error. */ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) { @@ -502,12 +502,12 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) } /* - * NAME: diWriteSpecial() + * NAME: diWriteSpecial() * - * FUNCTION: Write the special inode to disk + * FUNCTION: Write the special inode to disk * * PARAMETERS: - * ip - special inode + * ip - special inode * secondary - 1 if secondary aggregate inode table * * RETURN VALUES: none @@ -554,9 +554,9 @@ void diWriteSpecial(struct inode *ip, int secondary) } /* - * NAME: diFreeSpecial() + * NAME: diFreeSpecial() * - * FUNCTION: Free allocated space for special inode + * FUNCTION: Free allocated space for special inode */ void diFreeSpecial(struct inode *ip) { @@ -572,9 +572,9 @@ void diFreeSpecial(struct inode *ip) /* - * NAME: diWrite() + * NAME: diWrite() * - * FUNCTION: write the on-disk inode portion of the in-memory inode + * FUNCTION: write the on-disk inode portion of the in-memory inode * to its corresponding on-disk inode. * * on entry, the specifed incore inode should itself @@ -589,11 +589,11 @@ void diFreeSpecial(struct inode *ip) * * PARAMETERS: * tid - transacation id - * ip - pointer to incore inode to be written to the inode extent. + * ip - pointer to incore inode to be written to the inode extent. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. + * 0 - success + * -EIO - i/o error. 
*/ int diWrite(tid_t tid, struct inode *ip) { @@ -730,7 +730,7 @@ int diWrite(tid_t tid, struct inode *ip) ilinelock = (struct linelock *) & tlck->lock; /* - * regular file: 16 byte (XAD slot) granularity + * regular file: 16 byte (XAD slot) granularity */ if (type & tlckXTREE) { xtpage_t *p, *xp; @@ -755,7 +755,7 @@ int diWrite(tid_t tid, struct inode *ip) xad->flag &= ~(XAD_NEW | XAD_EXTENDED); } /* - * directory: 32 byte (directory entry slot) granularity + * directory: 32 byte (directory entry slot) granularity */ else if (type & tlckDTREE) { dtpage_t *p, *xp; @@ -800,8 +800,9 @@ int diWrite(tid_t tid, struct inode *ip) } /* - * lock/copy inode base: 128 byte slot granularity + * lock/copy inode base: 128 byte slot granularity */ +// baseDinode: lv = & dilinelock->lv[dilinelock->index]; lv->offset = dioffset >> L2INODESLOTSIZE; copy_to_dinode(dp, ip); @@ -812,6 +813,17 @@ int diWrite(tid_t tid, struct inode *ip) lv->length = 1; dilinelock->index++; +#ifdef _JFS_FASTDASD + /* + * We aren't logging changes to the DASD used in directory inodes, + * but we need to write them to disk. If we don't unmount cleanly, + * mount will recalculate the DASD used. + */ + if (S_ISDIR(ip->i_mode) + && (ip->i_ipmnt->i_mntflag & JFS_DASD_ENABLED)) + memcpy(&dp->di_DASD, &ip->i_DASD, sizeof(struct dasd)); +#endif /* _JFS_FASTDASD */ + /* release the buffer holding the updated on-disk inode. * the buffer will be later written by commit processing. */ @@ -822,9 +834,9 @@ int diWrite(tid_t tid, struct inode *ip) /* - * NAME: diFree(ip) + * NAME: diFree(ip) * - * FUNCTION: free a specified inode from the inode working map + * FUNCTION: free a specified inode from the inode working map * for a fileset or aggregate. * * if the inode to be freed represents the first (only) @@ -853,11 +865,11 @@ int diWrite(tid_t tid, struct inode *ip) * any updates and are held until all updates are complete. * * PARAMETERS: - * ip - inode to be freed. + * ip - inode to be freed. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. + * 0 - success + * -EIO - i/o error. */ int diFree(struct inode *ip) { @@ -890,8 +902,7 @@ int diFree(struct inode *ip) * the map. */ if (iagno >= imap->im_nextiag) { - print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4, - imap, 32, 0); + dump_mem("imap", imap, 32); jfs_error(ip->i_sb, "diFree: inum = %d, iagno = %d, nextiag = %d", (uint) inum, iagno, imap->im_nextiag); @@ -953,8 +964,8 @@ int diFree(struct inode *ip) return -EIO; } /* - * inode extent still has some inodes or below low water mark: - * keep the inode extent; + * inode extent still has some inodes or below low water mark: + * keep the inode extent; */ if (bitmap || imap->im_agctl[agno].numfree < 96 || @@ -1036,12 +1047,12 @@ int diFree(struct inode *ip) /* - * inode extent has become free and above low water mark: - * free the inode extent; + * inode extent has become free and above low water mark: + * free the inode extent; */ /* - * prepare to update iag list(s) (careful update step 1) + * prepare to update iag list(s) (careful update step 1) */ amp = bmp = cmp = dmp = NULL; fwd = back = -1; @@ -1141,7 +1152,7 @@ int diFree(struct inode *ip) invalidate_pxd_metapages(ip, freepxd); /* - * update iag list(s) (careful update step 2) + * update iag list(s) (careful update step 2) */ /* add the iag to the ag extent free list if this is the * first free extent for the iag. 
@@ -1327,20 +1338,20 @@ diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp) /* - * NAME: diAlloc(pip,dir,ip) + * NAME: diAlloc(pip,dir,ip) * - * FUNCTION: allocate a disk inode from the inode working map + * FUNCTION: allocate a disk inode from the inode working map * for a fileset or aggregate. * * PARAMETERS: - * pip - pointer to incore inode for the parent inode. - * dir - 'true' if the new disk inode is for a directory. - * ip - pointer to a new inode + * pip - pointer to incore inode for the parent inode. + * dir - 'true' if the new disk inode is for a directory. + * ip - pointer to a new inode * * RETURN VALUES: - * 0 - success. - * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. */ int diAlloc(struct inode *pip, bool dir, struct inode *ip) { @@ -1422,7 +1433,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) addext = (imap->im_agctl[agno].numfree < 32 && iagp->nfreeexts); /* - * try to allocate from the IAG + * try to allocate from the IAG */ /* check if the inode may be allocated from the iag * (i.e. the inode has free inodes or new extent can be added). @@ -1622,9 +1633,9 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) /* - * NAME: diAllocAG(imap,agno,dir,ip) + * NAME: diAllocAG(imap,agno,dir,ip) * - * FUNCTION: allocate a disk inode from the allocation group. + * FUNCTION: allocate a disk inode from the allocation group. * * this routine first determines if a new extent of free * inodes should be added for the allocation group, with @@ -1638,17 +1649,17 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) * PRE CONDITION: Already have the AG lock for this AG. * * PARAMETERS: - * imap - pointer to inode map control structure. - * agno - allocation group to allocate from. - * dir - 'true' if the new disk inode is for a directory. - * ip - pointer to the new inode to be filled in on successful return + * imap - pointer to inode map control structure. + * agno - allocation group to allocate from. + * dir - 'true' if the new disk inode is for a directory. + * ip - pointer to the new inode to be filled in on successful return * with the disk inode number allocated, its extent address * and the start of the ag. * * RETURN VALUES: - * 0 - success. - * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. */ static int diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip) @@ -1698,9 +1709,9 @@ diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip) /* - * NAME: diAllocAny(imap,agno,dir,iap) + * NAME: diAllocAny(imap,agno,dir,iap) * - * FUNCTION: allocate a disk inode from any other allocation group. + * FUNCTION: allocate a disk inode from any other allocation group. * * this routine is called when an allocation attempt within * the primary allocation group has failed. if attempts to @@ -1708,17 +1719,17 @@ diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip) * specified primary group. * * PARAMETERS: - * imap - pointer to inode map control structure. - * agno - primary allocation group (to avoid). - * dir - 'true' if the new disk inode is for a directory. - * ip - pointer to a new inode to be filled in on successful return + * imap - pointer to inode map control structure. + * agno - primary allocation group (to avoid). + * dir - 'true' if the new disk inode is for a directory. 
+ * ip - pointer to a new inode to be filled in on successful return * with the disk inode number allocated, its extent address * and the start of the ag. * * RETURN VALUES: - * 0 - success. - * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. */ static int diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip) @@ -1761,9 +1772,9 @@ diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip) /* - * NAME: diAllocIno(imap,agno,ip) + * NAME: diAllocIno(imap,agno,ip) * - * FUNCTION: allocate a disk inode from the allocation group's free + * FUNCTION: allocate a disk inode from the allocation group's free * inode list, returning an error if this free list is * empty (i.e. no iags on the list). * @@ -1774,16 +1785,16 @@ diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip) * PRE CONDITION: Already have AG lock for this AG. * * PARAMETERS: - * imap - pointer to inode map control structure. - * agno - allocation group. - * ip - pointer to new inode to be filled in on successful return + * imap - pointer to inode map control structure. + * agno - allocation group. + * ip - pointer to new inode to be filled in on successful return * with the disk inode number allocated, its extent address * and the start of the ag. * * RETURN VALUES: - * 0 - success. - * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. */ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) { @@ -1879,7 +1890,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) /* - * NAME: diAllocExt(imap,agno,ip) + * NAME: diAllocExt(imap,agno,ip) * * FUNCTION: add a new extent of free inodes to an iag, allocating * an inode from this extent to satisfy the current allocation @@ -1899,16 +1910,16 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) * for the purpose of satisfying this request. * * PARAMETERS: - * imap - pointer to inode map control structure. - * agno - allocation group number. - * ip - pointer to new inode to be filled in on successful return + * imap - pointer to inode map control structure. + * agno - allocation group number. + * ip - pointer to new inode to be filled in on successful return * with the disk inode number allocated, its extent address * and the start of the ag. * * RETURN VALUES: - * 0 - success. - * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. */ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) { @@ -1999,7 +2010,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) /* - * NAME: diAllocBit(imap,iagp,ino) + * NAME: diAllocBit(imap,iagp,ino) * * FUNCTION: allocate a backed inode from an iag. * @@ -2019,14 +2030,14 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) * this AG. Must have read lock on imap inode. * * PARAMETERS: - * imap - pointer to inode map control structure. - * iagp - pointer to iag. - * ino - inode number to be allocated within the iag. + * imap - pointer to inode map control structure. + * iagp - pointer to iag. + * ino - inode number to be allocated within the iag. * * RETURN VALUES: - * 0 - success. - * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * 0 - success. + * -ENOSPC - insufficient disk resources. 
+ * -EIO - i/o error. */ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino) { @@ -2133,11 +2144,11 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino) /* - * NAME: diNewExt(imap,iagp,extno) + * NAME: diNewExt(imap,iagp,extno) * - * FUNCTION: initialize a new extent of inodes for an iag, allocating - * the first inode of the extent for use for the current - * allocation request. + * FUNCTION: initialize a new extent of inodes for an iag, allocating + * the first inode of the extent for use for the current + * allocation request. * * disk resources are allocated for the new extent of inodes * and the inodes themselves are initialized to reflect their @@ -2166,14 +2177,14 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino) * this AG. Must have read lock on imap inode. * * PARAMETERS: - * imap - pointer to inode map control structure. - * iagp - pointer to iag. - * extno - extent number. + * imap - pointer to inode map control structure. + * iagp - pointer to iag. + * extno - extent number. * * RETURN VALUES: - * 0 - success. - * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. */ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) { @@ -2419,7 +2430,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) /* - * NAME: diNewIAG(imap,iagnop,agno) + * NAME: diNewIAG(imap,iagnop,agno) * * FUNCTION: allocate a new iag for an allocation group. * @@ -2432,16 +2443,16 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) * and returned to satisfy the request. * * PARAMETERS: - * imap - pointer to inode map control structure. - * iagnop - pointer to an iag number set with the number of the + * imap - pointer to inode map control structure. + * iagnop - pointer to an iag number set with the number of the * newly allocated iag upon successful return. - * agno - allocation group number. + * agno - allocation group number. * bpp - Buffer pointer to be filled in with new IAG's buffer * * RETURN VALUES: - * 0 - success. - * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. * * serialization: * AG lock held on entry/exit; @@ -2450,7 +2461,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) * * note: new iag transaction: * . synchronously write iag; - * . write log of xtree and inode of imap; + * . write log of xtree and inode of imap; * . commit; * . synchronous write of xtree (right to left, bottom to top); * . at start of logredo(): init in-memory imap with one additional iag page; @@ -2470,6 +2481,9 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) s64 xaddr = 0; s64 blkno; tid_t tid; +#ifdef _STILL_TO_PORT + xad_t xad; +#endif /* _STILL_TO_PORT */ struct inode *iplist[1]; /* pick up pointers to the inode map and mount inodes */ @@ -2660,15 +2674,15 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) } /* - * NAME: diIAGRead() + * NAME: diIAGRead() * - * FUNCTION: get the buffer for the specified iag within a fileset + * FUNCTION: get the buffer for the specified iag within a fileset * or aggregate inode map. * * PARAMETERS: - * imap - pointer to inode map control structure. - * iagno - iag number. - * bpp - point to buffer pointer to be filled in on successful + * imap - pointer to inode map control structure. 
+ * iagno - iag number. + * bpp - point to buffer pointer to be filled in on successful * exit. * * SERIALIZATION: @@ -2677,8 +2691,8 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) * the read lock is unnecessary.) * * RETURN VALUES: - * 0 - success. - * -EIO - i/o error. + * 0 - success. + * -EIO - i/o error. */ static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp) { @@ -2698,17 +2712,17 @@ static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp) } /* - * NAME: diFindFree() + * NAME: diFindFree() * - * FUNCTION: find the first free bit in a word starting at + * FUNCTION: find the first free bit in a word starting at * the specified bit position. * * PARAMETERS: - * word - word to be examined. - * start - starting bit position. + * word - word to be examined. + * start - starting bit position. * * RETURN VALUES: - * bit position of first free bit in the word or 32 if + * bit position of first free bit in the word or 32 if * no free bits were found. */ static int diFindFree(u32 word, int start) @@ -2883,7 +2897,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap) atomic_read(&imap->im_numfree)); /* - * reconstruct imap + * reconstruct imap * * coalesce contiguous k (newAGSize/oldAGSize) AGs; * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn; @@ -2899,7 +2913,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap) } /* - * process each iag page of the map. + * process each iag page of the map. * * rebuild AG Free Inode List, AG Free Inode Extent List; */ @@ -2918,7 +2932,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap) /* leave free iag in the free iag list */ if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { - release_metapage(bp); + release_metapage(bp); continue; } @@ -3049,13 +3063,13 @@ static void duplicateIXtree(struct super_block *sb, s64 blkno, } /* - * NAME: copy_from_dinode() + * NAME: copy_from_dinode() * - * FUNCTION: Copies inode info from disk inode to in-memory inode + * FUNCTION: Copies inode info from disk inode to in-memory inode * * RETURN VALUES: - * 0 - success - * -ENOMEM - insufficient memory + * 0 - success + * -ENOMEM - insufficient memory */ static int copy_from_dinode(struct dinode * dip, struct inode *ip) { @@ -3137,9 +3151,9 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip) } /* - * NAME: copy_to_dinode() + * NAME: copy_to_dinode() * - * FUNCTION: Copies inode info from in-memory inode to disk inode + * FUNCTION: Copies inode info from in-memory inode to disk inode */ static void copy_to_dinode(struct dinode * dip, struct inode *ip) { diff --git a/trunk/fs/jfs/jfs_imap.h b/trunk/fs/jfs/jfs_imap.h index 610a0e9d8941..4f9c346ed498 100644 --- a/trunk/fs/jfs/jfs_imap.h +++ b/trunk/fs/jfs/jfs_imap.h @@ -24,17 +24,17 @@ * jfs_imap.h: disk inode manager */ -#define EXTSPERIAG 128 /* number of disk inode extent per iag */ -#define IMAPBLKNO 0 /* lblkno of dinomap within inode map */ -#define SMAPSZ 4 /* number of words per summary map */ +#define EXTSPERIAG 128 /* number of disk inode extent per iag */ +#define IMAPBLKNO 0 /* lblkno of dinomap within inode map */ +#define SMAPSZ 4 /* number of words per summary map */ #define EXTSPERSUM 32 /* number of extents per summary map entry */ #define L2EXTSPERSUM 5 /* l2 number of extents per summary map */ #define PGSPERIEXT 4 /* number of 4K pages per dinode extent */ -#define MAXIAGS ((1<<20)-1) /* maximum number of iags */ -#define MAXAG 128 /* maximum number of allocation groups 
*/ +#define MAXIAGS ((1<<20)-1) /* maximum number of iags */ +#define MAXAG 128 /* maximum number of allocation groups */ -#define AMAPSIZE 512 /* bytes in the IAG allocation maps */ -#define SMAPSIZE 16 /* bytes in the IAG summary maps */ +#define AMAPSIZE 512 /* bytes in the IAG allocation maps */ +#define SMAPSIZE 16 /* bytes in the IAG summary maps */ /* convert inode number to iag number */ #define INOTOIAG(ino) ((ino) >> L2INOSPERIAG) @@ -60,31 +60,31 @@ * inode allocation group page (per 4096 inodes of an AG) */ struct iag { - __le64 agstart; /* 8: starting block of ag */ - __le32 iagnum; /* 4: inode allocation group number */ - __le32 inofreefwd; /* 4: ag inode free list forward */ - __le32 inofreeback; /* 4: ag inode free list back */ - __le32 extfreefwd; /* 4: ag inode extent free list forward */ - __le32 extfreeback; /* 4: ag inode extent free list back */ - __le32 iagfree; /* 4: iag free list */ + __le64 agstart; /* 8: starting block of ag */ + __le32 iagnum; /* 4: inode allocation group number */ + __le32 inofreefwd; /* 4: ag inode free list forward */ + __le32 inofreeback; /* 4: ag inode free list back */ + __le32 extfreefwd; /* 4: ag inode extent free list forward */ + __le32 extfreeback; /* 4: ag inode extent free list back */ + __le32 iagfree; /* 4: iag free list */ /* summary map: 1 bit per inode extent */ __le32 inosmap[SMAPSZ]; /* 16: sum map of mapwords w/ free inodes; - * note: this indicates free and backed - * inodes, if the extent is not backed the - * value will be 1. if the extent is - * backed but all inodes are being used the - * value will be 1. if the extent is - * backed but at least one of the inodes is - * free the value will be 0. + * note: this indicates free and backed + * inodes, if the extent is not backed the + * value will be 1. if the extent is + * backed but all inodes are being used the + * value will be 1. if the extent is + * backed but at least one of the inodes is + * free the value will be 0. 
*/ __le32 extsmap[SMAPSZ]; /* 16: sum map of mapwords w/ free extents */ - __le32 nfreeinos; /* 4: number of free inodes */ - __le32 nfreeexts; /* 4: number of free extents */ + __le32 nfreeinos; /* 4: number of free inodes */ + __le32 nfreeexts; /* 4: number of free extents */ /* (72) */ u8 pad[1976]; /* 1976: pad to 2048 bytes */ /* allocation bit map: 1 bit per inode (0 - free, 1 - allocated) */ - __le32 wmap[EXTSPERIAG]; /* 512: working allocation map */ + __le32 wmap[EXTSPERIAG]; /* 512: working allocation map */ __le32 pmap[EXTSPERIAG]; /* 512: persistent allocation map */ pxd_t inoext[EXTSPERIAG]; /* 1024: inode extent addresses */ }; /* (4096) */ @@ -93,44 +93,44 @@ struct iag { * per AG control information (in inode map control page) */ struct iagctl_disk { - __le32 inofree; /* 4: free inode list anchor */ - __le32 extfree; /* 4: free extent list anchor */ - __le32 numinos; /* 4: number of backed inodes */ - __le32 numfree; /* 4: number of free inodes */ + __le32 inofree; /* 4: free inode list anchor */ + __le32 extfree; /* 4: free extent list anchor */ + __le32 numinos; /* 4: number of backed inodes */ + __le32 numfree; /* 4: number of free inodes */ }; /* (16) */ struct iagctl { - int inofree; /* free inode list anchor */ - int extfree; /* free extent list anchor */ - int numinos; /* number of backed inodes */ - int numfree; /* number of free inodes */ + int inofree; /* free inode list anchor */ + int extfree; /* free extent list anchor */ + int numinos; /* number of backed inodes */ + int numfree; /* number of free inodes */ }; /* * per fileset/aggregate inode map control page */ struct dinomap_disk { - __le32 in_freeiag; /* 4: free iag list anchor */ - __le32 in_nextiag; /* 4: next free iag number */ - __le32 in_numinos; /* 4: num of backed inodes */ + __le32 in_freeiag; /* 4: free iag list anchor */ + __le32 in_nextiag; /* 4: next free iag number */ + __le32 in_numinos; /* 4: num of backed inodes */ __le32 in_numfree; /* 4: num of free backed inodes */ __le32 in_nbperiext; /* 4: num of blocks per inode extent */ - __le32 in_l2nbperiext; /* 4: l2 of in_nbperiext */ - __le32 in_diskblock; /* 4: for standalone test driver */ - __le32 in_maxag; /* 4: for standalone test driver */ - u8 pad[2016]; /* 2016: pad to 2048 */ + __le32 in_l2nbperiext; /* 4: l2 of in_nbperiext */ + __le32 in_diskblock; /* 4: for standalone test driver */ + __le32 in_maxag; /* 4: for standalone test driver */ + u8 pad[2016]; /* 2016: pad to 2048 */ struct iagctl_disk in_agctl[MAXAG]; /* 2048: AG control information */ }; /* (4096) */ struct dinomap { - int in_freeiag; /* free iag list anchor */ - int in_nextiag; /* next free iag number */ - int in_numinos; /* num of backed inodes */ - int in_numfree; /* num of free backed inodes */ + int in_freeiag; /* free iag list anchor */ + int in_nextiag; /* next free iag number */ + int in_numinos; /* num of backed inodes */ + int in_numfree; /* num of free backed inodes */ int in_nbperiext; /* num of blocks per inode extent */ - int in_l2nbperiext; /* l2 of in_nbperiext */ - int in_diskblock; /* for standalone test driver */ - int in_maxag; /* for standalone test driver */ + int in_l2nbperiext; /* l2 of in_nbperiext */ + int in_diskblock; /* for standalone test driver */ + int in_maxag; /* for standalone test driver */ struct iagctl in_agctl[MAXAG]; /* AG control information */ }; @@ -139,9 +139,9 @@ struct dinomap { */ struct inomap { struct dinomap im_imap; /* 4096: inode allocation control */ - struct inode *im_ipimap; /* 4: ptr to inode for imap */ - struct 
mutex im_freelock; /* 4: iag free list lock */ - struct mutex im_aglock[MAXAG]; /* 512: per AG locks */ + struct inode *im_ipimap; /* 4: ptr to inode for imap */ + struct mutex im_freelock; /* 4: iag free list lock */ + struct mutex im_aglock[MAXAG]; /* 512: per AG locks */ u32 *im_DBGdimap; atomic_t im_numinos; /* num of backed inodes */ atomic_t im_numfree; /* num of free backed inodes */ diff --git a/trunk/fs/jfs/jfs_incore.h b/trunk/fs/jfs/jfs_incore.h index cb8f30985ad1..8f453eff3c83 100644 --- a/trunk/fs/jfs/jfs_incore.h +++ b/trunk/fs/jfs/jfs_incore.h @@ -40,7 +40,7 @@ struct jfs_inode_info { uint mode2; /* jfs-specific mode */ uint saved_uid; /* saved for uid mount option */ uint saved_gid; /* saved for gid mount option */ - pxd_t ixpxd; /* inode extent descriptor */ + pxd_t ixpxd; /* inode extent descriptor */ dxd_t acl; /* dxd describing acl */ dxd_t ea; /* dxd describing ea */ time_t otime; /* time created */ @@ -190,7 +190,7 @@ struct jfs_sb_info { uint gengen; /* inode generation generator*/ uint inostamp; /* shows inode belongs to fileset*/ - /* Formerly in ipbmap */ + /* Formerly in ipbmap */ struct bmap *bmap; /* incore bmap descriptor */ struct nls_table *nls_tab; /* current codepage */ struct inode *direct_inode; /* metadata inode */ diff --git a/trunk/fs/jfs/jfs_logmgr.c b/trunk/fs/jfs/jfs_logmgr.c index de3e4a506dbc..44a2f33cb98d 100644 --- a/trunk/fs/jfs/jfs_logmgr.c +++ b/trunk/fs/jfs/jfs_logmgr.c @@ -244,7 +244,7 @@ int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, goto writeRecord; /* - * initialize/update page/transaction recovery lsn + * initialize/update page/transaction recovery lsn */ lsn = log->lsn; @@ -263,7 +263,7 @@ int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * initialize/update lsn of tblock of the page + * initialize/update lsn of tblock of the page * * transaction inherits oldest lsn of pages associated * with allocation/deallocation of resources (their @@ -307,7 +307,7 @@ int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, LOGSYNC_UNLOCK(log, flags); /* - * write the log record + * write the log record */ writeRecord: lsn = lmWriteRecord(log, tblk, lrd, tlck); @@ -372,7 +372,7 @@ lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, goto moveLrd; /* - * move log record data + * move log record data */ /* retrieve source meta-data page to log */ if (tlck->flag & tlckPAGELOCK) { @@ -465,7 +465,7 @@ lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * move log record descriptor + * move log record descriptor */ moveLrd: lrd->length = cpu_to_le16(len); @@ -574,7 +574,7 @@ static int lmNextPage(struct jfs_log * log) LOGGC_LOCK(log); /* - * write or queue the full page at the tail of write queue + * write or queue the full page at the tail of write queue */ /* get the tail tblk on commit queue */ if (list_empty(&log->cqueue)) @@ -625,7 +625,7 @@ static int lmNextPage(struct jfs_log * log) LOGGC_UNLOCK(log); /* - * allocate/initialize next page + * allocate/initialize next page */ /* if log wraps, the first data page of log is 2 * (0 never used, 1 is superblock). @@ -953,7 +953,7 @@ static int lmLogSync(struct jfs_log * log, int hard_sync) } /* - * forward syncpt + * forward syncpt */ /* if last sync is same as last syncpt, * invoke sync point forward processing to update sync. 
@@ -989,7 +989,7 @@ static int lmLogSync(struct jfs_log * log, int hard_sync) lsn = log->lsn; /* - * setup next syncpt trigger (SWAG) + * setup next syncpt trigger (SWAG) */ logsize = log->logsize; @@ -1000,11 +1000,11 @@ static int lmLogSync(struct jfs_log * log, int hard_sync) if (more < 2 * LOGPSIZE) { jfs_warn("\n ... Log Wrap ... Log Wrap ... Log Wrap ...\n"); /* - * log wrapping + * log wrapping * * option 1 - panic ? No.! * option 2 - shutdown file systems - * associated with log ? + * associated with log ? * option 3 - extend log ? */ /* @@ -1062,7 +1062,7 @@ void jfs_syncpt(struct jfs_log *log, int hard_sync) /* * NAME: lmLogOpen() * - * FUNCTION: open the log on first open; + * FUNCTION: open the log on first open; * insert filesystem in the active list of the log. * * PARAMETER: ipmnt - file system mount inode @@ -1113,7 +1113,7 @@ int lmLogOpen(struct super_block *sb) init_waitqueue_head(&log->syncwait); /* - * external log as separate logical volume + * external log as separate logical volume * * file systems to log may have n-to-1 relationship; */ @@ -1155,7 +1155,7 @@ int lmLogOpen(struct super_block *sb) return 0; /* - * unwind on error + * unwind on error */ shutdown: /* unwind lbmLogInit() */ list_del(&log->journal_list); @@ -1427,7 +1427,7 @@ int lmLogInit(struct jfs_log * log) return 0; /* - * unwind on error + * unwind on error */ errout30: /* release log page */ log->wqueue = NULL; @@ -1480,7 +1480,7 @@ int lmLogClose(struct super_block *sb) if (test_bit(log_INLINELOG, &log->flag)) { /* - * in-line log in host file system + * in-line log in host file system */ rc = lmLogShutdown(log); kfree(log); @@ -1504,7 +1504,7 @@ int lmLogClose(struct super_block *sb) goto out; /* - * external log as separate logical volume + * external log as separate logical volume */ list_del(&log->journal_list); bdev = log->bdev; @@ -1622,26 +1622,20 @@ void jfs_flush_journal(struct jfs_log *log, int wait) if (!list_empty(&log->synclist)) { struct logsyncblk *lp; - printk(KERN_ERR "jfs_flush_journal: synclist not empty\n"); list_for_each_entry(lp, &log->synclist, synclist) { if (lp->xflag & COMMIT_PAGE) { struct metapage *mp = (struct metapage *)lp; - print_hex_dump(KERN_ERR, "metapage: ", - DUMP_PREFIX_ADDRESS, 16, 4, - mp, sizeof(struct metapage), 0); - print_hex_dump(KERN_ERR, "page: ", - DUMP_PREFIX_ADDRESS, 16, - sizeof(long), mp->page, - sizeof(struct page), 0); - } else - print_hex_dump(KERN_ERR, "tblock:", - DUMP_PREFIX_ADDRESS, 16, 4, - lp, sizeof(struct tblock), 0); + dump_mem("orphan metapage", lp, + sizeof(struct metapage)); + dump_mem("page", mp->page, sizeof(struct page)); + } + else + dump_mem("orphan tblock", lp, + sizeof(struct tblock)); } } -#else - WARN_ON(!list_empty(&log->synclist)); #endif + //assert(list_empty(&log->synclist)); clear_bit(log_FLUSH, &log->flag); } @@ -1729,7 +1723,7 @@ int lmLogShutdown(struct jfs_log * log) * * PARAMETE: log - pointer to logs inode. * fsdev - kdev_t of filesystem. - * serial - pointer to returned log serial number + * serial - pointer to returned log serial number * activate - insert/remove device from active list. * * RETURN: 0 - success @@ -1969,7 +1963,7 @@ static void lbmfree(struct lbuf * bp) * FUNCTION: add a log buffer to the log redrive list * * PARAMETER: - * bp - log buffer + * bp - log buffer * * NOTES: * Takes log_redrive_lock. 
@@ -2060,7 +2054,7 @@ static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag, bp->l_flag = flag; /* - * insert bp at tail of write queue associated with log + * insert bp at tail of write queue associated with log * * (request is either for bp already/currently at head of queue * or new bp to be inserted at tail) @@ -2123,7 +2117,7 @@ static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag) log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize)); /* - * initiate pageout of the page + * initiate pageout of the page */ lbmStartIO(bp); } @@ -2134,7 +2128,7 @@ static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag) * * FUNCTION: Interface to DD strategy routine * - * RETURN: none + * RETURN: none * * serialization: LCACHE_LOCK() is NOT held during log i/o; */ @@ -2228,7 +2222,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error) bio_put(bio); /* - * pagein completion + * pagein completion */ if (bp->l_flag & lbmREAD) { bp->l_flag &= ~lbmREAD; @@ -2242,7 +2236,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error) } /* - * pageout completion + * pageout completion * * the bp at the head of write queue has completed pageout. * @@ -2308,7 +2302,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error) } /* - * synchronous pageout: + * synchronous pageout: * * buffer has not necessarily been removed from write queue * (e.g., synchronous write of partial-page with COMMIT): @@ -2322,7 +2316,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error) } /* - * Group Commit pageout: + * Group Commit pageout: */ else if (bp->l_flag & lbmGC) { LCACHE_UNLOCK(flags); @@ -2330,7 +2324,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error) } /* - * asynchronous pageout: + * asynchronous pageout: * * buffer must have been removed from write queue: * insert buffer at head of freelist where it can be recycled @@ -2381,7 +2375,7 @@ int jfsIOWait(void *arg) * FUNCTION: format file system log * * PARAMETERS: - * log - volume log + * log - volume log * logAddress - start address of log space in FS block * logSize - length of log space in FS block; * @@ -2413,16 +2407,16 @@ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize) npages = logSize >> sbi->l2nbperpage; /* - * log space: + * log space: * * page 0 - reserved; * page 1 - log superblock; * page 2 - log data page: A SYNC log record is written - * into this page at logform time; + * into this page at logform time; * pages 3-N - log data page: set to empty log data pages; */ /* - * init log superblock: log page 1 + * init log superblock: log page 1 */ logsuper = (struct logsuper *) bp->l_ldata; @@ -2442,7 +2436,7 @@ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize) goto exit; /* - * init pages 2 to npages-1 as log data pages: + * init pages 2 to npages-1 as log data pages: * * log page sequence number (lpsn) initialization: * @@ -2485,7 +2479,7 @@ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize) goto exit; /* - * initialize succeeding log pages: lpsn = 0, 1, ..., (N-2) + * initialize succeeding log pages: lpsn = 0, 1, ..., (N-2) */ for (lspn = 0; lspn < npages - 3; lspn++) { lp->h.page = lp->t.page = cpu_to_le32(lspn); @@ -2501,7 +2495,7 @@ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize) rc = 0; exit: /* - * finalize log + * finalize log */ /* release the buffer */ lbmFree(bp); diff --git a/trunk/fs/jfs/jfs_logmgr.h 
b/trunk/fs/jfs/jfs_logmgr.h index 1f85ef0ec045..a53fb17ea219 100644 --- a/trunk/fs/jfs/jfs_logmgr.h +++ b/trunk/fs/jfs/jfs_logmgr.h @@ -144,7 +144,7 @@ struct logpage { * * (this comment should be rewritten !) * jfs uses only "after" log records (only a single writer is allowed - * in a page, pages are written to temporary paging space if + * in a page, pages are written to temporary paging space if * if they must be written to disk before commit, and i/o is * scheduled for modified pages to their home location after * the log records containing the after values and the commit @@ -153,7 +153,7 @@ struct logpage { * * a log record consists of a data area of variable length followed by * a descriptor of fixed size LOGRDSIZE bytes. - * the data area is rounded up to an integral number of 4-bytes and + * the data area is rounded up to an integral number of 4-bytes and * must be no longer than LOGPSIZE. * the descriptor is of size of multiple of 4-bytes and aligned on a * 4-byte boundary. @@ -215,13 +215,13 @@ struct lrd { union { /* - * COMMIT: commit + * COMMIT: commit * * transaction commit: no type-dependent information; */ /* - * REDOPAGE: after-image + * REDOPAGE: after-image * * apply after-image; * @@ -236,7 +236,7 @@ struct lrd { } redopage; /* (20) */ /* - * NOREDOPAGE: the page is freed + * NOREDOPAGE: the page is freed * * do not apply after-image records which precede this record * in the log with the same page block number to this page. @@ -252,7 +252,7 @@ struct lrd { } noredopage; /* (20) */ /* - * UPDATEMAP: update block allocation map + * UPDATEMAP: update block allocation map * * either in-line PXD, * or out-of-line XADLIST; @@ -268,7 +268,7 @@ struct lrd { } updatemap; /* (20) */ /* - * NOREDOINOEXT: the inode extent is freed + * NOREDOINOEXT: the inode extent is freed * * do not apply after-image records which precede this * record in the log with the any of the 4 page block @@ -286,7 +286,7 @@ struct lrd { } noredoinoext; /* (20) */ /* - * SYNCPT: log sync point + * SYNCPT: log sync point * * replay log upto syncpt address specified; */ @@ -295,13 +295,13 @@ struct lrd { } syncpt; /* - * MOUNT: file system mount + * MOUNT: file system mount * * file system mount: no type-dependent information; */ /* - * ? FREEXTENT: free specified extent(s) + * ? FREEXTENT: free specified extent(s) * * free specified extent(s) from block allocation map * N.B.: nextents should be length of data/sizeof(xad_t) @@ -314,7 +314,7 @@ struct lrd { } freextent; /* - * ? NOREDOFILE: this file is freed + * ? NOREDOFILE: this file is freed * * do not apply records which precede this record in the log * with the same inode number. @@ -330,7 +330,7 @@ struct lrd { } noredofile; /* - * ? NEWPAGE: + * ? NEWPAGE: * * metadata type dependent */ @@ -342,7 +342,7 @@ struct lrd { } newpage; /* - * ? DUMMY: filler + * ? 
DUMMY: filler * * no type-dependent information */ diff --git a/trunk/fs/jfs/jfs_metapage.c b/trunk/fs/jfs/jfs_metapage.c index 77c7f1129dde..43d4f69afbec 100644 --- a/trunk/fs/jfs/jfs_metapage.c +++ b/trunk/fs/jfs/jfs_metapage.c @@ -472,8 +472,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) printk(KERN_ERR "JFS: bio_add_page failed unexpectedly\n"); goto skip; dump_bio: - print_hex_dump(KERN_ERR, "JFS: dump of bio: ", DUMP_PREFIX_ADDRESS, 16, - 4, bio, sizeof(*bio), 0); + dump_mem("bio", bio, sizeof(*bio)); skip: bio_put(bio); unlock_page(page); diff --git a/trunk/fs/jfs/jfs_mount.c b/trunk/fs/jfs/jfs_mount.c index 644429acb8c0..4dd479834897 100644 --- a/trunk/fs/jfs/jfs_mount.c +++ b/trunk/fs/jfs/jfs_mount.c @@ -80,7 +80,7 @@ static int logMOUNT(struct super_block *sb); */ int jfs_mount(struct super_block *sb) { - int rc = 0; /* Return code */ + int rc = 0; /* Return code */ struct jfs_sb_info *sbi = JFS_SBI(sb); struct inode *ipaimap = NULL; struct inode *ipaimap2 = NULL; @@ -169,7 +169,7 @@ int jfs_mount(struct super_block *sb) sbi->ipaimap2 = NULL; /* - * mount (the only/single) fileset + * mount (the only/single) fileset */ /* * open fileset inode allocation map (aka fileset inode) @@ -195,7 +195,7 @@ int jfs_mount(struct super_block *sb) goto out; /* - * unwind on error + * unwind on error */ errout41: /* close fileset inode allocation map inode */ diFreeSpecial(ipimap); diff --git a/trunk/fs/jfs/jfs_txnmgr.c b/trunk/fs/jfs/jfs_txnmgr.c index 7aa1f7004eaf..25430d0b0d59 100644 --- a/trunk/fs/jfs/jfs_txnmgr.c +++ b/trunk/fs/jfs/jfs_txnmgr.c @@ -18,7 +18,7 @@ */ /* - * jfs_txnmgr.c: transaction manager + * jfs_txnmgr.c: transaction manager * * notes: * transaction starts with txBegin() and ends with txCommit() @@ -60,7 +60,7 @@ #include "jfs_debug.h" /* - * transaction management structures + * transaction management structures */ static struct { int freetid; /* index of a free tid structure */ @@ -103,19 +103,19 @@ module_param(nTxLock, int, 0); MODULE_PARM_DESC(nTxLock, "Number of transaction locks (max:65536)"); -struct tblock *TxBlock; /* transaction block table */ -static int TxLockLWM; /* Low water mark for number of txLocks used */ -static int TxLockHWM; /* High water mark for number of txLocks used */ -static int TxLockVHWM; /* Very High water mark */ -struct tlock *TxLock; /* transaction lock table */ +struct tblock *TxBlock; /* transaction block table */ +static int TxLockLWM; /* Low water mark for number of txLocks used */ +static int TxLockHWM; /* High water mark for number of txLocks used */ +static int TxLockVHWM; /* Very High water mark */ +struct tlock *TxLock; /* transaction lock table */ /* - * transaction management lock + * transaction management lock */ static DEFINE_SPINLOCK(jfsTxnLock); -#define TXN_LOCK() spin_lock(&jfsTxnLock) -#define TXN_UNLOCK() spin_unlock(&jfsTxnLock) +#define TXN_LOCK() spin_lock(&jfsTxnLock) +#define TXN_UNLOCK() spin_unlock(&jfsTxnLock) #define LAZY_LOCK_INIT() spin_lock_init(&TxAnchor.LazyLock); #define LAZY_LOCK(flags) spin_lock_irqsave(&TxAnchor.LazyLock, flags) @@ -148,7 +148,7 @@ static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event) #define TXN_WAKEUP(event) wake_up_all(event) /* - * statistics + * statistics */ static struct { tid_t maxtid; /* 4: biggest tid ever used */ @@ -181,8 +181,8 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, static void LogSyncRelease(struct metapage * mp); /* - * transaction block/lock management - * 
--------------------------------- + * transaction block/lock management + * --------------------------------- */ /* @@ -227,9 +227,9 @@ static void txLockFree(lid_t lid) } /* - * NAME: txInit() + * NAME: txInit() * - * FUNCTION: initialize transaction management structures + * FUNCTION: initialize transaction management structures * * RETURN: * @@ -333,9 +333,9 @@ int txInit(void) } /* - * NAME: txExit() + * NAME: txExit() * - * FUNCTION: clean up when module is unloaded + * FUNCTION: clean up when module is unloaded */ void txExit(void) { @@ -346,12 +346,12 @@ void txExit(void) } /* - * NAME: txBegin() + * NAME: txBegin() * - * FUNCTION: start a transaction. + * FUNCTION: start a transaction. * - * PARAMETER: sb - superblock - * flag - force for nested tx; + * PARAMETER: sb - superblock + * flag - force for nested tx; * * RETURN: tid - transaction id * @@ -447,13 +447,13 @@ tid_t txBegin(struct super_block *sb, int flag) } /* - * NAME: txBeginAnon() + * NAME: txBeginAnon() * - * FUNCTION: start an anonymous transaction. + * FUNCTION: start an anonymous transaction. * Blocks if logsync or available tlocks are low to prevent * anonymous tlocks from depleting supply. * - * PARAMETER: sb - superblock + * PARAMETER: sb - superblock * * RETURN: none */ @@ -489,11 +489,11 @@ void txBeginAnon(struct super_block *sb) } /* - * txEnd() + * txEnd() * * function: free specified transaction block. * - * logsync barrier processing: + * logsync barrier processing: * * serialization: */ @@ -577,13 +577,13 @@ void txEnd(tid_t tid) } /* - * txLock() + * txLock() * * function: acquire a transaction lock on the specified * * parameter: * - * return: transaction lock id + * return: transaction lock id * * serialization: */ @@ -829,16 +829,12 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp, /* Only locks on ipimap or ipaimap should reach here */ /* assert(jfs_ip->fileset == AGGREGATE_I); */ if (jfs_ip->fileset != AGGREGATE_I) { - printk(KERN_ERR "txLock: trying to lock locked page!"); - print_hex_dump(KERN_ERR, "ip: ", DUMP_PREFIX_ADDRESS, 16, 4, - ip, sizeof(*ip), 0); - print_hex_dump(KERN_ERR, "mp: ", DUMP_PREFIX_ADDRESS, 16, 4, - mp, sizeof(*mp), 0); - print_hex_dump(KERN_ERR, "Locker's tblock: ", - DUMP_PREFIX_ADDRESS, 16, 4, tid_to_tblock(tid), - sizeof(struct tblock), 0); - print_hex_dump(KERN_ERR, "Tlock: ", DUMP_PREFIX_ADDRESS, 16, 4, - tlck, sizeof(*tlck), 0); + jfs_err("txLock: trying to lock locked page!"); + dump_mem("ip", ip, sizeof(struct inode)); + dump_mem("mp", mp, sizeof(struct metapage)); + dump_mem("Locker's tblk", tid_to_tblock(tid), + sizeof(struct tblock)); + dump_mem("Tlock", tlck, sizeof(struct tlock)); BUG(); } INCREMENT(stattx.waitlock); /* statistics */ @@ -861,17 +857,17 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp, } /* - * NAME: txRelease() + * NAME: txRelease() * - * FUNCTION: Release buffers associated with transaction locks, but don't + * FUNCTION: Release buffers associated with transaction locks, but don't * mark homeok yet. The allows other transactions to modify * buffers, but won't let them go to disk until commit record * actually gets written. * * PARAMETER: - * tblk - + * tblk - * - * RETURN: Errors from subroutines. + * RETURN: Errors from subroutines. 
*/ static void txRelease(struct tblock * tblk) { @@ -900,10 +896,10 @@ static void txRelease(struct tblock * tblk) } /* - * NAME: txUnlock() + * NAME: txUnlock() * - * FUNCTION: Initiates pageout of pages modified by tid in journalled - * objects and frees their lockwords. + * FUNCTION: Initiates pageout of pages modified by tid in journalled + * objects and frees their lockwords. */ static void txUnlock(struct tblock * tblk) { @@ -987,10 +983,10 @@ static void txUnlock(struct tblock * tblk) } /* - * txMaplock() + * txMaplock() * * function: allocate a transaction lock for freed page/entry; - * for freed page, maplock is used as xtlock/dtlock type; + * for freed page, maplock is used as xtlock/dtlock type; */ struct tlock *txMaplock(tid_t tid, struct inode *ip, int type) { @@ -1061,7 +1057,7 @@ struct tlock *txMaplock(tid_t tid, struct inode *ip, int type) } /* - * txLinelock() + * txLinelock() * * function: allocate a transaction lock for log vector list */ @@ -1096,39 +1092,39 @@ struct linelock *txLinelock(struct linelock * tlock) } /* - * transaction commit management - * ----------------------------- + * transaction commit management + * ----------------------------- */ /* - * NAME: txCommit() - * - * FUNCTION: commit the changes to the objects specified in - * clist. For journalled segments only the - * changes of the caller are committed, ie by tid. - * for non-journalled segments the data are flushed to - * disk and then the change to the disk inode and indirect - * blocks committed (so blocks newly allocated to the - * segment will be made a part of the segment atomically). - * - * all of the segments specified in clist must be in - * one file system. no more than 6 segments are needed - * to handle all unix svcs. - * - * if the i_nlink field (i.e. disk inode link count) - * is zero, and the type of inode is a regular file or - * directory, or symbolic link , the inode is truncated - * to zero length. the truncation is committed but the - * VM resources are unaffected until it is closed (see - * iput and iclose). + * NAME: txCommit() + * + * FUNCTION: commit the changes to the objects specified in + * clist. For journalled segments only the + * changes of the caller are committed, ie by tid. + * for non-journalled segments the data are flushed to + * disk and then the change to the disk inode and indirect + * blocks committed (so blocks newly allocated to the + * segment will be made a part of the segment atomically). + * + * all of the segments specified in clist must be in + * one file system. no more than 6 segments are needed + * to handle all unix svcs. + * + * if the i_nlink field (i.e. disk inode link count) + * is zero, and the type of inode is a regular file or + * directory, or symbolic link , the inode is truncated + * to zero length. the truncation is committed but the + * VM resources are unaffected until it is closed (see + * iput and iclose). * * PARAMETER: * * RETURN: * * serialization: - * on entry the inode lock on each segment is assumed - * to be held. + * on entry the inode lock on each segment is assumed + * to be held. 
* * i/o error: */ @@ -1179,7 +1175,7 @@ int txCommit(tid_t tid, /* transaction identifier */ if ((flag & (COMMIT_FORCE | COMMIT_SYNC)) == 0) tblk->xflag |= COMMIT_LAZY; /* - * prepare non-journaled objects for commit + * prepare non-journaled objects for commit * * flush data pages of non-journaled file * to prevent the file getting non-initialized disk blocks @@ -1190,7 +1186,7 @@ int txCommit(tid_t tid, /* transaction identifier */ cd.nip = nip; /* - * acquire transaction lock on (on-disk) inodes + * acquire transaction lock on (on-disk) inodes * * update on-disk inode from in-memory inode * acquiring transaction locks for AFTER records @@ -1266,7 +1262,7 @@ int txCommit(tid_t tid, /* transaction identifier */ } /* - * write log records from transaction locks + * write log records from transaction locks * * txUpdateMap() resets XAD_NEW in XAD. */ @@ -1298,7 +1294,7 @@ int txCommit(tid_t tid, /* transaction identifier */ !test_cflag(COMMIT_Nolink, tblk->u.ip))); /* - * write COMMIT log record + * write COMMIT log record */ lrd->type = cpu_to_le16(LOG_COMMIT); lrd->length = 0; @@ -1307,7 +1303,7 @@ int txCommit(tid_t tid, /* transaction identifier */ lmGroupCommit(log, tblk); /* - * - transaction is now committed - + * - transaction is now committed - */ /* @@ -1318,11 +1314,11 @@ int txCommit(tid_t tid, /* transaction identifier */ txForce(tblk); /* - * update allocation map. + * update allocation map. * * update inode allocation map and inode: * free pager lock on memory object of inode if any. - * update block allocation map. + * update block allocation map. * * txUpdateMap() resets XAD_NEW in XAD. */ @@ -1330,7 +1326,7 @@ int txCommit(tid_t tid, /* transaction identifier */ txUpdateMap(tblk); /* - * free transaction locks and pageout/free pages + * free transaction locks and pageout/free pages */ txRelease(tblk); @@ -1339,7 +1335,7 @@ int txCommit(tid_t tid, /* transaction identifier */ /* - * reset in-memory object state + * reset in-memory object state */ for (k = 0; k < cd.nip; k++) { ip = cd.iplist[k]; @@ -1362,11 +1358,11 @@ int txCommit(tid_t tid, /* transaction identifier */ } /* - * NAME: txLog() + * NAME: txLog() * - * FUNCTION: Writes AFTER log records for all lines modified - * by tid for segments specified by inodes in comdata. - * Code assumes only WRITELOCKS are recorded in lockwords. + * FUNCTION: Writes AFTER log records for all lines modified + * by tid for segments specified by inodes in comdata. + * Code assumes only WRITELOCKS are recorded in lockwords. 
* * PARAMETERS: * @@ -1425,12 +1421,12 @@ static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd) } /* - * diLog() + * diLog() * - * function: log inode tlock and format maplock to update bmap; + * function: log inode tlock and format maplock to update bmap; */ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, - struct tlock * tlck, struct commit * cd) + struct tlock * tlck, struct commit * cd) { int rc = 0; struct metapage *mp; @@ -1446,7 +1442,7 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, pxd = &lrd->log.redopage.pxd; /* - * inode after image + * inode after image */ if (tlck->type & tlckENTRY) { /* log after-image for logredo(): */ @@ -1460,7 +1456,7 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, tlck->flag |= tlckWRITEPAGE; } else if (tlck->type & tlckFREE) { /* - * free inode extent + * free inode extent * * (pages of the freed inode extent have been invalidated and * a maplock for free of the extent has been formatted at @@ -1502,7 +1498,7 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, jfs_err("diLog: UFO type tlck:0x%p", tlck); #ifdef _JFS_WIP /* - * alloc/free external EA extent + * alloc/free external EA extent * * a maplock for txUpdateMap() to update bPWMAP for alloc/free * of the extent has been formatted at txLock() time; @@ -1538,9 +1534,9 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * dataLog() + * dataLog() * - * function: log data tlock + * function: log data tlock */ static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck) @@ -1584,9 +1580,9 @@ static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * dtLog() + * dtLog() * - * function: log dtree tlock and format maplock to update bmap; + * function: log dtree tlock and format maplock to update bmap; */ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck) @@ -1607,10 +1603,10 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT); /* - * page extension via relocation: entry insertion; - * page extension in-place: entry insertion; - * new right page from page split, reinitialized in-line - * root from root page split: entry insertion; + * page extension via relocation: entry insertion; + * page extension in-place: entry insertion; + * new right page from page split, reinitialized in-line + * root from root page split: entry insertion; */ if (tlck->type & (tlckNEW | tlckEXTEND)) { /* log after-image of the new page for logredo(): @@ -1645,8 +1641,8 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * entry insertion/deletion, - * sibling page link update (old right page before split); + * entry insertion/deletion, + * sibling page link update (old right page before split); */ if (tlck->type & (tlckENTRY | tlckRELINK)) { /* log after-image for logredo(): */ @@ -1662,11 +1658,11 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * page deletion: page has been invalidated - * page relocation: source extent + * page deletion: page has been invalidated + * page relocation: source extent * - * a maplock for free of the page has been formatted - * at txLock() time); + * a maplock for free of the page has been formatted + * at txLock() time); */ if 
(tlck->type & (tlckFREE | tlckRELOCATE)) { /* log LOG_NOREDOPAGE of the deleted page for logredo() @@ -1687,9 +1683,9 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * xtLog() + * xtLog() * - * function: log xtree tlock and format maplock to update bmap; + * function: log xtree tlock and format maplock to update bmap; */ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck) @@ -1729,8 +1725,8 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, xadlock = (struct xdlistlock *) maplock; /* - * entry insertion/extension; - * sibling page link update (old right page before split); + * entry insertion/extension; + * sibling page link update (old right page before split); */ if (tlck->type & (tlckNEW | tlckGROW | tlckRELINK)) { /* log after-image for logredo(): @@ -1805,7 +1801,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * page deletion: file deletion/truncation (ref. xtTruncate()) + * page deletion: file deletion/truncation (ref. xtTruncate()) * * (page will be invalidated after log is written and bmap * is updated from the page); @@ -1912,13 +1908,13 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * page/entry truncation: file truncation (ref. xtTruncate()) + * page/entry truncation: file truncation (ref. xtTruncate()) * - * |----------+------+------+---------------| - * | | | - * | | hwm - hwm before truncation - * | next - truncation point - * lwm - lwm before truncation + * |----------+------+------+---------------| + * | | | + * | | hwm - hwm before truncation + * | next - truncation point + * lwm - lwm before truncation * header ? */ if (tlck->type & tlckTRUNCATE) { @@ -1941,7 +1937,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, twm = xtlck->twm.offset; /* - * write log records + * write log records */ /* log after-image for logredo(): * @@ -2001,7 +1997,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * format maplock(s) for txUpdateMap() to update bmap + * format maplock(s) for txUpdateMap() to update bmap */ maplock->index = 0; @@ -2073,9 +2069,9 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * mapLog() + * mapLog() * - * function: log from maplock of freed data extents; + * function: log from maplock of freed data extents; */ static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck) @@ -2085,7 +2081,7 @@ static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, pxd_t *pxd; /* - * page relocation: free the source page extent + * page relocation: free the source page extent * * a maplock for txUpdateMap() for free of the page * has been formatted at txLock() time saving the src @@ -2159,10 +2155,10 @@ static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * txEA() + * txEA() * - * function: acquire maplock for EA/ACL extents or - * set COMMIT_INLINE flag; + * function: acquire maplock for EA/ACL extents or + * set COMMIT_INLINE flag; */ void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea) { @@ -2211,10 +2207,10 @@ void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea) } /* - * txForce() + * txForce() * * function: synchronously write pages locked by transaction - * after txLog() but before txUpdateMap(); + * after txLog() but before 
txUpdateMap(); */ static void txForce(struct tblock * tblk) { @@ -2277,10 +2273,10 @@ static void txForce(struct tblock * tblk) } /* - * txUpdateMap() + * txUpdateMap() * - * function: update persistent allocation map (and working map - * if appropriate); + * function: update persistent allocation map (and working map + * if appropriate); * * parameter: */ @@ -2302,7 +2298,7 @@ static void txUpdateMap(struct tblock * tblk) /* - * update block allocation map + * update block allocation map * * update allocation state in pmap (and wmap) and * update lsn of the pmap page; @@ -2386,7 +2382,7 @@ static void txUpdateMap(struct tblock * tblk) } } /* - * update inode allocation map + * update inode allocation map * * update allocation state in pmap and * update lsn of the pmap page; @@ -2411,24 +2407,24 @@ static void txUpdateMap(struct tblock * tblk) } /* - * txAllocPMap() + * txAllocPMap() * * function: allocate from persistent map; * * parameter: - * ipbmap - - * malock - - * xad list: - * pxd: - * - * maptype - - * allocate from persistent map; - * free from persistent map; - * (e.g., tmp file - free from working map at releae - * of last reference); - * free from persistent and working map; - * - * lsn - log sequence number; + * ipbmap - + * malock - + * xad list: + * pxd: + * + * maptype - + * allocate from persistent map; + * free from persistent map; + * (e.g., tmp file - free from working map at releae + * of last reference); + * free from persistent and working map; + * + * lsn - log sequence number; */ static void txAllocPMap(struct inode *ip, struct maplock * maplock, struct tblock * tblk) @@ -2482,9 +2478,9 @@ static void txAllocPMap(struct inode *ip, struct maplock * maplock, } /* - * txFreeMap() + * txFreeMap() * - * function: free from persistent and/or working map; + * function: free from persistent and/or working map; * * todo: optimization */ @@ -2583,9 +2579,9 @@ void txFreeMap(struct inode *ip, } /* - * txFreelock() + * txFreelock() * - * function: remove tlock from inode anonymous locklist + * function: remove tlock from inode anonymous locklist */ void txFreelock(struct inode *ip) { @@ -2623,7 +2619,7 @@ void txFreelock(struct inode *ip) } /* - * txAbort() + * txAbort() * * function: abort tx before commit; * @@ -2683,7 +2679,7 @@ void txAbort(tid_t tid, int dirty) } /* - * txLazyCommit(void) + * txLazyCommit(void) * * All transactions except those changing ipimap (COMMIT_FORCE) are * processed by this routine. This insures that the inode and block @@ -2732,7 +2728,7 @@ static void txLazyCommit(struct tblock * tblk) } /* - * jfs_lazycommit(void) + * jfs_lazycommit(void) * * To be run as a kernel daemon. If lbmIODone is called in an interrupt * context, or where blocking is not wanted, this routine will process @@ -2917,7 +2913,7 @@ void txResume(struct super_block *sb) } /* - * jfs_sync(void) + * jfs_sync(void) * * To be run as a kernel daemon. This is awakened when tlocks run low. 
* We write any inodes that have anonymous tlocks so they will become diff --git a/trunk/fs/jfs/jfs_txnmgr.h b/trunk/fs/jfs/jfs_txnmgr.h index ab7288937019..7863cf21afca 100644 --- a/trunk/fs/jfs/jfs_txnmgr.h +++ b/trunk/fs/jfs/jfs_txnmgr.h @@ -94,7 +94,7 @@ extern struct tblock *TxBlock; /* transaction block table */ */ struct tlock { lid_t next; /* 2: index next lockword on tid locklist - * next lockword on freelist + * next lockword on freelist */ tid_t tid; /* 2: transaction id holding lock */ diff --git a/trunk/fs/jfs/jfs_types.h b/trunk/fs/jfs/jfs_types.h index 649f9817accd..09b252958687 100644 --- a/trunk/fs/jfs/jfs_types.h +++ b/trunk/fs/jfs/jfs_types.h @@ -21,7 +21,7 @@ /* * jfs_types.h: * - * basic type/utility definitions + * basic type/utility definitions * * note: this header file must be the 1st include file * of JFS include list in all JFS .c file. @@ -54,8 +54,8 @@ struct timestruc_t { */ #define LEFTMOSTONE 0x80000000 -#define HIGHORDER 0x80000000u /* high order bit on */ -#define ONES 0xffffffffu /* all bit on */ +#define HIGHORDER 0x80000000u /* high order bit on */ +#define ONES 0xffffffffu /* all bit on */ /* * logical xd (lxd) @@ -148,7 +148,7 @@ typedef struct { #define sizeDXD(dxd) le32_to_cpu((dxd)->size) /* - * directory entry argument + * directory entry argument */ struct component_name { int namlen; @@ -160,14 +160,14 @@ struct component_name { * DASD limit information - stored in directory inode */ struct dasd { - u8 thresh; /* Alert Threshold (in percent) */ - u8 delta; /* Alert Threshold delta (in percent) */ + u8 thresh; /* Alert Threshold (in percent) */ + u8 delta; /* Alert Threshold delta (in percent) */ u8 rsrvd1; - u8 limit_hi; /* DASD limit (in logical blocks) */ - __le32 limit_lo; /* DASD limit (in logical blocks) */ + u8 limit_hi; /* DASD limit (in logical blocks) */ + __le32 limit_lo; /* DASD limit (in logical blocks) */ u8 rsrvd2[3]; - u8 used_hi; /* DASD usage (in logical blocks) */ - __le32 used_lo; /* DASD usage (in logical blocks) */ + u8 used_hi; /* DASD usage (in logical blocks) */ + __le32 used_lo; /* DASD usage (in logical blocks) */ }; #define DASDLIMIT(dasdp) \ diff --git a/trunk/fs/jfs/jfs_umount.c b/trunk/fs/jfs/jfs_umount.c index 7971f37534a3..a386f48c73fc 100644 --- a/trunk/fs/jfs/jfs_umount.c +++ b/trunk/fs/jfs/jfs_umount.c @@ -60,7 +60,7 @@ int jfs_umount(struct super_block *sb) jfs_info("UnMount JFS: sb:0x%p", sb); /* - * update superblock and close log + * update superblock and close log * * if mounted read-write and log based recovery was enabled */ diff --git a/trunk/fs/jfs/jfs_xtree.c b/trunk/fs/jfs/jfs_xtree.c index 1543906a2e0d..acc97c46d8a4 100644 --- a/trunk/fs/jfs/jfs_xtree.c +++ b/trunk/fs/jfs/jfs_xtree.c @@ -16,7 +16,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* - * jfs_xtree.c: extent allocation descriptor B+-tree manager + * jfs_xtree.c: extent allocation descriptor B+-tree manager */ #include @@ -32,30 +32,30 @@ /* * xtree local flag */ -#define XT_INSERT 0x00000001 +#define XT_INSERT 0x00000001 /* - * xtree key/entry comparison: extent offset + * xtree key/entry comparison: extent offset * * return: - * -1: k < start of extent - * 0: start_of_extent <= k <= end_of_extent - * 1: k > end_of_extent + * -1: k < start of extent + * 0: start_of_extent <= k <= end_of_extent + * 1: k > end_of_extent */ #define XT_CMP(CMP, K, X, OFFSET64)\ {\ - OFFSET64 = offsetXAD(X);\ - (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\ - ((K) < OFFSET64) ? 
-1 : 0;\ + OFFSET64 = offsetXAD(X);\ + (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\ + ((K) < OFFSET64) ? -1 : 0;\ } /* write a xad entry */ #define XT_PUTENTRY(XAD, FLAG, OFF, LEN, ADDR)\ {\ - (XAD)->flag = (FLAG);\ - XADoffset((XAD), (OFF));\ - XADlength((XAD), (LEN));\ - XADaddress((XAD), (ADDR));\ + (XAD)->flag = (FLAG);\ + XADoffset((XAD), (OFF));\ + XADlength((XAD), (LEN));\ + XADaddress((XAD), (ADDR));\ } #define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot) @@ -76,13 +76,13 @@ MP = NULL;\ RC = -EIO;\ }\ - }\ + }\ } /* for consistency */ #define XT_PUTPAGE(MP) BT_PUTPAGE(MP) -#define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ +#define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ BT_GETSEARCH(IP, LEAF, BN, MP, xtpage_t, P, INDEX, i_xtroot) /* xtree entry parameter descriptor */ struct xtsplit { @@ -97,7 +97,7 @@ struct xtsplit { /* - * statistics + * statistics */ #ifdef CONFIG_JFS_STATISTICS static struct { @@ -136,7 +136,7 @@ static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp); #endif /* _STILL_TO_PORT */ /* - * xtLookup() + * xtLookup() * * function: map a single page into a physical extent; */ @@ -179,7 +179,7 @@ int xtLookup(struct inode *ip, s64 lstart, } /* - * compute the physical extent covering logical extent + * compute the physical extent covering logical extent * * N.B. search may have failed (e.g., hole in sparse file), * and returned the index of the next entry. @@ -220,27 +220,27 @@ int xtLookup(struct inode *ip, s64 lstart, /* - * xtLookupList() + * xtLookupList() * * function: map a single logical extent into a list of physical extent; * * parameter: - * struct inode *ip, - * struct lxdlist *lxdlist, lxd list (in) - * struct xadlist *xadlist, xad list (in/out) - * int flag) + * struct inode *ip, + * struct lxdlist *lxdlist, lxd list (in) + * struct xadlist *xadlist, xad list (in/out) + * int flag) * * coverage of lxd by xad under assumption of * . lxd's are ordered and disjoint. * . xad's are ordered and disjoint. * * return: - * 0: success + * 0: success * * note: a page being written (even a single byte) is backed fully, - * except the last page which is only backed with blocks - * required to cover the last byte; - * the extent backing a page is fully contained within an xad; + * except the last page which is only backed with blocks + * required to cover the last byte; + * the extent backing a page is fully contained within an xad; */ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist, struct xadlist * xadlist, int flag) @@ -284,7 +284,7 @@ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist, return rc; /* - * compute the physical extent covering logical extent + * compute the physical extent covering logical extent * * N.B. search may have failed (e.g., hole in sparse file), * and returned the index of the next entry. @@ -343,7 +343,7 @@ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist, if (lstart >= size) goto mapend; - /* compare with the current xad */ + /* compare with the current xad */ goto compare1; } /* lxd is covered by xad */ @@ -430,7 +430,7 @@ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist, /* * lxd is partially covered by xad */ - else { /* (xend < lend) */ + else { /* (xend < lend) */ /* * get next xad @@ -477,22 +477,22 @@ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist, /* - * xtSearch() + * xtSearch() * - * function: search for the xad entry covering specified offset. + * function: search for the xad entry covering specified offset. 
* * parameters: - * ip - file object; - * xoff - extent offset; - * nextp - address of next extent (if any) for search miss - * cmpp - comparison result: - * btstack - traverse stack; - * flag - search process flag (XT_INSERT); + * ip - file object; + * xoff - extent offset; + * nextp - address of next extent (if any) for search miss + * cmpp - comparison result: + * btstack - traverse stack; + * flag - search process flag (XT_INSERT); * * returns: - * btstack contains (bn, index) of search path traversed to the entry. - * *cmpp is set to result of comparison with the entry returned. - * the page containing the entry is pinned at exit. + * btstack contains (bn, index) of search path traversed to the entry. + * *cmpp is set to result of comparison with the entry returned. + * the page containing the entry is pinned at exit. */ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, int *cmpp, struct btstack * btstack, int flag) @@ -517,7 +517,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, btstack->nsplit = 0; /* - * search down tree from root: + * search down tree from root: * * between two consecutive entries of and of * internal page, child page Pi contains entry with k, Ki <= K < Kj. @@ -642,7 +642,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, XT_CMP(cmp, xoff, &p->xad[index], t64); if (cmp == 0) { /* - * search hit + * search hit */ /* search hit - leaf page: * return the entry found @@ -692,7 +692,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, } /* - * search miss + * search miss * * base is the smallest index with key (Kj) greater than * search key (K) and may be zero or maxentry index. @@ -773,22 +773,22 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, } /* - * xtInsert() + * xtInsert() * * function: * * parameter: - * tid - transaction id; - * ip - file object; - * xflag - extent flag (XAD_NOTRECORDED): - * xoff - extent offset; - * xlen - extent length; - * xaddrp - extent address pointer (in/out): - * if (*xaddrp) - * caller allocated data extent at *xaddrp; - * else - * allocate data extent and return its xaddr; - * flag - + * tid - transaction id; + * ip - file object; + * xflag - extent flag (XAD_NOTRECORDED): + * xoff - extent offset; + * xlen - extent length; + * xaddrp - extent address pointer (in/out): + * if (*xaddrp) + * caller allocated data extent at *xaddrp; + * else + * allocate data extent and return its xaddr; + * flag - * * return: */ @@ -813,7 +813,7 @@ int xtInsert(tid_t tid, /* transaction id */ jfs_info("xtInsert: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen); /* - * search for the entry location at which to insert: + * search for the entry location at which to insert: * * xtFastSearch() and xtSearch() both returns (leaf page * pinned, index at which to insert). @@ -853,13 +853,13 @@ int xtInsert(tid_t tid, /* transaction id */ } /* - * insert entry for new extent + * insert entry for new extent */ xflag |= XAD_NEW; /* - * if the leaf page is full, split the page and - * propagate up the router entry for the new page from split + * if the leaf page is full, split the page and + * propagate up the router entry for the new page from split * * The xtSplitUp() will insert the entry and unpin the leaf page. 
*/ @@ -886,7 +886,7 @@ int xtInsert(tid_t tid, /* transaction id */ } /* - * insert the new entry into the leaf page + * insert the new entry into the leaf page */ /* * acquire a transaction lock on the leaf page; @@ -930,16 +930,16 @@ int xtInsert(tid_t tid, /* transaction id */ /* - * xtSplitUp() + * xtSplitUp() * * function: - * split full pages as propagating insertion up the tree + * split full pages as propagating insertion up the tree * * parameter: - * tid - transaction id; - * ip - file object; - * split - entry parameter descriptor; - * btstack - traverse stack from xtSearch() + * tid - transaction id; + * ip - file object; + * split - entry parameter descriptor; + * btstack - traverse stack from xtSearch() * * return: */ @@ -1199,22 +1199,22 @@ xtSplitUp(tid_t tid, /* - * xtSplitPage() + * xtSplitPage() * * function: - * split a full non-root page into - * original/split/left page and new right page - * i.e., the original/split page remains as left page. + * split a full non-root page into + * original/split/left page and new right page + * i.e., the original/split page remains as left page. * * parameter: - * int tid, - * struct inode *ip, - * struct xtsplit *split, - * struct metapage **rmpp, - * u64 *rbnp, + * int tid, + * struct inode *ip, + * struct xtsplit *split, + * struct metapage **rmpp, + * u64 *rbnp, * * return: - * Pointer to page in which to insert or NULL on error. + * Pointer to page in which to insert or NULL on error. */ static int xtSplitPage(tid_t tid, struct inode *ip, @@ -1248,9 +1248,9 @@ xtSplitPage(tid_t tid, struct inode *ip, rbn = addressPXD(pxd); /* Allocate blocks to quota. */ - if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) { - rc = -EDQUOT; - goto clean_up; + if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) { + rc = -EDQUOT; + goto clean_up; } quota_allocation += lengthPXD(pxd); @@ -1304,7 +1304,7 @@ xtSplitPage(tid_t tid, struct inode *ip, skip = split->index; /* - * sequential append at tail (after last entry of last page) + * sequential append at tail (after last entry of last page) * * if splitting the last page on a level because of appending * a entry to it (skip is maxentry), it's likely that the access is @@ -1342,7 +1342,7 @@ xtSplitPage(tid_t tid, struct inode *ip, } /* - * non-sequential insert (at possibly middle page) + * non-sequential insert (at possibly middle page) */ /* @@ -1465,24 +1465,25 @@ xtSplitPage(tid_t tid, struct inode *ip, /* - * xtSplitRoot() + * xtSplitRoot() * * function: - * split the full root page into original/root/split page and new - * right page - * i.e., root remains fixed in tree anchor (inode) and the root is - * copied to a single new right child page since root page << - * non-root page, and the split root page contains a single entry - * for the new right child page. + * split the full root page into + * original/root/split page and new right page + * i.e., root remains fixed in tree anchor (inode) and + * the root is copied to a single new right child page + * since root page << non-root page, and + * the split root page contains a single entry for the + * new right child page. * * parameter: - * int tid, - * struct inode *ip, - * struct xtsplit *split, - * struct metapage **rmpp) + * int tid, + * struct inode *ip, + * struct xtsplit *split, + * struct metapage **rmpp) * * return: - * Pointer to page in which to insert or NULL on error. + * Pointer to page in which to insert or NULL on error. 
*/ static int xtSplitRoot(tid_t tid, @@ -1504,7 +1505,7 @@ xtSplitRoot(tid_t tid, INCREMENT(xtStat.split); /* - * allocate a single (right) child page + * allocate a single (right) child page */ pxdlist = split->pxdlist; pxd = &pxdlist->pxd[pxdlist->npxd]; @@ -1572,7 +1573,7 @@ xtSplitRoot(tid_t tid, } /* - * reset the root + * reset the root * * init root with the single entry for the new right page * set the 1st entry offset to 0, which force the left-most key @@ -1609,7 +1610,7 @@ xtSplitRoot(tid_t tid, /* - * xtExtend() + * xtExtend() * * function: extend in-place; * @@ -1676,7 +1677,7 @@ int xtExtend(tid_t tid, /* transaction id */ goto extendOld; /* - * extent overflow: insert entry for new extent + * extent overflow: insert entry for new extent */ //insertNew: xoff = offsetXAD(xad) + MAXXLEN; @@ -1684,8 +1685,8 @@ int xtExtend(tid_t tid, /* transaction id */ nextindex = le16_to_cpu(p->header.nextindex); /* - * if the leaf page is full, insert the new entry and - * propagate up the router entry for the new page from split + * if the leaf page is full, insert the new entry and + * propagate up the router entry for the new page from split * * The xtSplitUp() will insert the entry and unpin the leaf page. */ @@ -1730,7 +1731,7 @@ int xtExtend(tid_t tid, /* transaction id */ } } /* - * insert the new entry into the leaf page + * insert the new entry into the leaf page */ else { /* insert the new entry: mark the entry NEW */ @@ -1770,11 +1771,11 @@ int xtExtend(tid_t tid, /* transaction id */ #ifdef _NOTYET /* - * xtTailgate() + * xtTailgate() * * function: split existing 'tail' extent - * (split offset >= start offset of tail extent), and - * relocate and extend the split tail half; + * (split offset >= start offset of tail extent), and + * relocate and extend the split tail half; * * note: existing extent may or may not have been committed. * caller is responsible for pager buffer cache update, and @@ -1803,7 +1804,7 @@ int xtTailgate(tid_t tid, /* transaction id */ /* printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n", - (ulong)xoff, xlen, (ulong)xaddr); + (ulong)xoff, xlen, (ulong)xaddr); */ /* there must exist extent to be tailgated */ @@ -1841,18 +1842,18 @@ printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n", xad = &p->xad[index]; /* printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", - (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad)); + (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad)); */ if ((llen = xoff - offsetXAD(xad)) == 0) goto updateOld; /* - * partially replace extent: insert entry for new extent + * partially replace extent: insert entry for new extent */ //insertNew: /* - * if the leaf page is full, insert the new entry and - * propagate up the router entry for the new page from split + * if the leaf page is full, insert the new entry and + * propagate up the router entry for the new page from split * * The xtSplitUp() will insert the entry and unpin the leaf page. 
*/ @@ -1897,7 +1898,7 @@ printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", } } /* - * insert the new entry into the leaf page + * insert the new entry into the leaf page */ else { /* insert the new entry: mark the entry NEW */ @@ -1954,17 +1955,17 @@ printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", #endif /* _NOTYET */ /* - * xtUpdate() + * xtUpdate() * * function: update XAD; * - * update extent for allocated_but_not_recorded or - * compressed extent; + * update extent for allocated_but_not_recorded or + * compressed extent; * * parameter: - * nxad - new XAD; - * logical extent of the specified XAD must be completely - * contained by an existing XAD; + * nxad - new XAD; + * logical extent of the specified XAD must be completely + * contained by an existing XAD; */ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) { /* new XAD */ @@ -2415,19 +2416,19 @@ printf("xtUpdate.updateLeft.split p:0x%p\n", p); /* - * xtAppend() + * xtAppend() * * function: grow in append mode from contiguous region specified ; * * parameter: - * tid - transaction id; - * ip - file object; - * xflag - extent flag: - * xoff - extent offset; - * maxblocks - max extent length; - * xlen - extent length (in/out); - * xaddrp - extent address pointer (in/out): - * flag - + * tid - transaction id; + * ip - file object; + * xflag - extent flag: + * xoff - extent offset; + * maxblocks - max extent length; + * xlen - extent length (in/out); + * xaddrp - extent address pointer (in/out): + * flag - * * return: */ @@ -2459,7 +2460,7 @@ int xtAppend(tid_t tid, /* transaction id */ (ulong) xoff, maxblocks, xlen, (ulong) xaddr); /* - * search for the entry location at which to insert: + * search for the entry location at which to insert: * * xtFastSearch() and xtSearch() both returns (leaf page * pinned, index at which to insert). @@ -2481,13 +2482,13 @@ int xtAppend(tid_t tid, /* transaction id */ xlen = min(xlen, (int)(next - xoff)); //insert: /* - * insert entry for new extent + * insert entry for new extent */ xflag |= XAD_NEW; /* - * if the leaf page is full, split the page and - * propagate up the router entry for the new page from split + * if the leaf page is full, split the page and + * propagate up the router entry for the new page from split * * The xtSplitUp() will insert the entry and unpin the leaf page. */ @@ -2544,7 +2545,7 @@ int xtAppend(tid_t tid, /* transaction id */ return 0; /* - * insert the new entry into the leaf page + * insert the new entry into the leaf page */ insertLeaf: /* @@ -2588,17 +2589,17 @@ int xtAppend(tid_t tid, /* transaction id */ /* - TBD for defragmentaion/reorganization - * - * xtDelete() + * xtDelete() * * function: - * delete the entry with the specified key. + * delete the entry with the specified key. * - * N.B.: whole extent of the entry is assumed to be deleted. + * N.B.: whole extent of the entry is assumed to be deleted. * * parameter: * * return: - * ENOENT: if the entry is not found. + * ENOENT: if the entry is not found. 
* * exception: */ @@ -2664,10 +2665,10 @@ int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag) /* - TBD for defragmentaion/reorganization - * - * xtDeleteUp() + * xtDeleteUp() * * function: - * free empty pages as propagating deletion up the tree + * free empty pages as propagating deletion up the tree * * parameter: * @@ -2814,15 +2815,15 @@ xtDeleteUp(tid_t tid, struct inode *ip, /* - * NAME: xtRelocate() + * NAME: xtRelocate() * - * FUNCTION: relocate xtpage or data extent of regular file; - * This function is mainly used by defragfs utility. + * FUNCTION: relocate xtpage or data extent of regular file; + * This function is mainly used by defragfs utility. * - * NOTE: This routine does not have the logic to handle - * uncommitted allocated extent. The caller should call - * txCommit() to commit all the allocation before call - * this routine. + * NOTE: This routine does not have the logic to handle + * uncommitted allocated extent. The caller should call + * txCommit() to commit all the allocation before call + * this routine. */ int xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ @@ -2864,8 +2865,8 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr); /* - * 1. get and validate the parent xtpage/xad entry - * covering the source extent to be relocated; + * 1. get and validate the parent xtpage/xad entry + * covering the source extent to be relocated; */ if (xtype == DATAEXT) { /* search in leaf entry */ @@ -2909,7 +2910,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ jfs_info("xtRelocate: parent xad entry validated."); /* - * 2. relocate the extent + * 2. relocate the extent */ if (xtype == DATAEXT) { /* if the extent is allocated-but-not-recorded @@ -2922,7 +2923,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ XT_PUTPAGE(pmp); /* - * cmRelocate() + * cmRelocate() * * copy target data pages to be relocated; * @@ -2944,8 +2945,8 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ pno = offset >> CM_L2BSIZE; npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE; /* - npages = ((offset + nbytes - 1) >> CM_L2BSIZE) - - (offset >> CM_L2BSIZE) + 1; + npages = ((offset + nbytes - 1) >> CM_L2BSIZE) - + (offset >> CM_L2BSIZE) + 1; */ sxaddr = oxaddr; dxaddr = nxaddr; @@ -2980,7 +2981,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); jfs_info("xtRelocate: target data extent relocated."); - } else { /* (xtype == XTPAGE) */ + } else { /* (xtype == XTPAGE) */ /* * read in the target xtpage from the source extent; @@ -3025,14 +3026,16 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ */ if (lmp) { BT_MARK_DIRTY(lmp, ip); - tlck = txLock(tid, ip, lmp, tlckXTREE | tlckRELINK); + tlck = + txLock(tid, ip, lmp, tlckXTREE | tlckRELINK); lp->header.next = cpu_to_le64(nxaddr); XT_PUTPAGE(lmp); } if (rmp) { BT_MARK_DIRTY(rmp, ip); - tlck = txLock(tid, ip, rmp, tlckXTREE | tlckRELINK); + tlck = + txLock(tid, ip, rmp, tlckXTREE | tlckRELINK); rp->header.prev = cpu_to_le64(nxaddr); XT_PUTPAGE(rmp); } @@ -3059,7 +3062,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ * scan may be skipped by commit() and logredo(); */ BT_MARK_DIRTY(mp, ip); - /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */ + /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */ tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW); xtlck = (struct 
xtlock *) & tlck->lock; @@ -3081,7 +3084,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ } /* - * 3. acquire maplock for the source extent to be freed; + * 3. acquire maplock for the source extent to be freed; * * acquire a maplock saving the src relocated extent address; * to free of the extent at commit time; @@ -3102,7 +3105,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ * is no buffer associated with this lock since the buffer * has been redirected to the target location. */ - else /* (xtype == XTPAGE) */ + else /* (xtype == XTPAGE) */ tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE); pxdlock = (struct pxd_lock *) & tlck->lock; @@ -3112,7 +3115,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ pxdlock->index = 1; /* - * 4. update the parent xad entry for relocation; + * 4. update the parent xad entry for relocation; * * acquire tlck for the parent entry with XAD_NEW as entry * update which will write LOG_REDOPAGE and update bmap for @@ -3140,22 +3143,22 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ /* - * xtSearchNode() + * xtSearchNode() * - * function: search for the internal xad entry covering specified extent. - * This function is mainly used by defragfs utility. + * function: search for the internal xad entry covering specified extent. + * This function is mainly used by defragfs utility. * * parameters: - * ip - file object; - * xad - extent to find; - * cmpp - comparison result: - * btstack - traverse stack; - * flag - search process flag; + * ip - file object; + * xad - extent to find; + * cmpp - comparison result: + * btstack - traverse stack; + * flag - search process flag; * * returns: - * btstack contains (bn, index) of search path traversed to the entry. - * *cmpp is set to result of comparison with the entry returned. - * the page containing the entry is pinned at exit. + * btstack contains (bn, index) of search path traversed to the entry. + * *cmpp is set to result of comparison with the entry returned. + * the page containing the entry is pinned at exit. */ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ int *cmpp, struct btstack * btstack, int flag) @@ -3178,7 +3181,7 @@ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ xaddr = addressXAD(xad); /* - * search down tree from root: + * search down tree from root: * * between two consecutive entries of and of * internal page, child page Pi contains entry with k, Ki <= K < Kj. @@ -3214,7 +3217,7 @@ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ XT_CMP(cmp, xoff, &p->xad[index], t64); if (cmp == 0) { /* - * search hit + * search hit * * verify for exact match; */ @@ -3242,7 +3245,7 @@ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ } /* - * search miss - non-leaf page: + * search miss - non-leaf page: * * base is the smallest index with key (Kj) greater than * search key (K) and may be zero or maxentry index. @@ -3265,15 +3268,15 @@ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ /* - * xtRelink() + * xtRelink() * * function: - * link around a freed page. + * link around a freed page. 
* * Parameter: - * int tid, - * struct inode *ip, - * xtpage_t *p) + * int tid, + * struct inode *ip, + * xtpage_t *p) * * returns: */ @@ -3335,7 +3338,7 @@ static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p) /* - * xtInitRoot() + * xtInitRoot() * * initialize file root (inline in inode) */ @@ -3382,42 +3385,42 @@ void xtInitRoot(tid_t tid, struct inode *ip) #define MAX_TRUNCATE_LEAVES 50 /* - * xtTruncate() + * xtTruncate() * * function: - * traverse for truncation logging backward bottom up; - * terminate at the last extent entry at the current subtree - * root page covering new down size. - * truncation may occur within the last extent entry. + * traverse for truncation logging backward bottom up; + * terminate at the last extent entry at the current subtree + * root page covering new down size. + * truncation may occur within the last extent entry. * * parameter: - * int tid, - * struct inode *ip, - * s64 newsize, - * int type) {PWMAP, PMAP, WMAP; DELETE, TRUNCATE} + * int tid, + * struct inode *ip, + * s64 newsize, + * int type) {PWMAP, PMAP, WMAP; DELETE, TRUNCATE} * * return: * * note: - * PWMAP: - * 1. truncate (non-COMMIT_NOLINK file) - * by jfs_truncate() or jfs_open(O_TRUNC): - * xtree is updated; + * PWMAP: + * 1. truncate (non-COMMIT_NOLINK file) + * by jfs_truncate() or jfs_open(O_TRUNC): + * xtree is updated; * 2. truncate index table of directory when last entry removed - * map update via tlock at commit time; - * PMAP: + * map update via tlock at commit time; + * PMAP: * Call xtTruncate_pmap instead - * WMAP: - * 1. remove (free zero link count) on last reference release - * (pmap has been freed at commit zero link count); - * 2. truncate (COMMIT_NOLINK file, i.e., tmp file): - * xtree is updated; - * map update directly at truncation time; + * WMAP: + * 1. remove (free zero link count) on last reference release + * (pmap has been freed at commit zero link count); + * 2. truncate (COMMIT_NOLINK file, i.e., tmp file): + * xtree is updated; + * map update directly at truncation time; * - * if (DELETE) - * no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient); - * else if (TRUNCATE) - * must write LOG_NOREDOPAGE for deleted index page; + * if (DELETE) + * no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient); + * else if (TRUNCATE) + * must write LOG_NOREDOPAGE for deleted index page; * * pages may already have been tlocked by anonymous transactions * during file growth (i.e., write) before truncation; @@ -3490,7 +3493,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) * retained in the new sized file. * if type is PMAP, the data and index pages are NOT * freed, and the data and index blocks are NOT freed - * from working map. + * from working map. * (this will allow continued access of data/index of * temporary file (zerolink count file truncated to zero-length)). 
*/ @@ -3539,7 +3542,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) goto getChild; /* - * leaf page + * leaf page */ freed = 0; @@ -3913,7 +3916,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) } /* - * internal page: go down to child page of current entry + * internal page: go down to child page of current entry */ getChild: /* save current parent entry for the child page */ @@ -3962,7 +3965,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) /* - * xtTruncate_pmap() + * xtTruncate_pmap() * * function: * Perform truncate to zero lenghth for deleted file, leaving the @@ -3971,9 +3974,9 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) * is committed to disk. * * parameter: - * tid_t tid, - * struct inode *ip, - * s64 committed_size) + * tid_t tid, + * struct inode *ip, + * s64 committed_size) * * return: new committed size * @@ -4047,7 +4050,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) } /* - * leaf page + * leaf page */ if (++locked_leaves > MAX_TRUNCATE_LEAVES) { @@ -4059,7 +4062,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) xoff = offsetXAD(xad); xlen = lengthXAD(xad); XT_PUTPAGE(mp); - return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; + return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; } tlck = txLock(tid, ip, mp, tlckXTREE); tlck->type = tlckXTREE | tlckFREE; @@ -4096,7 +4099,8 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) */ tlck = txLock(tid, ip, mp, tlckXTREE); xtlck = (struct xtlock *) & tlck->lock; - xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1; + xtlck->hwm.offset = + le16_to_cpu(p->header.nextindex) - 1; tlck->type = tlckXTREE | tlckFREE; XT_PUTPAGE(mp); @@ -4114,7 +4118,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) else index--; /* - * internal page: go down to child page of current entry + * internal page: go down to child page of current entry */ getChild: /* save current parent entry for the child page */ diff --git a/trunk/fs/jfs/jfs_xtree.h b/trunk/fs/jfs/jfs_xtree.h index 70815c8a3d6a..164f6f2b1019 100644 --- a/trunk/fs/jfs/jfs_xtree.h +++ b/trunk/fs/jfs/jfs_xtree.h @@ -19,14 +19,14 @@ #define _H_JFS_XTREE /* - * jfs_xtree.h: extent allocation descriptor B+-tree manager + * jfs_xtree.h: extent allocation descriptor B+-tree manager */ #include "jfs_btree.h" /* - * extent allocation descriptor (xad) + * extent allocation descriptor (xad) */ typedef struct xad { unsigned flag:8; /* 1: flag */ @@ -38,30 +38,30 @@ typedef struct xad { __le32 addr2; /* 4: address in unit of fsblksize */ } xad_t; /* (16) */ -#define MAXXLEN ((1 << 24) - 1) +#define MAXXLEN ((1 << 24) - 1) -#define XTSLOTSIZE 16 -#define L2XTSLOTSIZE 4 +#define XTSLOTSIZE 16 +#define L2XTSLOTSIZE 4 /* xad_t field construction */ #define XADoffset(xad, offset64)\ {\ - (xad)->off1 = ((u64)offset64) >> 32;\ - (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\ + (xad)->off1 = ((u64)offset64) >> 32;\ + (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\ } #define XADaddress(xad, address64)\ {\ - (xad)->addr1 = ((u64)address64) >> 32;\ - (xad)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ + (xad)->addr1 = ((u64)address64) >> 32;\ + (xad)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ } -#define XADlength(xad, length32) (xad)->len = __cpu_to_le24(length32) +#define XADlength(xad, length32) (xad)->len = __cpu_to_le24(length32) /* xad_t field extraction */ #define offsetXAD(xad)\ - ( 
((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2)) + ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2)) #define addressXAD(xad)\ - ( ((s64)((xad)->addr1)) << 32 | __le32_to_cpu((xad)->addr2)) -#define lengthXAD(xad) __le24_to_cpu((xad)->len) + ( ((s64)((xad)->addr1)) << 32 | __le32_to_cpu((xad)->addr2)) +#define lengthXAD(xad) __le24_to_cpu((xad)->len) /* xad list */ struct xadlist { @@ -71,22 +71,22 @@ struct xadlist { }; /* xad_t flags */ -#define XAD_NEW 0x01 /* new */ -#define XAD_EXTENDED 0x02 /* extended */ -#define XAD_COMPRESSED 0x04 /* compressed with recorded length */ +#define XAD_NEW 0x01 /* new */ +#define XAD_EXTENDED 0x02 /* extended */ +#define XAD_COMPRESSED 0x04 /* compressed with recorded length */ #define XAD_NOTRECORDED 0x08 /* allocated but not recorded */ -#define XAD_COW 0x10 /* copy-on-write */ +#define XAD_COW 0x10 /* copy-on-write */ /* possible values for maxentry */ -#define XTROOTINITSLOT_DIR 6 -#define XTROOTINITSLOT 10 -#define XTROOTMAXSLOT 18 -#define XTPAGEMAXSLOT 256 -#define XTENTRYSTART 2 +#define XTROOTINITSLOT_DIR 6 +#define XTROOTINITSLOT 10 +#define XTROOTMAXSLOT 18 +#define XTPAGEMAXSLOT 256 +#define XTENTRYSTART 2 /* - * xtree page: + * xtree page: */ typedef union { struct xtheader { @@ -106,7 +106,7 @@ typedef union { } xtpage_t; /* - * external declaration + * external declaration */ extern int xtLookup(struct inode *ip, s64 lstart, s64 llen, int *pflag, s64 * paddr, int *plen, int flag); diff --git a/trunk/fs/jfs/namei.c b/trunk/fs/jfs/namei.c index 25161c4121e4..41c204771262 100644 --- a/trunk/fs/jfs/namei.c +++ b/trunk/fs/jfs/namei.c @@ -328,7 +328,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) * dentry - child directory dentry * * RETURN: -EINVAL - if name is . or .. - * -EINVAL - if . or .. exist but are invalid. + * -EINVAL - if . or .. exist but are invalid. * errors from subroutines * * note: @@ -517,7 +517,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) inode_dec_link_count(ip); /* - * commit zero link count object + * commit zero link count object */ if (ip->i_nlink == 0) { assert(!test_cflag(COMMIT_Nolink, ip)); @@ -596,7 +596,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) /* * NAME: commitZeroLink() * - * FUNCTION: for non-directory, called by jfs_remove(), + * FUNCTION: for non-directory, called by jfs_remove(), * truncate a regular file, directory or symbolic * link to zero length. return 0 if type is not * one of these. 
@@ -676,7 +676,7 @@ static s64 commitZeroLink(tid_t tid, struct inode *ip) /* * NAME: jfs_free_zero_link() * - * FUNCTION: for non-directory, called by iClose(), + * FUNCTION: for non-directory, called by iClose(), * free resources of a file from cache and WORKING map * for a file previously committed with zero link count * while associated with a pager object, @@ -855,12 +855,12 @@ static int jfs_link(struct dentry *old_dentry, * NAME: jfs_symlink(dip, dentry, name) * * FUNCTION: creates a symbolic link to by name - * in directory + * in directory * - * PARAMETER: dip - parent directory vnode - * dentry - dentry of symbolic link - * name - the path name of the existing object - * that will be the source of the link + * PARAMETER: dip - parent directory vnode + * dentry - dentry of symbolic link + * name - the path name of the existing object + * that will be the source of the link * * RETURN: errors from subroutines * @@ -1052,9 +1052,9 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, /* - * NAME: jfs_rename + * NAME: jfs_rename * - * FUNCTION: rename a file or directory + * FUNCTION: rename a file or directory */ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) @@ -1331,9 +1331,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* - * NAME: jfs_mknod + * NAME: jfs_mknod * - * FUNCTION: Create a special file (device) + * FUNCTION: Create a special file (device) */ static int jfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) diff --git a/trunk/fs/jfs/resize.c b/trunk/fs/jfs/resize.c index 71984ee95346..79d625f3f733 100644 --- a/trunk/fs/jfs/resize.c +++ b/trunk/fs/jfs/resize.c @@ -29,17 +29,17 @@ #include "jfs_txnmgr.h" #include "jfs_debug.h" -#define BITSPERPAGE (PSIZE << 3) -#define L2MEGABYTE 20 -#define MEGABYTE (1 << L2MEGABYTE) -#define MEGABYTE32 (MEGABYTE << 5) +#define BITSPERPAGE (PSIZE << 3) +#define L2MEGABYTE 20 +#define MEGABYTE (1 << L2MEGABYTE) +#define MEGABYTE32 (MEGABYTE << 5) /* convert block number to bmap file page number */ #define BLKTODMAPN(b)\ - (((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) + (((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) /* - * jfs_extendfs() + * jfs_extendfs() * * function: extend file system; * @@ -48,9 +48,9 @@ * workspace space * * input: - * new LVSize: in LV blocks (required) - * new LogSize: in LV blocks (optional) - * new FSSize: in LV blocks (optional) + * new LVSize: in LV blocks (required) + * new LogSize: in LV blocks (optional) + * new FSSize: in LV blocks (optional) * * new configuration: * 1. 
set new LogSize as specified or default from new LVSize; @@ -125,8 +125,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) } /* - * reconfigure LV spaces - * --------------------- + * reconfigure LV spaces + * --------------------- * * validate new size, or, if not specified, determine new size */ @@ -198,7 +198,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) log_formatted = 1; } /* - * quiesce file system + * quiesce file system * * (prepare to move the inline log and to prevent map update) * @@ -270,8 +270,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) } /* - * extend block allocation map - * --------------------------- + * extend block allocation map + * --------------------------- * * extendfs() for new extension, retry after crash recovery; * @@ -283,7 +283,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) * s_size: aggregate size in physical blocks; */ /* - * compute the new block allocation map configuration + * compute the new block allocation map configuration * * map dinode: * di_size: map file size in byte; @@ -301,7 +301,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) newNpages = BLKTODMAPN(t64) + 1; /* - * extend map from current map (WITHOUT growing mapfile) + * extend map from current map (WITHOUT growing mapfile) * * map new extension with unmapped part of the last partial * dmap page, if applicable, and extra page(s) allocated @@ -341,8 +341,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) XSize -= nblocks; /* - * grow map file to cover remaining extension - * and/or one extra dmap page for next extendfs(); + * grow map file to cover remaining extension + * and/or one extra dmap page for next extendfs(); * * allocate new map pages and its backing blocks, and * update map file xtree @@ -422,8 +422,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) dbFinalizeBmap(ipbmap); /* - * update inode allocation map - * --------------------------- + * update inode allocation map + * --------------------------- * * move iag lists from old to new iag; * agstart field is not updated for logredo() to reconstruct @@ -442,8 +442,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) } /* - * finalize - * -------- + * finalize + * -------- * * extension is committed when on-disk super block is * updated with new descriptors: logredo will recover @@ -480,7 +480,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) diFreeSpecial(ipbmap2); /* - * update superblock + * update superblock */ if ((rc = readSuper(sb, &bh))) goto error_out; @@ -530,7 +530,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) resume: /* - * resume file system transactions + * resume file system transactions */ txResume(sb); diff --git a/trunk/fs/jfs/xattr.c b/trunk/fs/jfs/xattr.c index b2375f0774b7..b753ba216450 100644 --- a/trunk/fs/jfs/xattr.c +++ b/trunk/fs/jfs/xattr.c @@ -63,9 +63,9 @@ * * On-disk: * - * FEALISTs are stored on disk using blocks allocated by dbAlloc() and - * written directly. An EA list may be in-lined in the inode if there is - * sufficient room available. + * FEALISTs are stored on disk using blocks allocated by dbAlloc() and + * written directly. An EA list may be in-lined in the inode if there is + * sufficient room available. 
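As a worked example of the BLKTODMAPN() macro in the resize.c hunk above: each dmap page maps 2^13 blocks, so (b >> 13) counts dmap pages, while the (b >> 23) and (b >> 33) terms appear to account for the interleaved level-0/level-1 summary pages and the "+ 3 + 1" for the fixed control pages at the front of the bmap file. A small userspace check of the arithmetic:

#include <stdint.h>
#include <stdio.h>

/* same arithmetic as BLKTODMAPN() in trunk/fs/jfs/resize.c */
static uint64_t blk_to_dmap_page(uint64_t b)
{
        return (b >> 13) + (b >> 23) + (b >> 33) + 3 + 1;
}

int main(void)
{
        /* block 0 falls in bmap page 4, right after the fixed control pages */
        printf("%llu\n", (unsigned long long)blk_to_dmap_page(0));
        /* block 2^23 adds 1024 dmap pages plus one intervening summary page */
        printf("%llu\n", (unsigned long long)blk_to_dmap_page(1ULL << 23));
        return 0;
}

This prints 4 and 1029 respectively.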
*/ struct ea_buffer { @@ -590,8 +590,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) size_check: if (EALIST_SIZE(ea_buf->xattr) != ea_size) { printk(KERN_ERR "ea_get: invalid extended attribute\n"); - print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, - ea_buf->xattr, ea_size, 1); + dump_mem("xattr", ea_buf->xattr, ea_size); ea_release(inode, ea_buf); rc = -EIO; goto clean_up; diff --git a/trunk/fs/proc/array.c b/trunk/fs/proc/array.c index 98e78e2f18d6..74f30e0c0381 100644 --- a/trunk/fs/proc/array.c +++ b/trunk/fs/proc/array.c @@ -165,6 +165,7 @@ static inline char * task_state(struct task_struct *p, char *buffer) rcu_read_lock(); buffer += sprintf(buffer, "State:\t%s\n" + "SleepAVG:\t%lu%%\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" @@ -172,6 +173,7 @@ static inline char * task_state(struct task_struct *p, char *buffer) "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), + (p->sleep_avg/1024)*100/(1020000000/1024), p->tgid, p->pid, pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0, @@ -310,41 +312,6 @@ int proc_pid_status(struct task_struct *task, char * buffer) return buffer - orig; } -static clock_t task_utime(struct task_struct *p) -{ - clock_t utime = cputime_to_clock_t(p->utime), - total = utime + cputime_to_clock_t(p->stime); - u64 temp; - - /* - * Use CFS's precise accounting: - */ - temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); - - if (total) { - temp *= utime; - do_div(temp, total); - } - utime = (clock_t)temp; - - return utime; -} - -static clock_t task_stime(struct task_struct *p) -{ - clock_t stime = cputime_to_clock_t(p->stime); - - /* - * Use CFS's precise accounting. (we subtract utime from - * the total, to make sure the total observed by userspace - * grows monotonically - apps rely on that): - */ - stime = nsec_to_clock_t(p->se.sum_exec_runtime) - task_utime(p); - - return stime; -} - - static int do_task_stat(struct task_struct *task, char * buffer, int whole) { unsigned long vsize, eip, esp, wchan = ~0UL; @@ -359,8 +326,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) unsigned long long start_time; unsigned long cmin_flt = 0, cmaj_flt = 0; unsigned long min_flt = 0, maj_flt = 0; - cputime_t cutime, cstime; - clock_t utime, stime; + cputime_t cutime, cstime, utime, stime; unsigned long rsslim = 0; char tcomm[sizeof(task->comm)]; unsigned long flags; @@ -378,8 +344,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) sigemptyset(&sigign); sigemptyset(&sigcatch); - cutime = cstime = cputime_zero; - utime = stime = 0; + cutime = cstime = utime = stime = cputime_zero; rcu_read_lock(); if (lock_task_sighand(task, &flags)) { @@ -405,15 +370,15 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) do { min_flt += t->min_flt; maj_flt += t->maj_flt; - utime += task_utime(t); - stime += task_stime(t); + utime = cputime_add(utime, t->utime); + stime = cputime_add(stime, t->stime); t = next_thread(t); } while (t != task); min_flt += sig->min_flt; maj_flt += sig->maj_flt; - utime += cputime_to_clock_t(sig->utime); - stime += cputime_to_clock_t(sig->stime); + utime = cputime_add(utime, sig->utime); + stime = cputime_add(stime, sig->stime); } sid = signal_session(sig); @@ -429,8 +394,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) if (!whole) { min_flt = task->min_flt; maj_flt = task->maj_flt; - utime = task_utime(task); - stime = 
task_stime(task); + utime = task->utime; + stime = task->stime; } /* scale priority and nice values from timeslices to -20..20 */ @@ -461,8 +426,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) cmin_flt, maj_flt, cmaj_flt, - utime, - stime, + cputime_to_clock_t(utime), + cputime_to_clock_t(stime), cputime_to_clock_t(cutime), cputime_to_clock_t(cstime), priority, diff --git a/trunk/fs/proc/base.c b/trunk/fs/proc/base.c index 46ea5d56e1bb..a5fa1fdafc4e 100644 --- a/trunk/fs/proc/base.c +++ b/trunk/fs/proc/base.c @@ -296,7 +296,7 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer) */ static int proc_pid_schedstat(struct task_struct *task, char *buffer) { - return sprintf(buffer, "%llu %llu %lu\n", + return sprintf(buffer, "%lu %lu %lu\n", task->sched_info.cpu_time, task->sched_info.run_delay, task->sched_info.pcnt); @@ -929,69 +929,6 @@ static const struct file_operations proc_fault_inject_operations = { }; #endif -#ifdef CONFIG_SCHED_DEBUG -/* - * Print out various scheduling related per-task fields: - */ -static int sched_show(struct seq_file *m, void *v) -{ - struct inode *inode = m->private; - struct task_struct *p; - - WARN_ON(!inode); - - p = get_proc_task(inode); - if (!p) - return -ESRCH; - proc_sched_show_task(p, m); - - put_task_struct(p); - - return 0; -} - -static ssize_t -sched_write(struct file *file, const char __user *buf, - size_t count, loff_t *offset) -{ - struct inode *inode = file->f_path.dentry->d_inode; - struct task_struct *p; - - WARN_ON(!inode); - - p = get_proc_task(inode); - if (!p) - return -ESRCH; - proc_sched_set_task(p); - - put_task_struct(p); - - return count; -} - -static int sched_open(struct inode *inode, struct file *filp) -{ - int ret; - - ret = single_open(filp, sched_show, NULL); - if (!ret) { - struct seq_file *m = filp->private_data; - - m->private = inode; - } - return ret; -} - -static const struct file_operations proc_pid_sched_operations = { - .open = sched_open, - .read = seq_read, - .write = sched_write, - .llseek = seq_lseek, - .release = seq_release, -}; - -#endif - static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; @@ -2026,9 +1963,6 @@ static const struct pid_entry tgid_base_stuff[] = { INF("environ", S_IRUSR, pid_environ), INF("auxv", S_IRUSR, pid_auxv), INF("status", S_IRUGO, pid_status), -#ifdef CONFIG_SCHED_DEBUG - REG("sched", S_IRUGO|S_IWUSR, pid_sched), -#endif INF("cmdline", S_IRUGO, pid_cmdline), INF("stat", S_IRUGO, tgid_stat), INF("statm", S_IRUGO, pid_statm), @@ -2313,9 +2247,6 @@ static const struct pid_entry tid_base_stuff[] = { INF("environ", S_IRUSR, pid_environ), INF("auxv", S_IRUSR, pid_auxv), INF("status", S_IRUGO, pid_status), -#ifdef CONFIG_SCHED_DEBUG - REG("sched", S_IRUGO|S_IWUSR, pid_sched), -#endif INF("cmdline", S_IRUGO, pid_cmdline), INF("stat", S_IRUGO, tid_stat), INF("statm", S_IRUGO, pid_statm), diff --git a/trunk/include/asm-generic/bitops/sched.h b/trunk/include/asm-generic/bitops/sched.h index 604fab7031a6..815bb0148060 100644 --- a/trunk/include/asm-generic/bitops/sched.h +++ b/trunk/include/asm-generic/bitops/sched.h @@ -6,23 +6,28 @@ /* * Every architecture must define this function. It's the fastest - * way of searching a 100-bit bitmap. It's guaranteed that at least - * one of the 100 bits is cleared. + * way of searching a 140-bit bitmap where the first 100 bits are + * unlikely to be set. It's guaranteed that at least one of the 140 + * bits is cleared. 
*/ static inline int sched_find_first_bit(const unsigned long *b) { #if BITS_PER_LONG == 64 - if (b[0]) + if (unlikely(b[0])) return __ffs(b[0]); - return __ffs(b[1]) + 64; + if (likely(b[1])) + return __ffs(b[1]) + 64; + return __ffs(b[2]) + 128; #elif BITS_PER_LONG == 32 - if (b[0]) + if (unlikely(b[0])) return __ffs(b[0]); - if (b[1]) + if (unlikely(b[1])) return __ffs(b[1]) + 32; - if (b[2]) + if (unlikely(b[2])) return __ffs(b[2]) + 64; - return __ffs(b[3]) + 96; + if (b[3]) + return __ffs(b[3]) + 96; + return __ffs(b[4]) + 128; #else #error BITS_PER_LONG not defined #endif diff --git a/trunk/include/asm-mips/mach-au1x00/au1xxx_ide.h b/trunk/include/asm-mips/mach-au1x00/au1xxx_ide.h index 4663e8b415c9..8fcae21adbd5 100644 --- a/trunk/include/asm-mips/mach-au1x00/au1xxx_ide.h +++ b/trunk/include/asm-mips/mach-au1x00/au1xxx_ide.h @@ -88,26 +88,26 @@ static const struct drive_list_entry dma_white_list [] = { /* * Hitachi */ - { "HITACHI_DK14FA-20" , NULL }, - { "HTS726060M9AT00" , NULL }, + { "HITACHI_DK14FA-20" , "ALL" }, + { "HTS726060M9AT00" , "ALL" }, /* * Maxtor */ - { "Maxtor 6E040L0" , NULL }, - { "Maxtor 6Y080P0" , NULL }, - { "Maxtor 6Y160P0" , NULL }, + { "Maxtor 6E040L0" , "ALL" }, + { "Maxtor 6Y080P0" , "ALL" }, + { "Maxtor 6Y160P0" , "ALL" }, /* * Seagate */ - { "ST3120026A" , NULL }, - { "ST320014A" , NULL }, - { "ST94011A" , NULL }, - { "ST340016A" , NULL }, + { "ST3120026A" , "ALL" }, + { "ST320014A" , "ALL" }, + { "ST94011A" , "ALL" }, + { "ST340016A" , "ALL" }, /* * Western Digital */ - { "WDC WD400UE-00HCT0" , NULL }, - { "WDC WD400JB-00JJC0" , NULL }, + { "WDC WD400UE-00HCT0" , "ALL" }, + { "WDC WD400JB-00JJC0" , "ALL" }, { NULL , NULL } }; @@ -116,9 +116,9 @@ static const struct drive_list_entry dma_black_list [] = { /* * Western Digital */ - { "WDC WD100EB-00CGH0" , NULL }, - { "WDC WD200BB-00AUA1" , NULL }, - { "WDC AC24300L" , NULL }, + { "WDC WD100EB-00CGH0" , "ALL" }, + { "WDC WD200BB-00AUA1" , "ALL" }, + { "WDC AC24300L" , "ALL" }, { NULL , NULL } }; #endif diff --git a/trunk/include/linux/eeprom_93cx6.h b/trunk/include/linux/eeprom_93cx6.h new file mode 100644 index 000000000000..d774b7778c91 --- /dev/null +++ b/trunk/include/linux/eeprom_93cx6.h @@ -0,0 +1,72 @@ +/* + Copyright (C) 2004 - 2006 rt2x00 SourceForge Project + + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the + Free Software Foundation, Inc., + 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +/* + Module: eeprom_93cx6 + Abstract: EEPROM reader datastructures for 93cx6 chipsets. + Supported chipsets: 93c46 & 93c66. + */ + +/* + * EEPROM operation defines. + */ +#define PCI_EEPROM_WIDTH_93C46 6 +#define PCI_EEPROM_WIDTH_93C66 8 +#define PCI_EEPROM_WIDTH_OPCODE 3 +#define PCI_EEPROM_WRITE_OPCODE 0x05 +#define PCI_EEPROM_READ_OPCODE 0x06 +#define PCI_EEPROM_EWDS_OPCODE 0x10 +#define PCI_EEPROM_EWEN_OPCODE 0x13 + +/** + * struct eeprom_93cx6 - control structure for setting the commands + * for reading the eeprom data. 
+ * @data: private pointer for the driver. + * @register_read(struct eeprom_93cx6 *eeprom): handler to + * read the eeprom register, this function should set all reg_* fields. + * @register_write(struct eeprom_93cx6 *eeprom): handler to + * write to the eeprom register by using all reg_* fields. + * @width: eeprom width, should be one of the PCI_EEPROM_WIDTH_* defines + * @reg_data_in: register field to indicate data input + * @reg_data_out: register field to indicate data output + * @reg_data_clock: register field to set the data clock + * @reg_chip_select: register field to set the chip select + * + * This structure is used for the communication between the driver + * and the eeprom_93cx6 handlers for reading the eeprom. + */ +struct eeprom_93cx6 { + void *data; + + void (*register_read)(struct eeprom_93cx6 *eeprom); + void (*register_write)(struct eeprom_93cx6 *eeprom); + + int width; + + char reg_data_in; + char reg_data_out; + char reg_data_clock; + char reg_chip_select; +}; + +extern void eeprom_93cx6_read(struct eeprom_93cx6 *eeprom, + const u8 word, u16 *data); +extern void eeprom_93cx6_multiread(struct eeprom_93cx6 *eeprom, + const u8 word, __le16 *data, const u16 words); diff --git a/trunk/include/linux/hardirq.h b/trunk/include/linux/hardirq.h index 8d302298a161..7803014f3a11 100644 --- a/trunk/include/linux/hardirq.h +++ b/trunk/include/linux/hardirq.h @@ -78,19 +78,6 @@ # define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) #endif -#ifdef CONFIG_PREEMPT -# define PREEMPT_CHECK_OFFSET 1 -#else -# define PREEMPT_CHECK_OFFSET 0 -#endif - -/* - * Check whether we were atomic before we did preempt_disable(): - * (used by the scheduler) - */ -#define in_atomic_preempt_off() \ - ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) - #ifdef CONFIG_PREEMPT # define preemptible() (preempt_count() == 0 && !irqs_disabled()) # define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1) diff --git a/trunk/include/linux/ide.h b/trunk/include/linux/ide.h index 19ab25804056..1e365acdd369 100644 --- a/trunk/include/linux/ide.h +++ b/trunk/include/linux/ide.h @@ -25,7 +25,6 @@ #include #include #include -#include /****************************************************************************** * IDE driver configuration options (play with these as desired): @@ -686,8 +685,6 @@ typedef struct hwif_s { u8 mwdma_mask; u8 swdma_mask; - u8 cbl; /* cable type */ - hwif_chipset_t chipset; /* sub-module for tuning.. 
*/ struct pci_dev *pci_dev; /* for pci chipsets */ @@ -738,8 +735,8 @@ typedef struct hwif_s { void (*ide_dma_clear_irq)(ide_drive_t *drive); void (*dma_host_on)(ide_drive_t *drive); void (*dma_host_off)(ide_drive_t *drive); - void (*dma_lost_irq)(ide_drive_t *drive); - void (*dma_timeout)(ide_drive_t *drive); + int (*ide_dma_lostirq)(ide_drive_t *drive); + int (*ide_dma_timeout)(ide_drive_t *drive); void (*OUTB)(u8 addr, unsigned long port); void (*OUTBSYNC)(ide_drive_t *drive, u8 addr, unsigned long port); @@ -794,6 +791,7 @@ typedef struct hwif_s { unsigned sharing_irq: 1; /* 1 = sharing irq with another hwif */ unsigned reset : 1; /* reset after probe */ unsigned autodma : 1; /* auto-attempt using DMA at boot */ + unsigned udma_four : 1; /* 1=ATA-66 capable, 0=default */ unsigned no_lba48 : 1; /* 1 = cannot do LBA48 */ unsigned no_lba48_dma : 1; /* 1 = cannot do LBA48 DMA */ unsigned auto_poll : 1; /* supports nop auto-poll */ @@ -865,7 +863,7 @@ typedef struct hwgroup_s { typedef struct ide_driver_s ide_driver_t; -extern struct mutex ide_setting_mtx; +extern struct semaphore ide_setting_sem; int set_io_32bit(ide_drive_t *, int); int set_pio_mode(ide_drive_t *, int); @@ -1306,8 +1304,8 @@ extern int __ide_dma_check(ide_drive_t *); extern int ide_dma_setup(ide_drive_t *); extern void ide_dma_start(ide_drive_t *); extern int __ide_dma_end(ide_drive_t *); -extern void ide_dma_lost_irq(ide_drive_t *); -extern void ide_dma_timeout(ide_drive_t *); +extern int __ide_dma_lostirq(ide_drive_t *); +extern int __ide_dma_timeout(ide_drive_t *); #endif /* CONFIG_BLK_DEV_IDEDMA_PCI */ #else @@ -1384,11 +1382,11 @@ extern const ide_pio_timings_t ide_pio_timings[6]; extern spinlock_t ide_lock; -extern struct mutex ide_cfg_mtx; +extern struct semaphore ide_cfg_sem; /* * Structure locking: * - * ide_cfg_mtx and ide_lock together protect changes to + * ide_cfg_sem and ide_lock together protect changes to * ide_hwif_t->{next,hwgroup} * ide_drive_t->next * diff --git a/trunk/include/linux/sched.h b/trunk/include/linux/sched.h index cfb680585ab8..693f0e6c54d4 100644 --- a/trunk/include/linux/sched.h +++ b/trunk/include/linux/sched.h @@ -34,8 +34,6 @@ #define SCHED_FIFO 1 #define SCHED_RR 2 #define SCHED_BATCH 3 -/* SCHED_ISO: reserved but not implemented yet */ -#define SCHED_IDLE 5 #ifdef __KERNEL__ @@ -132,26 +130,6 @@ extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); extern unsigned long weighted_cpuload(const int cpu); -struct seq_file; -struct cfs_rq; -#ifdef CONFIG_SCHED_DEBUG -extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); -extern void proc_sched_set_task(struct task_struct *p); -extern void -print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now); -#else -static inline void -proc_sched_show_task(struct task_struct *p, struct seq_file *m) -{ -} -static inline void proc_sched_set_task(struct task_struct *p) -{ -} -static inline void -print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now) -{ -} -#endif /* * Task state bitmask. NOTE! These bits are also @@ -215,7 +193,6 @@ struct task_struct; extern void sched_init(void); extern void sched_init_smp(void); extern void init_idle(struct task_struct *idle, int cpu); -extern void init_idle_bootup_task(struct task_struct *idle); extern cpumask_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) @@ -502,7 +479,7 @@ struct signal_struct { * from jiffies_to_ns(utime + stime) if sched_clock uses something * other than jiffies.) 
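Stepping back to the eeprom_93cx6 interface introduced earlier in this patch (trunk/include/linux/eeprom_93cx6.h): a consumer fills in the two bit-bang callbacks and then calls eeprom_93cx6_read(). The sketch below is hypothetical — struct my_dev, the MY_* masks, and the single-CSR register layout are invented for illustration; only the eeprom_93cx6 fields, the PCI_EEPROM_WIDTH_93C46 constant, and the eeprom_93cx6_read() prototype come from the new header.

#include <linux/types.h>
#include <linux/io.h>
#include <linux/eeprom_93cx6.h>

#define MY_EEPROM_DATA_IN       0x01    /* hypothetical CSR bits */
#define MY_EEPROM_DATA_OUT      0x02
#define MY_EEPROM_DATA_CLOCK    0x04
#define MY_EEPROM_CHIP_SELECT   0x08

struct my_dev {
        void __iomem *csr;
};

static void my_eeprom_register_read(struct eeprom_93cx6 *eeprom)
{
        struct my_dev *dev = eeprom->data;
        u32 reg = readl(dev->csr);

        eeprom->reg_data_in = !!(reg & MY_EEPROM_DATA_IN);
        eeprom->reg_data_out = !!(reg & MY_EEPROM_DATA_OUT);
        eeprom->reg_data_clock = !!(reg & MY_EEPROM_DATA_CLOCK);
        eeprom->reg_chip_select = !!(reg & MY_EEPROM_CHIP_SELECT);
}

static void my_eeprom_register_write(struct eeprom_93cx6 *eeprom)
{
        struct my_dev *dev = eeprom->data;
        u32 reg = 0;

        if (eeprom->reg_data_in)
                reg |= MY_EEPROM_DATA_IN;
        if (eeprom->reg_data_out)
                reg |= MY_EEPROM_DATA_OUT;
        if (eeprom->reg_data_clock)
                reg |= MY_EEPROM_DATA_CLOCK;
        if (eeprom->reg_chip_select)
                reg |= MY_EEPROM_CHIP_SELECT;

        writel(reg, dev->csr);
}

static void my_read_eeprom(struct my_dev *dev, u16 *buf, u8 words)
{
        struct eeprom_93cx6 eeprom = {
                .data           = dev,
                .register_read  = my_eeprom_register_read,
                .register_write = my_eeprom_register_write,
                .width          = PCI_EEPROM_WIDTH_93C46,
        };
        u8 i;

        for (i = 0; i < words; i++)
                eeprom_93cx6_read(&eeprom, i, &buf[i]);
}

The helper clocks the read opcode out and the data back in by toggling reg_data_clock/reg_chip_select through register_write(), so the driver only has to expose its own register access.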
*/ - unsigned long long sum_sched_runtime; + unsigned long long sched_time; /* * We don't bother to synchronize most readers of this at all, @@ -544,6 +521,31 @@ struct signal_struct { #define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */ #define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */ + +/* + * Priority of a process goes from 0..MAX_PRIO-1, valid RT + * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH + * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority + * values are inverted: lower p->prio value means higher priority. + * + * The MAX_USER_RT_PRIO value allows the actual maximum + * RT priority to be separate from the value exported to + * user-space. This allows kernel threads to set their + * priority to a value higher than any user task. Note: + * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. + */ + +#define MAX_USER_RT_PRIO 100 +#define MAX_RT_PRIO MAX_USER_RT_PRIO + +#define MAX_PRIO (MAX_RT_PRIO + 40) + +#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) +#define rt_task(p) rt_prio((p)->prio) +#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) +#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH) +#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) + /* * Some day this will be a full-fledged user tracking system.. */ @@ -581,13 +583,13 @@ struct reclaim_state; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info { /* cumulative counters */ - unsigned long pcnt; /* # of times run on this cpu */ - unsigned long long cpu_time, /* time spent on the cpu */ - run_delay; /* time spent waiting on a runqueue */ + unsigned long cpu_time, /* time spent on the cpu */ + run_delay, /* time spent waiting on a runqueue */ + pcnt; /* # of timeslices run on this cpu */ /* timestamps */ - unsigned long long last_arrival,/* when we last ran on a cpu */ - last_queued; /* when we were last queued to run */ + unsigned long last_arrival, /* when we last ran on a cpu */ + last_queued; /* when we were last queued to run */ }; #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ @@ -637,24 +639,18 @@ static inline int sched_info_on(void) #endif } -enum cpu_idle_type { - CPU_IDLE, - CPU_NOT_IDLE, - CPU_NEWLY_IDLE, - CPU_MAX_IDLE_TYPES +enum idle_type +{ + SCHED_IDLE, + NOT_IDLE, + NEWLY_IDLE, + MAX_IDLE_TYPES }; /* * sched-domains (multiprocessor balancing) declarations: */ - -/* - * Increase resolution of nice-level calculations: - */ -#define SCHED_LOAD_SHIFT 10 -#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) - -#define SCHED_LOAD_SCALE_FUZZ (SCHED_LOAD_SCALE >> 5) +#define SCHED_LOAD_SCALE 128UL /* increase resolution of load */ #ifdef CONFIG_SMP #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. 
*/ @@ -723,14 +719,14 @@ struct sched_domain { #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ - unsigned long lb_cnt[CPU_MAX_IDLE_TYPES]; - unsigned long lb_failed[CPU_MAX_IDLE_TYPES]; - unsigned long lb_balanced[CPU_MAX_IDLE_TYPES]; - unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES]; - unsigned long lb_gained[CPU_MAX_IDLE_TYPES]; - unsigned long lb_hot_gained[CPU_MAX_IDLE_TYPES]; - unsigned long lb_nobusyg[CPU_MAX_IDLE_TYPES]; - unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES]; + unsigned long lb_cnt[MAX_IDLE_TYPES]; + unsigned long lb_failed[MAX_IDLE_TYPES]; + unsigned long lb_balanced[MAX_IDLE_TYPES]; + unsigned long lb_imbalance[MAX_IDLE_TYPES]; + unsigned long lb_gained[MAX_IDLE_TYPES]; + unsigned long lb_hot_gained[MAX_IDLE_TYPES]; + unsigned long lb_nobusyg[MAX_IDLE_TYPES]; + unsigned long lb_nobusyq[MAX_IDLE_TYPES]; /* Active load balancing */ unsigned long alb_cnt; @@ -757,6 +753,12 @@ struct sched_domain { extern int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2); +/* + * Maximum cache size the migration-costs auto-tuning code will + * search from: + */ +extern unsigned int max_cache_size; + #endif /* CONFIG_SMP */ @@ -807,86 +809,14 @@ struct mempolicy; struct pipe_inode_info; struct uts_namespace; -struct rq; -struct sched_domain; - -struct sched_class { - struct sched_class *next; - - void (*enqueue_task) (struct rq *rq, struct task_struct *p, - int wakeup, u64 now); - void (*dequeue_task) (struct rq *rq, struct task_struct *p, - int sleep, u64 now); - void (*yield_task) (struct rq *rq, struct task_struct *p); - - void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); - - struct task_struct * (*pick_next_task) (struct rq *rq, u64 now); - void (*put_prev_task) (struct rq *rq, struct task_struct *p, u64 now); - - int (*load_balance) (struct rq *this_rq, int this_cpu, - struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, unsigned long *total_load_moved); - - void (*set_curr_task) (struct rq *rq); - void (*task_tick) (struct rq *rq, struct task_struct *p); - void (*task_new) (struct rq *rq, struct task_struct *p); +enum sleep_type { + SLEEP_NORMAL, + SLEEP_NONINTERACTIVE, + SLEEP_INTERACTIVE, + SLEEP_INTERRUPTED, }; -struct load_weight { - unsigned long weight, inv_weight; -}; - -/* - * CFS stats for a schedulable entity (task, task-group etc) - * - * Current field usage histogram: - * - * 4 se->block_start - * 4 se->run_node - * 4 se->sleep_start - * 4 se->sleep_start_fair - * 6 se->load.weight - * 7 se->delta_fair - * 15 se->wait_runtime - */ -struct sched_entity { - long wait_runtime; - unsigned long delta_fair_run; - unsigned long delta_fair_sleep; - unsigned long delta_exec; - s64 fair_key; - struct load_weight load; /* for load-balancing */ - struct rb_node run_node; - unsigned int on_rq; - - u64 wait_start_fair; - u64 wait_start; - u64 exec_start; - u64 sleep_start; - u64 sleep_start_fair; - u64 block_start; - u64 sleep_max; - u64 block_max; - u64 exec_max; - u64 wait_max; - u64 last_ran; - - u64 sum_exec_runtime; - s64 sum_wait_runtime; - s64 sum_sleep_runtime; - unsigned long wait_runtime_overruns; - unsigned long wait_runtime_underruns; -#ifdef CONFIG_FAIR_GROUP_SCHED - struct sched_entity *parent; - /* rq on which this entity is (to be) queued: */ - struct cfs_rq *cfs_rq; - /* rq "owned" by this entity/group: */ - struct cfs_rq *my_q; -#endif -}; +struct prio_array; struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 
stopped */ @@ -902,20 +832,23 @@ struct task_struct { int oncpu; #endif #endif - + int load_weight; /* for niceness load balancing purposes */ int prio, static_prio, normal_prio; struct list_head run_list; - struct sched_class *sched_class; - struct sched_entity se; + struct prio_array *array; unsigned short ioprio; #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif + unsigned long sleep_avg; + unsigned long long timestamp, last_ran; + unsigned long long sched_time; /* sched_clock time spent running */ + enum sleep_type sleep_type; unsigned int policy; cpumask_t cpus_allowed; - unsigned int time_slice; + unsigned int time_slice, first_time_slice; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; @@ -1145,37 +1078,6 @@ struct task_struct { #endif }; -/* - * Priority of a process goes from 0..MAX_PRIO-1, valid RT - * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH - * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority - * values are inverted: lower p->prio value means higher priority. - * - * The MAX_USER_RT_PRIO value allows the actual maximum - * RT priority to be separate from the value exported to - * user-space. This allows kernel threads to set their - * priority to a value higher than any user task. Note: - * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. - */ - -#define MAX_USER_RT_PRIO 100 -#define MAX_RT_PRIO MAX_USER_RT_PRIO - -#define MAX_PRIO (MAX_RT_PRIO + 40) -#define DEFAULT_PRIO (MAX_RT_PRIO + 20) - -static inline int rt_prio(int prio) -{ - if (unlikely(prio < MAX_RT_PRIO)) - return 1; - return 0; -} - -static inline int rt_task(struct task_struct *p) -{ - return rt_prio(p->prio); -} - static inline pid_t process_group(struct task_struct *tsk) { return tsk->signal->pgrp; @@ -1321,7 +1223,7 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) extern unsigned long long sched_clock(void); extern unsigned long long -task_sched_runtime(struct task_struct *task); +current_sched_time(const struct task_struct *current_task); /* sched_exec is called by processes performing an exec */ #ifdef CONFIG_SMP @@ -1330,8 +1232,6 @@ extern void sched_exec(void); #define sched_exec() {} #endif -extern void sched_clock_unstable_event(void); - #ifdef CONFIG_HOTPLUG_CPU extern void idle_task_exit(void); #else @@ -1340,14 +1240,6 @@ static inline void idle_task_exit(void) {} extern void sched_idle_next(void); -extern unsigned int sysctl_sched_granularity; -extern unsigned int sysctl_sched_wakeup_granularity; -extern unsigned int sysctl_sched_batch_wakeup_granularity; -extern unsigned int sysctl_sched_stat_granularity; -extern unsigned int sysctl_sched_runtime_limit; -extern unsigned int sysctl_sched_child_runs_first; -extern unsigned int sysctl_sched_features; - #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); @@ -1425,8 +1317,8 @@ extern void FASTCALL(wake_up_new_task(struct task_struct * tsk, #else static inline void kick_process(struct task_struct *tsk) { } #endif -extern void sched_fork(struct task_struct *p, int clone_flags); -extern void sched_dead(struct task_struct *p); +extern void FASTCALL(sched_fork(struct task_struct * p, int clone_flags)); +extern void FASTCALL(sched_exit(struct task_struct * p)); extern int in_group_p(gid_t); extern int in_egroup_p(gid_t); @@ -1514,7 +1406,7 @@ extern struct mm_struct * mm_alloc(void); extern void FASTCALL(__mmdrop(struct mm_struct *)); static inline 
void mmdrop(struct mm_struct * mm) { - if (unlikely(atomic_dec_and_test(&mm->mm_count))) + if (atomic_dec_and_test(&mm->mm_count)) __mmdrop(mm); } @@ -1746,7 +1638,10 @@ static inline unsigned int task_cpu(const struct task_struct *p) return task_thread_info(p)->cpu; } -extern void set_task_cpu(struct task_struct *p, unsigned int cpu); +static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + task_thread_info(p)->cpu = cpu; +} #else diff --git a/trunk/include/linux/topology.h b/trunk/include/linux/topology.h index da6c39b2d051..a9d1f049cc15 100644 --- a/trunk/include/linux/topology.h +++ b/trunk/include/linux/topology.h @@ -98,7 +98,7 @@ .cache_nice_tries = 0, \ .busy_idx = 0, \ .idle_idx = 0, \ - .newidle_idx = 0, \ + .newidle_idx = 1, \ .wake_idx = 0, \ .forkexec_idx = 0, \ .flags = SD_LOAD_BALANCE \ @@ -128,15 +128,14 @@ .imbalance_pct = 125, \ .cache_nice_tries = 1, \ .busy_idx = 2, \ - .idle_idx = 0, \ - .newidle_idx = 0, \ + .idle_idx = 1, \ + .newidle_idx = 2, \ .wake_idx = 1, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_WAKE_AFFINE \ - | SD_WAKE_IDLE \ | SD_SHARE_PKG_RESOURCES\ | BALANCE_FOR_MC_POWER, \ .last_balance = jiffies, \ @@ -159,15 +158,14 @@ .imbalance_pct = 125, \ .cache_nice_tries = 1, \ .busy_idx = 2, \ - .idle_idx = 0, \ - .newidle_idx = 0, \ + .idle_idx = 1, \ + .newidle_idx = 2, \ .wake_idx = 1, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_WAKE_AFFINE \ - | SD_WAKE_IDLE \ | BALANCE_FOR_PKG_POWER,\ .last_balance = jiffies, \ .balance_interval = 1, \ diff --git a/trunk/include/linux/wait.h b/trunk/include/linux/wait.h index 0e686280450b..e820d00e1383 100644 --- a/trunk/include/linux/wait.h +++ b/trunk/include/linux/wait.h @@ -366,15 +366,15 @@ static inline void remove_wait_queue_locked(wait_queue_head_t *q, /* * These are the old interfaces to sleep waiting for an event. - * They are racy. DO NOT use them, use the wait_event* interfaces above. - * We plan to remove these interfaces. + * They are racy. DO NOT use them, use the wait_event* interfaces above. + * We plan to remove these interfaces during 2.7. */ -extern void sleep_on(wait_queue_head_t *q); -extern long sleep_on_timeout(wait_queue_head_t *q, - signed long timeout); -extern void interruptible_sleep_on(wait_queue_head_t *q); -extern long interruptible_sleep_on_timeout(wait_queue_head_t *q, - signed long timeout); +extern void FASTCALL(sleep_on(wait_queue_head_t *q)); +extern long FASTCALL(sleep_on_timeout(wait_queue_head_t *q, + signed long timeout)); +extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q)); +extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q, + signed long timeout)); /* * Waitqueues which are removed from the waitqueue_head at wakeup time diff --git a/trunk/init/main.c b/trunk/init/main.c index 0eb1c7463fe4..eb8bdbae4fc7 100644 --- a/trunk/init/main.c +++ b/trunk/init/main.c @@ -436,16 +436,15 @@ static void noinline __init_refok rest_init(void) /* * The boot idle thread must execute schedule() - * at least once to get things moving: + * at least one to get things moving: */ - init_idle_bootup_task(current); preempt_enable_no_resched(); schedule(); preempt_disable(); /* Call into cpu_idle with preempt disabled */ cpu_idle(); -} +} /* Check for early params. 
*/ static int __init do_early_param(char *param, char *val) diff --git a/trunk/kernel/delayacct.c b/trunk/kernel/delayacct.c index 81e697829633..c0148ae992c4 100644 --- a/trunk/kernel/delayacct.c +++ b/trunk/kernel/delayacct.c @@ -99,10 +99,9 @@ void __delayacct_blkio_end(void) int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) { s64 tmp; - unsigned long t1; - unsigned long long t2, t3; - unsigned long flags; struct timespec ts; + unsigned long t1,t2,t3; + unsigned long flags; /* Though tsk->delays accessed later, early exit avoids * unnecessary returning of other data @@ -125,10 +124,11 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->cpu_count += t1; - tmp = (s64)d->cpu_delay_total + t2; + jiffies_to_timespec(t2, &ts); + tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts); d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; - tmp = (s64)d->cpu_run_virtual_total + t3; + tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000; d->cpu_run_virtual_total = (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; diff --git a/trunk/kernel/exit.c b/trunk/kernel/exit.c index ca6a11b73023..5c8ecbaa19a5 100644 --- a/trunk/kernel/exit.c +++ b/trunk/kernel/exit.c @@ -122,9 +122,9 @@ static void __exit_signal(struct task_struct *tsk) sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; sig->nivcsw += tsk->nivcsw; + sig->sched_time += tsk->sched_time; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } @@ -182,6 +182,7 @@ void release_task(struct task_struct * p) zap_leader = (leader->exit_signal == -1); } + sched_exit(p); write_unlock_irq(&tasklist_lock); proc_flush_task(p); release_thread(p); @@ -290,7 +291,7 @@ static void reparent_to_kthreadd(void) /* Set the exit signal to SIGCHLD so we signal init on exit */ current->exit_signal = SIGCHLD; - if (task_nice(current) < 0) + if (!has_rt_policy(current) && (task_nice(current) < 0)) set_user_nice(current, 0); /* cpus_allowed? */ /* rt_priority? */ diff --git a/trunk/kernel/fork.c b/trunk/kernel/fork.c index da3a155bba0d..73ad5cda1bcd 100644 --- a/trunk/kernel/fork.c +++ b/trunk/kernel/fork.c @@ -877,7 +877,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; - sig->sum_sched_runtime = 0; + sig->sched_time = 0; INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); INIT_LIST_HEAD(&sig->cpu_timers[2]); @@ -1040,7 +1040,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->utime = cputime_zero; p->stime = cputime_zero; - + p->sched_time = 0; #ifdef CONFIG_TASK_XACCT p->rchar = 0; /* I/O counter: bytes read */ p->wchar = 0; /* I/O counter: bytes written */ diff --git a/trunk/kernel/posix-cpu-timers.c b/trunk/kernel/posix-cpu-timers.c index b53c8fcd9d82..1de710e18373 100644 --- a/trunk/kernel/posix-cpu-timers.c +++ b/trunk/kernel/posix-cpu-timers.c @@ -161,7 +161,7 @@ static inline cputime_t virt_ticks(struct task_struct *p) } static inline unsigned long long sched_ns(struct task_struct *p) { - return task_sched_runtime(p); + return (p == current) ? 
current_sched_time(p) : p->sched_time; } int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) @@ -246,10 +246,10 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx, } while (t != p); break; case CPUCLOCK_SCHED: - cpu->sched = p->signal->sum_sched_runtime; + cpu->sched = p->signal->sched_time; /* Add in each other live thread. */ while ((t = next_thread(t)) != p) { - cpu->sched += t->se.sum_exec_runtime; + cpu->sched += t->sched_time; } cpu->sched += sched_ns(p); break; @@ -422,7 +422,7 @@ int posix_cpu_timer_del(struct k_itimer *timer) */ static void cleanup_timers(struct list_head *head, cputime_t utime, cputime_t stime, - unsigned long long sum_exec_runtime) + unsigned long long sched_time) { struct cpu_timer_list *timer, *next; cputime_t ptime = cputime_add(utime, stime); @@ -451,10 +451,10 @@ static void cleanup_timers(struct list_head *head, ++head; list_for_each_entry_safe(timer, next, head, entry) { list_del_init(&timer->entry); - if (timer->expires.sched < sum_exec_runtime) { + if (timer->expires.sched < sched_time) { timer->expires.sched = 0; } else { - timer->expires.sched -= sum_exec_runtime; + timer->expires.sched -= sched_time; } } } @@ -467,7 +467,7 @@ static void cleanup_timers(struct list_head *head, void posix_cpu_timers_exit(struct task_struct *tsk) { cleanup_timers(tsk->cpu_timers, - tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); + tsk->utime, tsk->stime, tsk->sched_time); } void posix_cpu_timers_exit_group(struct task_struct *tsk) @@ -475,7 +475,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) cleanup_timers(tsk->signal->cpu_timers, cputime_add(tsk->utime, tsk->signal->utime), cputime_add(tsk->stime, tsk->signal->stime), - tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime); + tsk->sched_time + tsk->signal->sched_time); } @@ -536,7 +536,7 @@ static void process_timer_rebalance(struct task_struct *p, nsleft = max_t(unsigned long long, nsleft, 1); do { if (likely(!(t->flags & PF_EXITING))) { - ns = t->se.sum_exec_runtime + nsleft; + ns = t->sched_time + nsleft; if (t->it_sched_expires == 0 || t->it_sched_expires > ns) { t->it_sched_expires = ns; @@ -1004,7 +1004,7 @@ static void check_thread_timers(struct task_struct *tsk, struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { + if (!--maxfire || tsk->sched_time < t->expires.sched) { tsk->it_sched_expires = t->expires.sched; break; } @@ -1024,7 +1024,7 @@ static void check_process_timers(struct task_struct *tsk, int maxfire; struct signal_struct *const sig = tsk->signal; cputime_t utime, stime, ptime, virt_expires, prof_expires; - unsigned long long sum_sched_runtime, sched_expires; + unsigned long long sched_time, sched_expires; struct task_struct *t; struct list_head *timers = sig->cpu_timers; @@ -1044,12 +1044,12 @@ static void check_process_timers(struct task_struct *tsk, */ utime = sig->utime; stime = sig->stime; - sum_sched_runtime = sig->sum_sched_runtime; + sched_time = sig->sched_time; t = tsk; do { utime = cputime_add(utime, t->utime); stime = cputime_add(stime, t->stime); - sum_sched_runtime += t->se.sum_exec_runtime; + sched_time += t->sched_time; t = next_thread(t); } while (t != tsk); ptime = cputime_add(utime, stime); @@ -1090,7 +1090,7 @@ static void check_process_timers(struct task_struct *tsk, struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || sum_sched_runtime < t->expires.sched) { + if 
(!--maxfire || sched_time < t->expires.sched) { sched_expires = t->expires.sched; break; } @@ -1182,7 +1182,7 @@ static void check_process_timers(struct task_struct *tsk, virt_left = cputime_sub(virt_expires, utime); virt_left = cputime_div_non_zero(virt_left, nthreads); if (sched_expires) { - sched_left = sched_expires - sum_sched_runtime; + sched_left = sched_expires - sched_time; do_div(sched_left, nthreads); sched_left = max_t(unsigned long long, sched_left, 1); } else { @@ -1208,7 +1208,7 @@ static void check_process_timers(struct task_struct *tsk, t->it_virt_expires = ticks; } - sched = t->se.sum_exec_runtime + sched_left; + sched = t->sched_time + sched_left; if (sched_expires && (t->it_sched_expires == 0 || t->it_sched_expires > sched)) { t->it_sched_expires = sched; @@ -1300,7 +1300,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) if (UNEXPIRED(prof) && UNEXPIRED(virt) && (tsk->it_sched_expires == 0 || - tsk->se.sum_exec_runtime < tsk->it_sched_expires)) + tsk->sched_time < tsk->it_sched_expires)) return; #undef UNEXPIRED diff --git a/trunk/kernel/sched.c b/trunk/kernel/sched.c index 9fbced64bfee..50e1a3122699 100644 --- a/trunk/kernel/sched.c +++ b/trunk/kernel/sched.c @@ -16,19 +16,13 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin - * 2007-04-15 Work begun on replacing all interactivity tuning with a - * fair scheduling design by Con Kolivas. - * 2007-05-05 Load balancing (smp-nice) and other improvements - * by Peter Williams - * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith - * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri */ #include #include #include #include -#include +#include #include #include #include @@ -59,9 +53,9 @@ #include #include #include -#include #include +#include /* * Scheduler clock - returns current time in nanosec units. @@ -97,9 +91,6 @@ unsigned long long __attribute__((weak)) sched_clock(void) #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) -#define NICE_0_LOAD SCHED_LOAD_SCALE -#define NICE_0_SHIFT SCHED_LOAD_SHIFT - /* * These are the 'tuning knobs' of the scheduler: * @@ -109,6 +100,87 @@ unsigned long long __attribute__((weak)) sched_clock(void) */ #define MIN_TIMESLICE max(5 * HZ / 1000, 1) #define DEF_TIMESLICE (100 * HZ / 1000) +#define ON_RUNQUEUE_WEIGHT 30 +#define CHILD_PENALTY 95 +#define PARENT_PENALTY 100 +#define EXIT_WEIGHT 3 +#define PRIO_BONUS_RATIO 25 +#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) +#define INTERACTIVE_DELTA 2 +#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) +#define STARVATION_LIMIT (MAX_SLEEP_AVG) +#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) + +/* + * If a task is 'interactive' then we reinsert it in the active + * array after it has expired its current timeslice. (it will not + * continue to run immediately, it will still roundrobin with + * other interactive tasks.) + * + * This part scales the interactivity limit depending on niceness. + * + * We scale it linearly, offset by the INTERACTIVE_DELTA delta. + * Here are a few examples of different nice levels: + * + * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] + * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] + * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] + * + * (the X axis represents the possible -5 ... 0 ... 
+5 dynamic + * priority range a task can explore, a value of '1' means the + * task is rated interactive.) + * + * Ie. nice +19 tasks can never get 'interactive' enough to be + * reinserted into the active array. And only heavily CPU-hog nice -20 + * tasks will be expired. Default nice 0 tasks are somewhere between, + * it takes some effort for them to get interactive, but it's not + * too hard. + */ + +#define CURRENT_BONUS(p) \ + (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ + MAX_SLEEP_AVG) + +#define GRANULARITY (10 * HZ / 1000 ? : 1) + +#ifdef CONFIG_SMP +#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ + num_online_cpus()) +#else +#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) +#endif + +#define SCALE(v1,v1_max,v2_max) \ + (v1) * (v2_max) / (v1_max) + +#define DELTA(p) \ + (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ + INTERACTIVE_DELTA) + +#define TASK_INTERACTIVE(p) \ + ((p)->prio <= (p)->static_prio - DELTA(p)) + +#define INTERACTIVE_SLEEP(p) \ + (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ + (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) + +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio) + +#define SCALE_PRIO(x, prio) \ + max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) + +static unsigned int static_prio_timeslice(int static_prio) +{ + if (static_prio < NICE_TO_PRIO(0)) + return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); + else + return SCALE_PRIO(DEF_TIMESLICE, static_prio); +} #ifdef CONFIG_SMP /* @@ -131,87 +203,28 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) } #endif -#define SCALE_PRIO(x, prio) \ - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) - /* - * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] + * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] * to time slice values: [800ms ... 100ms ... 5ms] + * + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time. 
*/ -static unsigned int static_prio_timeslice(int static_prio) -{ - if (static_prio == NICE_TO_PRIO(19)) - return 1; - - if (static_prio < NICE_TO_PRIO(0)) - return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); - else - return SCALE_PRIO(DEF_TIMESLICE, static_prio); -} - -static inline int rt_policy(int policy) -{ - if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) - return 1; - return 0; -} -static inline int task_has_rt_policy(struct task_struct *p) +static inline unsigned int task_timeslice(struct task_struct *p) { - return rt_policy(p->policy); + return static_prio_timeslice(p->static_prio); } /* - * This is the priority-queue data structure of the RT scheduling class: + * These are the runqueue data structures: */ -struct rt_prio_array { - DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ - struct list_head queue[MAX_RT_PRIO]; -}; - -struct load_stat { - struct load_weight load; - u64 load_update_start, load_update_last; - unsigned long delta_fair, delta_exec, delta_stat; -}; - -/* CFS-related fields in a runqueue */ -struct cfs_rq { - struct load_weight load; - unsigned long nr_running; - - s64 fair_clock; - u64 exec_clock; - s64 wait_runtime; - u64 sleeper_bonus; - unsigned long wait_runtime_overruns, wait_runtime_underruns; - - struct rb_root tasks_timeline; - struct rb_node *rb_leftmost; - struct rb_node *rb_load_balance_curr; -#ifdef CONFIG_FAIR_GROUP_SCHED - /* 'curr' points to currently running entity on this cfs_rq. - * It is set to NULL otherwise (i.e when none are currently running). - */ - struct sched_entity *curr; - struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ - /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in - * a hierarchy). Non-leaf lrqs hold other higher schedulable entities - * (like users, containers etc.) - * - * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This - * list is used during load balance. - */ - struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ -#endif -}; - -/* Real-Time classes' related field in a runqueue: */ -struct rt_rq { - struct rt_prio_array active; - int rt_load_balance_idx; - struct list_head *rt_load_balance_head, *rt_load_balance_curr; +struct prio_array { + unsigned int nr_active; + DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ + struct list_head queue[MAX_PRIO]; }; /* @@ -222,28 +235,22 @@ struct rt_rq { * acquire operations must be ordered by ascending &runqueue. */ struct rq { - spinlock_t lock; /* runqueue lock */ + spinlock_t lock; /* * nr_running and cpu_load should be in the same cacheline because * remote CPUs use both these fields when doing load calculation. 
*/ unsigned long nr_running; - #define CPU_LOAD_IDX_MAX 5 - unsigned long cpu_load[CPU_LOAD_IDX_MAX]; + unsigned long raw_weighted_load; +#ifdef CONFIG_SMP + unsigned long cpu_load[3]; unsigned char idle_at_tick; #ifdef CONFIG_NO_HZ unsigned char in_nohz_recently; #endif - struct load_stat ls; /* capture load from *all* tasks on this cpu */ - unsigned long nr_load_updates; - u64 nr_switches; - - struct cfs_rq cfs; -#ifdef CONFIG_FAIR_GROUP_SCHED - struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */ #endif - struct rt_rq rt; + unsigned long long nr_switches; /* * This is part of a global counter where only the total sum @@ -253,18 +260,14 @@ struct rq { */ unsigned long nr_uninterruptible; + unsigned long expired_timestamp; + /* Cached timestamp set by update_cpu_clock() */ + unsigned long long most_recent_timestamp; struct task_struct *curr, *idle; unsigned long next_balance; struct mm_struct *prev_mm; - - u64 clock, prev_clock_raw; - s64 clock_max_delta; - - unsigned int clock_warps, clock_overflows; - unsigned int clock_unstable_events; - - struct sched_class *load_balance_class; - + struct prio_array *active, *expired, arrays[2]; + int best_expired_prio; atomic_t nr_iowait; #ifdef CONFIG_SMP @@ -304,11 +307,6 @@ struct rq { static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; static DEFINE_MUTEX(sched_hotcpu_mutex); -static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) -{ - rq->curr->sched_class->check_preempt_curr(rq, p); -} - static inline int cpu_of(struct rq *rq) { #ifdef CONFIG_SMP @@ -318,52 +316,6 @@ static inline int cpu_of(struct rq *rq) #endif } -/* - * Per-runqueue clock, as finegrained as the platform can give us: - */ -static unsigned long long __rq_clock(struct rq *rq) -{ - u64 prev_raw = rq->prev_clock_raw; - u64 now = sched_clock(); - s64 delta = now - prev_raw; - u64 clock = rq->clock; - - /* - * Protect against sched_clock() occasionally going backwards: - */ - if (unlikely(delta < 0)) { - clock++; - rq->clock_warps++; - } else { - /* - * Catch too large forward jumps too: - */ - if (unlikely(delta > 2*TICK_NSEC)) { - clock++; - rq->clock_overflows++; - } else { - if (unlikely(delta > rq->clock_max_delta)) - rq->clock_max_delta = delta; - clock += delta; - } - } - - rq->prev_clock_raw = now; - rq->clock = clock; - - return clock; -} - -static inline unsigned long long rq_clock(struct rq *rq) -{ - int this_cpu = smp_processor_id(); - - if (this_cpu == cpu_of(rq)) - return __rq_clock(rq); - - return rq->clock; -} - /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. * See detach_destroy_domains: synchronize_sched for details. 
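The prio_array restored above is what keeps next-task selection O(1): each priority level has its own list, and a 140-bit bitmap records which levels are non-empty, so picking the highest-priority runnable task is a sched_find_first_bit() over that bitmap (the bitops/sched.h hunk earlier in this patch). A minimal userspace model, with plain per-priority counters standing in for the task lists and a GCC/Clang __builtin_ctzll() standing in for __ffs():

#include <stdint.h>
#include <stdio.h>

#define MAX_PRIO        140

struct toy_prio_array {
        unsigned int nr_active;
        uint64_t bitmap[3];             /* 192 bits >= MAX_PRIO   */
        unsigned int queued[MAX_PRIO];  /* tasks at each priority */
};

static void toy_enqueue(struct toy_prio_array *a, int prio)
{
        a->queued[prio]++;
        a->bitmap[prio / 64] |= 1ULL << (prio % 64);
        a->nr_active++;
}

static void toy_dequeue(struct toy_prio_array *a, int prio)
{
        if (--a->queued[prio] == 0)     /* clear the bit only when the level empties */
                a->bitmap[prio / 64] &= ~(1ULL << (prio % 64));
        a->nr_active--;
}

/* rough equivalent of sched_find_first_bit() over the 140-bit bitmap */
static int toy_first_prio(const struct toy_prio_array *a)
{
        int i;

        for (i = 0; i < 3; i++)
                if (a->bitmap[i])
                        return __builtin_ctzll(a->bitmap[i]) + i * 64;
        return MAX_PRIO;                /* nothing runnable */
}

int main(void)
{
        struct toy_prio_array a = { 0 };

        toy_enqueue(&a, 120);                           /* a nice-0 task       */
        toy_enqueue(&a, 50);                            /* an RT task, prio 50 */
        printf("next prio: %d\n", toy_first_prio(&a));  /* 50  */
        toy_dequeue(&a, 50);
        printf("next prio: %d\n", toy_first_prio(&a));  /* 120 */
        return 0;
}

The active/expired pair in struct rq is then just two such arrays, swapped when the active one drains.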
@@ -379,18 +331,6 @@ static inline unsigned long long rq_clock(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -#ifdef CONFIG_FAIR_GROUP_SCHED -/* Change a task's ->cfs_rq if it moves across CPUs */ -static inline void set_task_cfs_rq(struct task_struct *p) -{ - p->se.cfs_rq = &task_rq(p)->cfs; -} -#else -static inline void set_task_cfs_rq(struct task_struct *p) -{ -} -#endif - #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif @@ -520,6 +460,134 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) spin_unlock_irqrestore(&rq->lock, *flags); } +#ifdef CONFIG_SCHEDSTATS +/* + * bump this up when changing the output format or the meaning of an existing + * format, so that tools can adapt (or abort) + */ +#define SCHEDSTAT_VERSION 14 + +static int show_schedstat(struct seq_file *seq, void *v) +{ + int cpu; + + seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); + seq_printf(seq, "timestamp %lu\n", jiffies); + for_each_online_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); +#ifdef CONFIG_SMP + struct sched_domain *sd; + int dcnt = 0; +#endif + + /* runqueue-specific stats */ + seq_printf(seq, + "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", + cpu, rq->yld_both_empty, + rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, + rq->sched_switch, rq->sched_cnt, rq->sched_goidle, + rq->ttwu_cnt, rq->ttwu_local, + rq->rq_sched_info.cpu_time, + rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); + + seq_printf(seq, "\n"); + +#ifdef CONFIG_SMP + /* domain-specific stats */ + preempt_disable(); + for_each_domain(cpu, sd) { + enum idle_type itype; + char mask_str[NR_CPUS]; + + cpumask_scnprintf(mask_str, NR_CPUS, sd->span); + seq_printf(seq, "domain%d %s", dcnt++, mask_str); + for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; + itype++) { + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " + "%lu", + sd->lb_cnt[itype], + sd->lb_balanced[itype], + sd->lb_failed[itype], + sd->lb_imbalance[itype], + sd->lb_gained[itype], + sd->lb_hot_gained[itype], + sd->lb_nobusyq[itype], + sd->lb_nobusyg[itype]); + } + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" + " %lu %lu %lu\n", + sd->alb_cnt, sd->alb_failed, sd->alb_pushed, + sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, + sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, + sd->ttwu_wake_remote, sd->ttwu_move_affine, + sd->ttwu_move_balance); + } + preempt_enable(); +#endif + } + return 0; +} + +static int schedstat_open(struct inode *inode, struct file *file) +{ + unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); + char *buf = kmalloc(size, GFP_KERNEL); + struct seq_file *m; + int res; + + if (!buf) + return -ENOMEM; + res = single_open(file, show_schedstat, NULL); + if (!res) { + m = file->private_data; + m->buf = buf; + m->size = size; + } else + kfree(buf); + return res; +} + +const struct file_operations proc_schedstat_operations = { + .open = schedstat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) +{ + if (rq) { + rq->rq_sched_info.run_delay += delta_jiffies; + rq->rq_sched_info.pcnt++; + } +} + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) +{ + if (rq) + rq->rq_sched_info.cpu_time += delta_jiffies; +} +# define schedstat_inc(rq, field) do { 
(rq)->field++; } while (0) +# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) +#else /* !CONFIG_SCHEDSTATS */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) +{} +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) +{} +# define schedstat_inc(rq, field) do { } while (0) +# define schedstat_add(rq, field, amt) do { } while (0) +#endif + /* * this_rq_lock - lock this runqueue and disable interrupts. */ @@ -535,172 +603,177 @@ static inline struct rq *this_rq_lock(void) return rq; } +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) /* - * CPU frequency is/was unstable - start new by setting prev_clock_raw: + * Called when a process is dequeued from the active array and given + * the cpu. We should note that with the exception of interactive + * tasks, the expired queue will become the active queue after the active + * queue is empty, without explicitly dequeuing and requeuing tasks in the + * expired queue. (Interactive tasks may be requeued directly to the + * active queue, thus delaying tasks in the expired queue from running; + * see scheduler_tick()). + * + * This function is only called from sched_info_arrive(), rather than + * dequeue_task(). Even though a task may be queued and dequeued multiple + * times as it is shuffled about, we're really interested in knowing how + * long it was from the *first* time it was queued to the time that it + * finally hit a cpu. */ -void sched_clock_unstable_event(void) +static inline void sched_info_dequeued(struct task_struct *t) { - unsigned long flags; - struct rq *rq; - - rq = task_rq_lock(current, &flags); - rq->prev_clock_raw = sched_clock(); - rq->clock_unstable_events++; - task_rq_unlock(rq, &flags); + t->sched_info.last_queued = 0; } /* - * resched_task - mark a task 'to be rescheduled now'. - * - * On UP this means the setting of the need_resched flag, on SMP it - * might also involve a cross-CPU call to trigger the scheduler on - * the target CPU. + * Called when a task finally hits the cpu. We can now calculate how + * long it was waiting to run. We also note when it began so that we + * can keep stats on how long its timeslice is. */ -#ifdef CONFIG_SMP - -#ifndef tsk_is_polling -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) -#endif - -static void resched_task(struct task_struct *p) +static void sched_info_arrive(struct task_struct *t) { - int cpu; - - assert_spin_locked(&task_rq(p)->lock); - - if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) - return; - - set_tsk_thread_flag(p, TIF_NEED_RESCHED); + unsigned long now = jiffies, delta_jiffies = 0; - cpu = task_cpu(p); - if (cpu == smp_processor_id()) - return; + if (t->sched_info.last_queued) + delta_jiffies = now - t->sched_info.last_queued; + sched_info_dequeued(t); + t->sched_info.run_delay += delta_jiffies; + t->sched_info.last_arrival = now; + t->sched_info.pcnt++; - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(p)) - smp_send_reschedule(cpu); + rq_sched_info_arrive(task_rq(t), delta_jiffies); } -static void resched_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - if (!spin_trylock_irqsave(&rq->lock, flags)) - return; - resched_task(cpu_curr(cpu)); - spin_unlock_irqrestore(&rq->lock, flags); -} -#else -static inline void resched_task(struct task_struct *p) +/* + * Called when a process is queued into either the active or expired + * array. 
The time is noted and later used to determine how long we + * had to wait for us to reach the cpu. Since the expired queue will + * become the active queue after active queue is empty, without dequeuing + * and requeuing any tasks, we are interested in queuing to either. It + * is unusual but not impossible for tasks to be dequeued and immediately + * requeued in the same or another array: this can happen in sched_yield(), + * set_user_nice(), and even load_balance() as it moves tasks from runqueue + * to runqueue. + * + * This function is only called from enqueue_task(), but also only updates + * the timestamp if it is already not set. It's assumed that + * sched_info_dequeued() will clear that stamp when appropriate. + */ +static inline void sched_info_queued(struct task_struct *t) { - assert_spin_locked(&task_rq(p)->lock); - set_tsk_need_resched(p); + if (unlikely(sched_info_on())) + if (!t->sched_info.last_queued) + t->sched_info.last_queued = jiffies; } -#endif -static u64 div64_likely32(u64 divident, unsigned long divisor) +/* + * Called when a process ceases being the active-running process, either + * voluntarily or involuntarily. Now we can calculate how long we ran. + */ +static inline void sched_info_depart(struct task_struct *t) { -#if BITS_PER_LONG == 32 - if (likely(divident <= 0xffffffffULL)) - return (u32)divident / divisor; - do_div(divident, divisor); + unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; - return divident; -#else - return divident / divisor; -#endif + t->sched_info.cpu_time += delta_jiffies; + rq_sched_info_depart(task_rq(t), delta_jiffies); } -#if BITS_PER_LONG == 32 -# define WMULT_CONST (~0UL) -#else -# define WMULT_CONST (1UL << 32) -#endif - -#define WMULT_SHIFT 32 - -static inline unsigned long -calc_delta_mine(unsigned long delta_exec, unsigned long weight, - struct load_weight *lw) +/* + * Called when tasks are switched involuntarily due, typically, to expiring + * their time slice. (This may also be called when switching to or from + * the idle task.) We are only called when prev != next. + */ +static inline void +__sched_info_switch(struct task_struct *prev, struct task_struct *next) { - u64 tmp; - - if (unlikely(!lw->inv_weight)) - lw->inv_weight = WMULT_CONST / lw->weight; + struct rq *rq = task_rq(prev); - tmp = (u64)delta_exec * weight; /* - * Check whether we'd overflow the 64-bit multiplication: + * prev now departs the cpu. It's not interesting to record + * stats about how efficient we were at scheduling the idle + * process, however. 
*/ - if (unlikely(tmp > WMULT_CONST)) { - tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight) - >> (WMULT_SHIFT/2); - } else { - tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT; - } + if (prev != rq->idle) + sched_info_depart(prev); - return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit); + if (next != rq->idle) + sched_info_arrive(next); +} +static inline void +sched_info_switch(struct task_struct *prev, struct task_struct *next) +{ + if (unlikely(sched_info_on())) + __sched_info_switch(prev, next); } +#else +#define sched_info_queued(t) do { } while (0) +#define sched_info_switch(t, next) do { } while (0) +#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ -static inline unsigned long -calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) +/* + * Adding/removing a task to/from a priority array: + */ +static void dequeue_task(struct task_struct *p, struct prio_array *array) { - return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); + array->nr_active--; + list_del(&p->run_list); + if (list_empty(array->queue + p->prio)) + __clear_bit(p->prio, array->bitmap); } -static void update_load_add(struct load_weight *lw, unsigned long inc) +static void enqueue_task(struct task_struct *p, struct prio_array *array) { - lw->weight += inc; - lw->inv_weight = 0; + sched_info_queued(p); + list_add_tail(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->array = array; } -static void update_load_sub(struct load_weight *lw, unsigned long dec) +/* + * Put task to the end of the run list without the overhead of dequeue + * followed by enqueue. + */ +static void requeue_task(struct task_struct *p, struct prio_array *array) { - lw->weight -= dec; - lw->inv_weight = 0; + list_move_tail(&p->run_list, array->queue + p->prio); } -static void __update_curr_load(struct rq *rq, struct load_stat *ls) +static inline void +enqueue_task_head(struct task_struct *p, struct prio_array *array) { - if (rq->curr != rq->idle && ls->load.weight) { - ls->delta_exec += ls->delta_stat; - ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); - ls->delta_stat = 0; - } + list_add(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->array = array; } /* - * Update delta_exec, delta_fair fields for rq. + * __normal_prio - return the priority that is based on the static + * priority but is modified by bonuses/penalties. + * + * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] + * into the -5 ... 0 ... +5 bonus/penalty range. * - * delta_fair clock advances at a rate inversely proportional to - * total load (rq->ls.load.weight) on the runqueue, while - * delta_exec advances at the same rate as wall-clock (provided - * cpu is not idle). + * We use 25% of the full 0...39 priority range so that: * - * delta_exec / delta_fair is a measure of the (smoothened) load on this - * runqueue over any given interval. This (smoothened) load is used - * during load balance. + * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. + * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. * - * This function is called /before/ updating rq->ls.load - * and when switching tasks. + * Both properties are important to certain workloads. 
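+ *
+ * For example, taking the conventional nice-0 static_prio of 120, a fully
+ * interactive task gets the full +5 bonus and runs at prio 115, while a
+ * pure CPU hog gets the full -5 penalty and runs at 125; both stay well
+ * inside the [MAX_RT_PRIO, MAX_PRIO-1] clamp applied below.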
*/ -static void update_curr_load(struct rq *rq, u64 now) + +static inline int __normal_prio(struct task_struct *p) { - struct load_stat *ls = &rq->ls; - u64 start; + int bonus, prio; - start = ls->load_update_start; - ls->load_update_start = now; - ls->delta_stat += now - start; - /* - * Stagger updates to ls->delta_fair. Very frequent updates - * can be expensive. - */ - if (ls->delta_stat >= sysctl_sched_stat_granularity) - __update_curr_load(rq, ls); + bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; + + prio = p->static_prio - bonus; + if (prio < MAX_RT_PRIO) + prio = MAX_RT_PRIO; + if (prio > MAX_PRIO-1) + prio = MAX_PRIO-1; + return prio; } /* @@ -718,146 +791,53 @@ static void update_curr_load(struct rq *rq, u64 now) * this code will need modification */ #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE -#define load_weight(lp) \ +#define LOAD_WEIGHT(lp) \ (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) #define PRIO_TO_LOAD_WEIGHT(prio) \ - load_weight(static_prio_timeslice(prio)) + LOAD_WEIGHT(static_prio_timeslice(prio)) #define RTPRIO_TO_LOAD_WEIGHT(rp) \ - (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp)) - -#define WEIGHT_IDLEPRIO 2 -#define WMULT_IDLEPRIO (1 << 31) - -/* - * Nice levels are multiplicative, with a gentle 10% change for every - * nice level changed. I.e. when a CPU-bound task goes from nice 0 to - * nice 1, it will get ~10% less CPU time than another CPU-bound task - * that remained on nice 0. - * - * The "10% effect" is relative and cumulative: from _any_ nice level, - * if you go up 1 level, it's -10% CPU usage, if you go down 1 level - * it's +10% CPU usage. - */ -static const int prio_to_weight[40] = { -/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921, -/* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280, -/* 0 */ NICE_0_LOAD /* 1024 */, -/* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137, -/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15, -}; + (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) -static const u32 prio_to_wmult[40] = { - 48356, 60446, 75558, 94446, 118058, 147573, - 184467, 230589, 288233, 360285, 450347, - 562979, 703746, 879575, 1099582, 1374389, - 717986, 2147483, 2684354, 3355443, 4194304, - 244160, 6557201, 8196502, 10250518, 12782640, - 16025997, 19976592, 24970740, 31350126, 39045157, - 49367440, 61356675, 76695844, 95443717, 119304647, - 148102320, 186737708, 238609294, 286331153, -}; +static void set_load_weight(struct task_struct *p) +{ + if (has_rt_policy(p)) { +#ifdef CONFIG_SMP + if (p == task_rq(p)->migration_thread) + /* + * The migration thread does the actual balancing. + * Giving its load any weight will skew balancing + * adversely. 
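+ * (With a zero load_weight the migration thread never contributes to
+ * rq->raw_weighted_load, so the balancer only sees the tasks it moves.)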
+ */ + p->load_weight = 0; + else +#endif + p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); + } else + p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); +} static inline void -inc_load(struct rq *rq, const struct task_struct *p, u64 now) +inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) { - update_curr_load(rq, now); - update_load_add(&rq->ls.load, p->se.load.weight); + rq->raw_weighted_load += p->load_weight; } static inline void -dec_load(struct rq *rq, const struct task_struct *p, u64 now) +dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) { - update_curr_load(rq, now); - update_load_sub(&rq->ls.load, p->se.load.weight); + rq->raw_weighted_load -= p->load_weight; } -static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now) +static inline void inc_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running++; - inc_load(rq, p, now); + inc_raw_weighted_load(rq, p); } -static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now) +static inline void dec_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running--; - dec_load(rq, p, now); -} - -static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); - -/* - * runqueue iterator, to support SMP load-balancing between different - * scheduling classes, without having to expose their internal data - * structures to the load-balancing proper: - */ -struct rq_iterator { - void *arg; - struct task_struct *(*start)(void *); - struct task_struct *(*next)(void *); -}; - -static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, unsigned long *load_moved, - int this_best_prio, int best_prio, int best_prio_seen, - struct rq_iterator *iterator); - -#include "sched_stats.h" -#include "sched_rt.c" -#include "sched_fair.c" -#include "sched_idletask.c" -#ifdef CONFIG_SCHED_DEBUG -# include "sched_debug.c" -#endif - -#define sched_class_highest (&rt_sched_class) - -static void set_load_weight(struct task_struct *p) -{ - task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime; - p->se.wait_runtime = 0; - - if (task_has_rt_policy(p)) { - p->se.load.weight = prio_to_weight[0] * 2; - p->se.load.inv_weight = prio_to_wmult[0] >> 1; - return; - } - - /* - * SCHED_IDLE tasks get minimal weight: - */ - if (p->policy == SCHED_IDLE) { - p->se.load.weight = WEIGHT_IDLEPRIO; - p->se.load.inv_weight = WMULT_IDLEPRIO; - return; - } - - p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; - p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; -} - -static void -enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now) -{ - sched_info_queued(p); - p->sched_class->enqueue_task(rq, p, wakeup, now); - p->se.on_rq = 1; -} - -static void -dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now) -{ - p->sched_class->dequeue_task(rq, p, sleep, now); - p->se.on_rq = 0; -} - -/* - * __normal_prio - return the priority that is based on the static prio - */ -static inline int __normal_prio(struct task_struct *p) -{ - return p->static_prio; + dec_raw_weighted_load(rq, p); } /* @@ -871,7 +851,7 @@ static inline int normal_prio(struct task_struct *p) { int prio; - if (task_has_rt_policy(p)) + if (has_rt_policy(p)) prio = MAX_RT_PRIO-1 - p->rt_priority; else prio = __normal_prio(p); @@ -899,47 +879,222 @@ static int effective_prio(struct task_struct *p) } /* - * 
activate_task - move a task to the runqueue. + * __activate_task - move a task to the runqueue. */ -static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) +static void __activate_task(struct task_struct *p, struct rq *rq) { - u64 now = rq_clock(rq); + struct prio_array *target = rq->active; - if (p->state == TASK_UNINTERRUPTIBLE) - rq->nr_uninterruptible--; + if (batch_task(p)) + target = rq->expired; + enqueue_task(p, target); + inc_nr_running(p, rq); +} - enqueue_task(rq, p, wakeup, now); - inc_nr_running(p, rq, now); +/* + * __activate_idle_task - move idle task to the _front_ of runqueue. + */ +static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) +{ + enqueue_task_head(p, rq->active); + inc_nr_running(p, rq); } /* - * activate_idle_task - move idle task to the _front_ of runqueue. + * Recalculate p->normal_prio and p->prio after having slept, + * updating the sleep-average too: */ -static inline void activate_idle_task(struct task_struct *p, struct rq *rq) +static int recalc_task_prio(struct task_struct *p, unsigned long long now) { - u64 now = rq_clock(rq); + /* Caller must always ensure 'now >= p->timestamp' */ + unsigned long sleep_time = now - p->timestamp; - if (p->state == TASK_UNINTERRUPTIBLE) - rq->nr_uninterruptible--; + if (batch_task(p)) + sleep_time = 0; + + if (likely(sleep_time > 0)) { + /* + * This ceiling is set to the lowest priority that would allow + * a task to be reinserted into the active array on timeslice + * completion. + */ + unsigned long ceiling = INTERACTIVE_SLEEP(p); + + if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { + /* + * Prevents user tasks from achieving best priority + * with one single large enough sleep. + */ + p->sleep_avg = ceiling; + /* + * Using INTERACTIVE_SLEEP() as a ceiling places a + * nice(0) task 1ms sleep away from promotion, and + * gives it 700ms to round-robin with no chance of + * being demoted. This is more than generous, so + * mark this sleep as non-interactive to prevent the + * on-runqueue bonus logic from intervening should + * this task not receive cpu immediately. + */ + p->sleep_type = SLEEP_NONINTERACTIVE; + } else { + /* + * Tasks waking from uninterruptible sleep are + * limited in their sleep_avg rise as they + * are likely to be waiting on I/O + */ + if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { + if (p->sleep_avg >= ceiling) + sleep_time = 0; + else if (p->sleep_avg + sleep_time >= + ceiling) { + p->sleep_avg = ceiling; + sleep_time = 0; + } + } + + /* + * This code gives a bonus to interactive tasks. + * + * The boost works by updating the 'average sleep time' + * value here, based on ->timestamp. The more time a + * task spends sleeping, the higher the average gets - + * and the higher the priority boost gets as well. + */ + p->sleep_avg += sleep_time; + + } + if (p->sleep_avg > NS_MAX_SLEEP_AVG) + p->sleep_avg = NS_MAX_SLEEP_AVG; + } - enqueue_task(rq, p, 0, now); - inc_nr_running(p, rq, now); + return effective_prio(p); +} + +/* + * activate_task - move a task to the runqueue and do priority recalculation + * + * Update all the scheduling statistics stuff. (sleep average + * calculation, priority modifiers, etc.) 
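+ *
+ * In short: RT tasks skip straight to __activate_task(); everything else
+ * first has 'now' compensated for cross-CPU sched_clock drift (for remote
+ * wakeups), gets a fresh priority from recalc_task_prio() and, if it was a
+ * normal sleeper, a sleep_type reclassification, and only then is enqueued.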
+ */ +static void activate_task(struct task_struct *p, struct rq *rq, int local) +{ + unsigned long long now; + + if (rt_task(p)) + goto out; + + now = sched_clock(); +#ifdef CONFIG_SMP + if (!local) { + /* Compensate for drifting sched_clock */ + struct rq *this_rq = this_rq(); + now = (now - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; + } +#endif + + /* + * Sleep time is in units of nanosecs, so shift by 20 to get a + * milliseconds-range estimation of the amount of time that the task + * spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + if (p->state == TASK_UNINTERRUPTIBLE) + profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), + (now - p->timestamp) >> 20); + } + + p->prio = recalc_task_prio(p, now); + + /* + * This checks to make sure it's not an uninterruptible task + * that is now waking up. + */ + if (p->sleep_type == SLEEP_NORMAL) { + /* + * Tasks which were woken up by interrupts (ie. hw events) + * are most likely of interactive nature. So we give them + * the credit of extending their sleep time to the period + * of time they spend on the runqueue, waiting for execution + * on a CPU, first time around: + */ + if (in_interrupt()) + p->sleep_type = SLEEP_INTERRUPTED; + else { + /* + * Normal first-time wakeups get a credit too for + * on-runqueue time, but it will be weighted down: + */ + p->sleep_type = SLEEP_INTERACTIVE; + } + } + p->timestamp = now; +out: + __activate_task(p, rq); } /* * deactivate_task - remove a task from the runqueue. */ -static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) +static void deactivate_task(struct task_struct *p, struct rq *rq) +{ + dec_nr_running(p, rq); + dequeue_task(p, p->array); + p->array = NULL; +} + +/* + * resched_task - mark a task 'to be rescheduled now'. + * + * On UP this means the setting of the need_resched flag, on SMP it + * might also involve a cross-CPU call to trigger the scheduler on + * the target CPU. + */ +#ifdef CONFIG_SMP + +#ifndef tsk_is_polling +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) +#endif + +static void resched_task(struct task_struct *p) { - u64 now = rq_clock(rq); + int cpu; - if (p->state == TASK_UNINTERRUPTIBLE) - rq->nr_uninterruptible++; + assert_spin_locked(&task_rq(p)->lock); - dequeue_task(rq, p, sleep, now); - dec_nr_running(p, rq, now); + if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) + return; + + set_tsk_thread_flag(p, TIF_NEED_RESCHED); + + cpu = task_cpu(p); + if (cpu == smp_processor_id()) + return; + + /* NEED_RESCHED must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(p)) + smp_send_reschedule(cpu); } +static void resched_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + if (!spin_trylock_irqsave(&rq->lock, flags)) + return; + resched_task(cpu_curr(cpu)); + spin_unlock_irqrestore(&rq->lock, flags); +} +#else +static inline void resched_task(struct task_struct *p) +{ + assert_spin_locked(&task_rq(p)->lock); + set_tsk_need_resched(p); +} +#endif + /** * task_curr - is this task currently executing on a CPU? * @p: the task in question. 
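/*
 * A minimal user-space sketch of the nice-level -> load_weight mapping that
 * the LOAD_WEIGHT()/PRIO_TO_LOAD_WEIGHT() macros and raw_weighted_load in the
 * surrounding hunks rely on.  static_prio_timeslice() and the constants here
 * are assumptions modelled on the usual 2.6 O(1) scheduler defaults (HZ=1000),
 * not copied from this patch.
 */
#include <stdio.h>

#define MAX_RT_PRIO		100
#define MAX_PRIO		140
#define MAX_USER_PRIO		40
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define MIN_TIMESLICE		5	/* jiffies, assuming HZ=1000 */
#define DEF_TIMESLICE		100	/* jiffies, assuming HZ=1000 */
#define SCHED_LOAD_SCALE	128UL

/* Rough reimplementation of the kernel's static_prio_timeslice(). */
static unsigned int static_prio_timeslice(int static_prio)
{
	unsigned int base = (static_prio < NICE_TO_PRIO(0)) ?
				DEF_TIMESLICE * 4 : DEF_TIMESLICE;
	unsigned int slice = base * (MAX_PRIO - static_prio) /
				(MAX_USER_PRIO / 2);

	return slice > MIN_TIMESLICE ? slice : MIN_TIMESLICE;
}

/* Same formula as LOAD_WEIGHT()/PRIO_TO_LOAD_WEIGHT() in the patch. */
static unsigned long prio_to_load_weight(int static_prio)
{
	return static_prio_timeslice(static_prio) * SCHED_LOAD_SCALE /
		DEF_TIMESLICE;
}

int main(void)
{
	static const int nice_levels[] = { -20, -10, 0, 10, 19 };
	unsigned int i;

	/* Under these assumptions nice 0 works out to exactly SCHED_LOAD_SCALE. */
	for (i = 0; i < sizeof(nice_levels) / sizeof(nice_levels[0]); i++) {
		int prio = NICE_TO_PRIO(nice_levels[i]);

		printf("nice %3d -> timeslice %3u, load_weight %4lu\n",
		       nice_levels[i], static_prio_timeslice(prio),
		       prio_to_load_weight(prio));
	}
	return 0;
}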
@@ -952,42 +1107,10 @@ inline int task_curr(const struct task_struct *p) /* Used instead of source_load when we know the type == 0 */ unsigned long weighted_cpuload(const int cpu) { - return cpu_rq(cpu)->ls.load.weight; -} - -static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -{ -#ifdef CONFIG_SMP - task_thread_info(p)->cpu = cpu; - set_task_cfs_rq(p); -#endif + return cpu_rq(cpu)->raw_weighted_load; } #ifdef CONFIG_SMP - -void set_task_cpu(struct task_struct *p, unsigned int new_cpu) -{ - int old_cpu = task_cpu(p); - struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); - u64 clock_offset, fair_clock_offset; - - clock_offset = old_rq->clock - new_rq->clock; - fair_clock_offset = old_rq->cfs.fair_clock - - new_rq->cfs.fair_clock; - if (p->se.wait_start) - p->se.wait_start -= clock_offset; - if (p->se.wait_start_fair) - p->se.wait_start_fair -= fair_clock_offset; - if (p->se.sleep_start) - p->se.sleep_start -= clock_offset; - if (p->se.block_start) - p->se.block_start -= clock_offset; - if (p->se.sleep_start_fair) - p->se.sleep_start_fair -= fair_clock_offset; - - __set_task_cpu(p, new_cpu); -} - struct migration_req { struct list_head list; @@ -1010,7 +1133,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. */ - if (!p->se.on_rq && !task_running(rq, p)) { + if (!p->array && !task_running(rq, p)) { set_task_cpu(p, dest_cpu); return 0; } @@ -1035,8 +1158,9 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) void wait_task_inactive(struct task_struct *p) { unsigned long flags; - int running, on_rq; struct rq *rq; + struct prio_array *array; + int running; repeat: /* @@ -1068,7 +1192,7 @@ void wait_task_inactive(struct task_struct *p) */ rq = task_rq_lock(p, &flags); running = task_running(rq, p); - on_rq = p->se.on_rq; + array = p->array; task_rq_unlock(rq, &flags); /* @@ -1091,7 +1215,7 @@ void wait_task_inactive(struct task_struct *p) * running right now), it's preempted, and we should * yield - it could be a while. */ - if (unlikely(on_rq)) { + if (unlikely(array)) { yield(); goto repeat; } @@ -1137,12 +1261,11 @@ void kick_process(struct task_struct *p) static inline unsigned long source_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); if (type == 0) - return total; + return rq->raw_weighted_load; - return min(rq->cpu_load[type-1], total); + return min(rq->cpu_load[type-1], rq->raw_weighted_load); } /* @@ -1152,12 +1275,11 @@ static inline unsigned long source_load(int cpu, int type) static inline unsigned long target_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); if (type == 0) - return total; + return rq->raw_weighted_load; - return max(rq->cpu_load[type-1], total); + return max(rq->cpu_load[type-1], rq->raw_weighted_load); } /* @@ -1166,10 +1288,9 @@ static inline unsigned long target_load(int cpu, int type) static inline unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); unsigned long n = rq->nr_running; - return n ? total / n : SCHED_LOAD_SCALE; + return n ? 
rq->raw_weighted_load / n : SCHED_LOAD_SCALE; } /* @@ -1271,9 +1392,9 @@ static int sched_balance_self(int cpu, int flag) struct sched_domain *tmp, *sd = NULL; for_each_domain(cpu, tmp) { - /* - * If power savings logic is enabled for a domain, stop there. - */ + /* + * If power savings logic is enabled for a domain, stop there. + */ if (tmp->flags & SD_POWERSAVINGS_BALANCE) break; if (tmp->flags & flag) @@ -1356,9 +1477,9 @@ static int wake_idle(int cpu, struct task_struct *p) if (idle_cpu(i)) return i; } - } else { - break; } + else + break; } return cpu; } @@ -1400,7 +1521,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (!(old_state & state)) goto out; - if (p->se.on_rq) + if (p->array) goto out_running; cpu = task_cpu(p); @@ -1455,11 +1576,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) * of the current CPU: */ if (sync) - tl -= current->se.load.weight; + tl -= current->load_weight; if ((tl <= load && tl + target_load(cpu, idx) <= tl_per_task) || - 100*(tl + p->se.load.weight) <= imbalance*load) { + 100*(tl + p->load_weight) <= imbalance*load) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and @@ -1493,7 +1614,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) old_state = p->state; if (!(old_state & state)) goto out; - if (p->se.on_rq) + if (p->array) goto out_running; this_cpu = smp_processor_id(); @@ -1502,7 +1623,25 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) out_activate: #endif /* CONFIG_SMP */ - activate_task(rq, p, 1); + if (old_state == TASK_UNINTERRUPTIBLE) { + rq->nr_uninterruptible--; + /* + * Tasks on involuntary sleep don't earn + * sleep_avg beyond just interactive state. + */ + p->sleep_type = SLEEP_NONINTERACTIVE; + } else + + /* + * Tasks that have marked their sleep as noninteractive get + * woken up with their sleep average not weighted in an + * interactive way. + */ + if (old_state & TASK_NONINTERACTIVE) + p->sleep_type = SLEEP_NONINTERACTIVE; + + + activate_task(p, rq, cpu == this_cpu); /* * Sync wakeups (i.e. those types of wakeups where the waker * has indicated that it will leave the CPU in short order) @@ -1511,8 +1650,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) * the waker guarantees that the freshly woken up task is going * to be considered on this CPU.) */ - if (!sync || cpu != this_cpu) - check_preempt_curr(rq, p); + if (!sync || cpu != this_cpu) { + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } success = 1; out_running: @@ -1535,36 +1676,19 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state) return try_to_wake_up(p, state, 0); } +static void task_running_tick(struct rq *rq, struct task_struct *p); /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. 
- * - * __sched_fork() is basic setup used by init_idle() too: - */ -static void __sched_fork(struct task_struct *p) -{ - p->se.wait_start_fair = 0; - p->se.wait_start = 0; - p->se.exec_start = 0; - p->se.sum_exec_runtime = 0; - p->se.delta_exec = 0; - p->se.delta_fair_run = 0; - p->se.delta_fair_sleep = 0; - p->se.wait_runtime = 0; - p->se.sum_wait_runtime = 0; - p->se.sum_sleep_runtime = 0; - p->se.sleep_start = 0; - p->se.sleep_start_fair = 0; - p->se.block_start = 0; - p->se.sleep_max = 0; - p->se.block_max = 0; - p->se.exec_max = 0; - p->se.wait_max = 0; - p->se.wait_runtime_overruns = 0; - p->se.wait_runtime_underruns = 0; + */ +void fastcall sched_fork(struct task_struct *p, int clone_flags) +{ + int cpu = get_cpu(); - INIT_LIST_HEAD(&p->run_list); - p->se.on_rq = 0; +#ifdef CONFIG_SMP + cpu = sched_balance_self(cpu, SD_BALANCE_FORK); +#endif + set_task_cpu(p, cpu); /* * We mark the process as running here, but have not actually @@ -1573,29 +1697,16 @@ static void __sched_fork(struct task_struct *p) * event cannot wake it up and insert it on the runqueue either. */ p->state = TASK_RUNNING; -} - -/* - * fork()/clone()-time setup: - */ -void sched_fork(struct task_struct *p, int clone_flags) -{ - int cpu = get_cpu(); - - __sched_fork(p); - -#ifdef CONFIG_SMP - cpu = sched_balance_self(cpu, SD_BALANCE_FORK); -#endif - __set_task_cpu(p, cpu); /* * Make sure we do not leak PI boosting priority to the child: */ p->prio = current->normal_prio; + INIT_LIST_HEAD(&p->run_list); + p->array = NULL; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) - if (likely(sched_info_on())) + if (unlikely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) @@ -1605,15 +1716,33 @@ void sched_fork(struct task_struct *p, int clone_flags) /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif + /* + * Share the timeslice between parent and child, thus the + * total amount of pending timeslices in the system doesn't change, + * resulting in more scheduling fairness. + */ + local_irq_disable(); + p->time_slice = (current->time_slice + 1) >> 1; + /* + * The remainder of the first timeslice might be recovered by + * the parent if the child exits early enough. + */ + p->first_time_slice = 1; + current->time_slice >>= 1; + p->timestamp = sched_clock(); + if (unlikely(!current->time_slice)) { + /* + * This case is rare, it happens when the parent has only + * a single jiffy left from its timeslice. Taking the + * runqueue lock is not a problem. + */ + current->time_slice = 1; + task_running_tick(cpu_rq(cpu), current); + } + local_irq_enable(); put_cpu(); } -/* - * After fork, child runs first. (default) If set to 0 then - * parent will (try to) run first. - */ -unsigned int __read_mostly sysctl_sched_child_runs_first = 1; - /* * wake_up_new_task - wake up a newly created task for the first time. 
* @@ -1623,27 +1752,107 @@ unsigned int __read_mostly sysctl_sched_child_runs_first = 1; */ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { + struct rq *rq, *this_rq; unsigned long flags; - struct rq *rq; - int this_cpu; + int this_cpu, cpu; rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); - this_cpu = smp_processor_id(); /* parent's CPU */ + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + + /* + * We decrease the sleep average of forking parents + * and children as well, to keep max-interactive tasks + * from forking tasks that are max-interactive. The parent + * (current) is done further down, under its lock. + */ + p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * + CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); p->prio = effective_prio(p); - if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) || - task_cpu(p) != this_cpu || !current->se.on_rq) { - activate_task(rq, p, 0); + if (likely(cpu == this_cpu)) { + if (!(clone_flags & CLONE_VM)) { + /* + * The VM isn't cloned, so we're in a good position to + * do child-runs-first in anticipation of an exec. This + * usually avoids a lot of COW overhead. + */ + if (unlikely(!current->array)) + __activate_task(p, rq); + else { + p->prio = current->prio; + p->normal_prio = current->normal_prio; + list_add_tail(&p->run_list, ¤t->run_list); + p->array = current->array; + p->array->nr_active++; + inc_nr_running(p, rq); + } + set_need_resched(); + } else + /* Run child last */ + __activate_task(p, rq); + /* + * We skip the following code due to cpu == this_cpu + * + * task_rq_unlock(rq, &flags); + * this_rq = task_rq_lock(current, &flags); + */ + this_rq = rq; } else { + this_rq = cpu_rq(this_cpu); + + /* + * Not the local CPU - must adjust timestamp. This should + * get optimised away in the !CONFIG_SMP case. + */ + p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; + __activate_task(p, rq); + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + /* - * Let the scheduling class do new task startup - * management (if any): + * Parent and child are on different CPUs, now get the + * parent runqueue to update the parent's ->sleep_avg: */ - p->sched_class->task_new(rq, p); + task_rq_unlock(rq, &flags); + this_rq = task_rq_lock(current, &flags); + } + current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * + PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + task_rq_unlock(this_rq, &flags); +} + +/* + * Potentially available exiting-child timeslices are + * retrieved here - this way the parent does not get + * penalized for creating too many threads. + * + * (this cannot be used to 'generate' timeslices + * artificially, because any timeslice recovered here + * was given away by the parent in the first place.) + */ +void fastcall sched_exit(struct task_struct *p) +{ + unsigned long flags; + struct rq *rq; + + /* + * If the child was a (relative-) CPU hog then decrease + * the sleep_avg of the parent as well. 
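+ * (The remainder of the child's first timeslice, if it still holds one,
+ * is handed back to the parent below, capped at a full task_timeslice();
+ * as noted above, this cannot manufacture extra CPU time.)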
+ */ + rq = task_rq_lock(p->parent, &flags); + if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { + p->parent->time_slice += p->time_slice; + if (unlikely(p->parent->time_slice > task_timeslice(p))) + p->parent->time_slice = task_timeslice(p); } - check_preempt_curr(rq, p); + if (p->sleep_avg < p->parent->sleep_avg) + p->parent->sleep_avg = p->parent->sleep_avg / + (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / + (EXIT_WEIGHT + 1); task_rq_unlock(rq, &flags); } @@ -1708,7 +1917,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) /* * Remove function-return probe instances associated with this * task and put them back on the free list. - */ + */ kprobe_flush_task(prev); put_task_struct(prev); } @@ -1736,15 +1945,13 @@ asmlinkage void schedule_tail(struct task_struct *prev) * context_switch - switch to the new MM and the new * thread's register state. */ -static inline void +static inline struct task_struct * context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - struct mm_struct *mm, *oldmm; + struct mm_struct *mm = next->mm; + struct mm_struct *oldmm = prev->active_mm; - prepare_task_switch(rq, next); - mm = next->mm; - oldmm = prev->active_mm; /* * For paravirt, this is coupled with an exit in switch_to to * combine the page table reload and the switch backend into @@ -1752,15 +1959,16 @@ context_switch(struct rq *rq, struct task_struct *prev, */ arch_enter_lazy_cpu_mode(); - if (unlikely(!mm)) { + if (!mm) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else switch_mm(oldmm, mm, next); - if (unlikely(!prev->mm)) { + if (!prev->mm) { prev->active_mm = NULL; + WARN_ON(rq->prev_mm); rq->prev_mm = oldmm; } /* @@ -1776,13 +1984,7 @@ context_switch(struct rq *rq, struct task_struct *prev, /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); - barrier(); - /* - * this_rq must be evaluated again because prev may have moved - * CPUs since it called schedule(), thus the 'rq' on its stack - * frame will be invalid. - */ - finish_task_switch(this_rq(), prev); + return prev; } /* @@ -1855,65 +2057,17 @@ unsigned long nr_active(void) return running + uninterruptible; } +#ifdef CONFIG_SMP + /* - * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). 
+ * Is this task likely cache-hot: */ -static void update_cpu_load(struct rq *this_rq) +static inline int +task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd) { - u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64; - unsigned long total_load = this_rq->ls.load.weight; - unsigned long this_load = total_load; - struct load_stat *ls = &this_rq->ls; - u64 now = __rq_clock(this_rq); - int i, scale; - - this_rq->nr_load_updates++; - if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD))) - goto do_avg; - - /* Update delta_fair/delta_exec fields first */ - update_curr_load(this_rq, now); - - fair_delta64 = ls->delta_fair + 1; - ls->delta_fair = 0; - - exec_delta64 = ls->delta_exec + 1; - ls->delta_exec = 0; - - sample_interval64 = now - ls->load_update_last; - ls->load_update_last = now; - - if ((s64)sample_interval64 < (s64)TICK_NSEC) - sample_interval64 = TICK_NSEC; - - if (exec_delta64 > sample_interval64) - exec_delta64 = sample_interval64; - - idle_delta64 = sample_interval64 - exec_delta64; - - tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64); - tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64); - - this_load = (unsigned long)tmp64; - -do_avg: - - /* Update our load: */ - for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { - unsigned long old_load, new_load; - - /* scale is effectively 1 << i now, and >> i divides by scale */ - - old_load = this_rq->cpu_load[i]; - new_load = this_load; - - this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; - } + return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; } -#ifdef CONFIG_SMP - /* * double_rq_lock - safely lock two runqueues * @@ -2030,17 +2184,23 @@ void sched_exec(void) * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. */ -static void pull_task(struct rq *src_rq, struct task_struct *p, - struct rq *this_rq, int this_cpu) +static void pull_task(struct rq *src_rq, struct prio_array *src_array, + struct task_struct *p, struct rq *this_rq, + struct prio_array *this_array, int this_cpu) { - deactivate_task(src_rq, p, 0); + dequeue_task(p, src_array); + dec_nr_running(p, src_rq); set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); + inc_nr_running(p, this_rq); + enqueue_task(p, this_array); + p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) + + this_rq->most_recent_timestamp; /* * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. */ - check_preempt_curr(this_rq, p); + if (TASK_PREEMPTS_CURR(p, this_rq)) + resched_task(this_rq->curr); } /* @@ -2048,7 +2208,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, */ static int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, - struct sched_domain *sd, enum cpu_idle_type idle, + struct sched_domain *sd, enum idle_type idle, int *all_pinned) { /* @@ -2065,67 +2225,132 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, return 0; /* - * Aggressive migration if too many balance attempts have failed: + * Aggressive migration if: + * 1) task is cache cold, or + * 2) too many balance attempts have failed. 
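+ * "Cache cold" here means task_hot() above is false, i.e. the task last
+ * ran longer than sd->cache_hot_time before rq->most_recent_timestamp.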
*/ - if (sd->nr_balance_failed > sd->cache_nice_tries) + + if (sd->nr_balance_failed > sd->cache_nice_tries) { +#ifdef CONFIG_SCHEDSTATS + if (task_hot(p, rq->most_recent_timestamp, sd)) + schedstat_inc(sd, lb_hot_gained[idle]); +#endif return 1; + } + if (task_hot(p, rq->most_recent_timestamp, sd)) + return 0; return 1; } -static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, +#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) + +/* + * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted + * load from busiest to this_rq, as part of a balancing operation within + * "domain". Returns the number of tasks moved. + * + * Called with both runqueues locked. + */ +static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, unsigned long *load_moved, - int this_best_prio, int best_prio, int best_prio_seen, - struct rq_iterator *iterator) + struct sched_domain *sd, enum idle_type idle, + int *all_pinned) { - int pulled = 0, pinned = 0, skip_for_load; - struct task_struct *p; - long rem_load_move = max_load_move; + int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, + best_prio_seen, skip_for_load; + struct prio_array *array, *dst_array; + struct list_head *head, *curr; + struct task_struct *tmp; + long rem_load_move; if (max_nr_move == 0 || max_load_move == 0) goto out; + rem_load_move = max_load_move; pinned = 1; + this_best_prio = rq_best_prio(this_rq); + best_prio = rq_best_prio(busiest); + /* + * Enable handling of the case where there is more than one task + * with the best priority. If the current running task is one + * of those with prio==best_prio we know it won't be moved + * and therefore it's safe to override the skip (based on load) of + * any task we find with that prio. + */ + best_prio_seen = best_prio == busiest->curr->prio; /* - * Start the load-balancing iterator: + * We first consider expired tasks. Those will likely not be + * executed in the near future, and they are most likely to + * be cache-cold, thus switching CPUs has the least effect + * on them. */ - p = iterator->start(iterator->arg); -next: - if (!p) + if (busiest->expired->nr_active) { + array = busiest->expired; + dst_array = this_rq->expired; + } else { + array = busiest->active; + dst_array = this_rq->active; + } + +new_array: + /* Start searching at priority 0: */ + idx = 0; +skip_bitmap: + if (!idx) + idx = sched_find_first_bit(array->bitmap); + else + idx = find_next_bit(array->bitmap, MAX_PRIO, idx); + if (idx >= MAX_PRIO) { + if (array == busiest->expired && busiest->active->nr_active) { + array = busiest->active; + dst_array = this_rq->active; + goto new_array; + } goto out; + } + + head = array->queue + idx; + curr = head->prev; +skip_queue: + tmp = list_entry(curr, struct task_struct, run_list); + + curr = curr->prev; + /* * To help distribute high priority tasks accross CPUs we don't * skip a task if it will be the highest priority task (i.e. 
smallest * prio value) on its new queue regardless of its load weight */ - skip_for_load = (p->se.load.weight >> 1) > rem_load_move + - SCHED_LOAD_SCALE_FUZZ; - if (skip_for_load && p->prio < this_best_prio) - skip_for_load = !best_prio_seen && p->prio == best_prio; + skip_for_load = tmp->load_weight > rem_load_move; + if (skip_for_load && idx < this_best_prio) + skip_for_load = !best_prio_seen && idx == best_prio; if (skip_for_load || - !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { + !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { - best_prio_seen |= p->prio == best_prio; - p = iterator->next(iterator->arg); - goto next; + best_prio_seen |= idx == best_prio; + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; } - pull_task(busiest, p, this_rq, this_cpu); + pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); pulled++; - rem_load_move -= p->se.load.weight; + rem_load_move -= tmp->load_weight; /* * We only want to steal up to the prescribed number of tasks * and the prescribed amount of weighted load. */ if (pulled < max_nr_move && rem_load_move > 0) { - if (p->prio < this_best_prio) - this_best_prio = p->prio; - p = iterator->next(iterator->arg); - goto next; + if (idx < this_best_prio) + this_best_prio = idx; + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; } out: /* @@ -2137,39 +2362,9 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, if (all_pinned) *all_pinned = pinned; - *load_moved = max_load_move - rem_load_move; return pulled; } -/* - * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted - * load from busiest to this_rq, as part of a balancing operation within - * "domain". Returns the number of tasks moved. - * - * Called with both runqueues locked. - */ -static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned) -{ - struct sched_class *class = sched_class_highest; - unsigned long load_moved, total_nr_moved = 0, nr_moved; - long rem_load_move = max_load_move; - - do { - nr_moved = class->load_balance(this_rq, this_cpu, busiest, - max_nr_move, (unsigned long)rem_load_move, - sd, idle, all_pinned, &load_moved); - total_nr_moved += nr_moved; - max_nr_move -= nr_moved; - rem_load_move -= load_moved; - class = class->next; - } while (class && max_nr_move && rem_load_move > 0); - - return total_nr_moved; -} - /* * find_busiest_group finds and returns the busiest CPU group within the * domain. 
It calculates and returns the amount of weighted load which @@ -2177,8 +2372,8 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, */ static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, cpumask_t *cpus, int *balance) + unsigned long *imbalance, enum idle_type idle, int *sd_idle, + cpumask_t *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -2196,9 +2391,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, max_load = this_load = total_load = total_pwr = 0; busiest_load_per_task = busiest_nr_running = 0; this_load_per_task = this_nr_running = 0; - if (idle == CPU_NOT_IDLE) + if (idle == NOT_IDLE) load_idx = sd->busy_idx; - else if (idle == CPU_NEWLY_IDLE) + else if (idle == NEWLY_IDLE) load_idx = sd->newidle_idx; else load_idx = sd->idle_idx; @@ -2242,7 +2437,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, avg_load += load; sum_nr_running += rq->nr_running; - sum_weighted_load += weighted_cpuload(i); + sum_weighted_load += rq->raw_weighted_load; } /* @@ -2282,9 +2477,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * Busy processors will not participate in power savings * balance. */ - if (idle == CPU_NOT_IDLE || - !(sd->flags & SD_POWERSAVINGS_BALANCE)) - goto group_next; + if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + goto group_next; /* * If the local group is idle or completely loaded @@ -2294,42 +2488,42 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, !this_nr_running)) power_savings_balance = 0; - /* + /* * If a group is already running at full capacity or idle, * don't include that group in power savings calculations - */ - if (!power_savings_balance || sum_nr_running >= group_capacity + */ + if (!power_savings_balance || sum_nr_running >= group_capacity || !sum_nr_running) - goto group_next; + goto group_next; - /* + /* * Calculate the group which has the least non-idle load. 
- * This is the group from where we need to pick up the load - * for saving power - */ - if ((sum_nr_running < min_nr_running) || - (sum_nr_running == min_nr_running && + * This is the group from where we need to pick up the load + * for saving power + */ + if ((sum_nr_running < min_nr_running) || + (sum_nr_running == min_nr_running && first_cpu(group->cpumask) < first_cpu(group_min->cpumask))) { - group_min = group; - min_nr_running = sum_nr_running; + group_min = group; + min_nr_running = sum_nr_running; min_load_per_task = sum_weighted_load / sum_nr_running; - } + } - /* + /* * Calculate the group which is almost near its - * capacity but still has some space to pick up some load - * from other group and save more power - */ - if (sum_nr_running <= group_capacity - 1) { - if (sum_nr_running > leader_nr_running || - (sum_nr_running == leader_nr_running && - first_cpu(group->cpumask) > - first_cpu(group_leader->cpumask))) { - group_leader = group; - leader_nr_running = sum_nr_running; - } + * capacity but still has some space to pick up some load + * from other group and save more power + */ + if (sum_nr_running <= group_capacity - 1) { + if (sum_nr_running > leader_nr_running || + (sum_nr_running == leader_nr_running && + first_cpu(group->cpumask) > + first_cpu(group_leader->cpumask))) { + group_leader = group; + leader_nr_running = sum_nr_running; + } } group_next: #endif @@ -2384,7 +2578,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * a think about bumping its value to force at least one task to be * moved */ - if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) { + if (*imbalance < busiest_load_per_task) { unsigned long tmp, pwr_now, pwr_move; unsigned int imbn; @@ -2398,8 +2592,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, } else this_load_per_task = SCHED_LOAD_SCALE; - if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= - busiest_load_per_task * imbn) { + if (max_load - this_load >= busiest_load_per_task * imbn) { *imbalance = busiest_load_per_task; return busiest; } @@ -2446,7 +2639,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, out_balanced: #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) goto ret; if (this == group_leader && group_leader != group_min) { @@ -2463,7 +2656,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * find_busiest_queue - find the busiest runqueue among the cpus in group. */ static struct rq * -find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, +find_busiest_queue(struct sched_group *group, enum idle_type idle, unsigned long imbalance, cpumask_t *cpus) { struct rq *busiest = NULL, *rq; @@ -2471,19 +2664,17 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, int i; for_each_cpu_mask(i, group->cpumask) { - unsigned long wl; if (!cpu_isset(i, *cpus)) continue; rq = cpu_rq(i); - wl = weighted_cpuload(i); - if (rq->nr_running == 1 && wl > imbalance) + if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) continue; - if (wl > max_load) { - max_load = wl; + if (rq->raw_weighted_load > max_load) { + max_load = rq->raw_weighted_load; busiest = rq; } } @@ -2507,7 +2698,7 @@ static inline unsigned long minus_1_or_zero(unsigned long n) * tasks if there is an imbalance. 
*/ static int load_balance(int this_cpu, struct rq *this_rq, - struct sched_domain *sd, enum cpu_idle_type idle, + struct sched_domain *sd, enum idle_type idle, int *balance) { int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; @@ -2520,10 +2711,10 @@ static int load_balance(int this_cpu, struct rq *this_rq, /* * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. In this case, - * let the state of idle sibling percolate up as CPU_IDLE, instead of - * portraying it as CPU_NOT_IDLE. + * let the state of idle sibling percolate up as IDLE, instead of + * portraying it as NOT_IDLE. */ - if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && + if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; @@ -2657,7 +2848,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. * - * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). + * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). * this_rq is locked. */ static int @@ -2674,31 +2865,31 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. In this case, * let the state of idle sibling percolate up as IDLE, instead of - * portraying it as CPU_NOT_IDLE. + * portraying it as NOT_IDLE. */ if (sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; - schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]); + schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); redo: - group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, + group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle, &cpus, NULL); if (!group) { - schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); + schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); goto out_balanced; } - busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, + busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance, &cpus); if (!busiest) { - schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); + schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); goto out_balanced; } BUG_ON(busiest == this_rq); - schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); + schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); nr_moved = 0; if (busiest->nr_running > 1) { @@ -2706,7 +2897,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) double_lock_balance(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, minus_1_or_zero(busiest->nr_running), - imbalance, sd, CPU_NEWLY_IDLE, NULL); + imbalance, sd, NEWLY_IDLE, NULL); spin_unlock(&busiest->lock); if (!nr_moved) { @@ -2717,7 +2908,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) } if (!nr_moved) { - schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); + schedstat_inc(sd, lb_failed[NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; @@ -2727,7 +2918,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) return nr_moved; out_balanced: - schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); + schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; @@ 
-2743,8 +2934,8 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) static void idle_balance(int this_cpu, struct rq *this_rq) { struct sched_domain *sd; - int pulled_task = -1; - unsigned long next_balance = jiffies + HZ; + int pulled_task = 0; + unsigned long next_balance = jiffies + 60 * HZ; for_each_domain(this_cpu, sd) { unsigned long interval; @@ -2763,13 +2954,12 @@ static void idle_balance(int this_cpu, struct rq *this_rq) if (pulled_task) break; } - if (pulled_task || time_after(jiffies, this_rq->next_balance)) { + if (!pulled_task) /* * We are going idle. next_balance may be set based on * a busy processor. So reset next_balance. */ this_rq->next_balance = next_balance; - } } /* @@ -2813,7 +3003,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) schedstat_inc(sd, alb_cnt); if (move_tasks(target_rq, target_cpu, busiest_rq, 1, - RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE, + RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, NULL)) schedstat_inc(sd, alb_pushed); else @@ -2822,6 +3012,32 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) spin_unlock(&target_rq->lock); } +static void update_load(struct rq *this_rq) +{ + unsigned long this_load; + unsigned int i, scale; + + this_load = this_rq->raw_weighted_load; + + /* Update our load: */ + for (i = 0, scale = 1; i < 3; i++, scale += scale) { + unsigned long old_load, new_load; + + /* scale is effectively 1 << i now, and >> i divides by scale */ + + old_load = this_rq->cpu_load[i]; + new_load = this_load; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale-1; + this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; + } +} + #ifdef CONFIG_NO_HZ static struct { atomic_t load_balancer; @@ -2904,7 +3120,7 @@ static DEFINE_SPINLOCK(balancing); * * Balancing parameters are set up in arch_init_sched_domains. */ -static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) +static inline void rebalance_domains(int cpu, enum idle_type idle) { int balance = 1; struct rq *rq = cpu_rq(cpu); @@ -2918,16 +3134,13 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) continue; interval = sd->balance_interval; - if (idle != CPU_IDLE) + if (idle != SCHED_IDLE) interval *= sd->busy_factor; /* scale ms to jiffies */ interval = msecs_to_jiffies(interval); if (unlikely(!interval)) interval = 1; - if (interval > HZ*NR_CPUS/10) - interval = HZ*NR_CPUS/10; - if (sd->flags & SD_SERIALIZE) { if (!spin_trylock(&balancing)) @@ -2941,7 +3154,7 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) * longer idle, or one of our SMT siblings is * not idle. */ - idle = CPU_NOT_IDLE; + idle = NOT_IDLE; } sd->last_balance = jiffies; } @@ -2969,12 +3182,11 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) */ static void run_rebalance_domains(struct softirq_action *h) { - int this_cpu = smp_processor_id(); - struct rq *this_rq = cpu_rq(this_cpu); - enum cpu_idle_type idle = this_rq->idle_at_tick ? - CPU_IDLE : CPU_NOT_IDLE; + int local_cpu = smp_processor_id(); + struct rq *local_rq = cpu_rq(local_cpu); + enum idle_type idle = local_rq->idle_at_tick ? 
SCHED_IDLE : NOT_IDLE; - rebalance_domains(this_cpu, idle); + rebalance_domains(local_cpu, idle); #ifdef CONFIG_NO_HZ /* @@ -2982,13 +3194,13 @@ static void run_rebalance_domains(struct softirq_action *h) * balancing on behalf of the other idle cpus whose ticks are * stopped. */ - if (this_rq->idle_at_tick && - atomic_read(&nohz.load_balancer) == this_cpu) { + if (local_rq->idle_at_tick && + atomic_read(&nohz.load_balancer) == local_cpu) { cpumask_t cpus = nohz.cpu_mask; struct rq *rq; int balance_cpu; - cpu_clear(this_cpu, cpus); + cpu_clear(local_cpu, cpus); for_each_cpu_mask(balance_cpu, cpus) { /* * If this cpu gets work to do, stop the load balancing @@ -3001,8 +3213,8 @@ static void run_rebalance_domains(struct softirq_action *h) rebalance_domains(balance_cpu, SCHED_IDLE); rq = cpu_rq(balance_cpu); - if (time_after(this_rq->next_balance, rq->next_balance)) - this_rq->next_balance = rq->next_balance; + if (time_after(local_rq->next_balance, rq->next_balance)) + local_rq->next_balance = rq->next_balance; } } #endif @@ -3015,8 +3227,9 @@ static void run_rebalance_domains(struct softirq_action *h) * idle load balancing owner or decide to stop the periodic load balancing, * if the whole system is idle. */ -static inline void trigger_load_balance(struct rq *rq, int cpu) +static inline void trigger_load_balance(int cpu) { + struct rq *rq = cpu_rq(cpu); #ifdef CONFIG_NO_HZ /* * If we were in the nohz mode recently and busy at the current @@ -3068,29 +3281,13 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) if (time_after_eq(jiffies, rq->next_balance)) raise_softirq(SCHED_SOFTIRQ); } - -#else /* CONFIG_SMP */ - +#else /* * on UP we do not need to balance between CPUs: */ static inline void idle_balance(int cpu, struct rq *rq) { } - -/* Avoid "used but not defined" warning on UP */ -static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, unsigned long *load_moved, - int this_best_prio, int best_prio, int best_prio_seen, - struct rq_iterator *iterator) -{ - *load_moved = 0; - - return 0; -} - #endif DEFINE_PER_CPU(struct kernel_stat, kstat); @@ -3098,27 +3295,53 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); /* - * Return p->sum_exec_runtime plus any more ns on the sched_clock - * that have not yet been banked in case the task is currently running. + * This is called on clock ticks and on context switches. + * Bank in p->sched_time the ns elapsed since the last tick or switch. + */ +static inline void +update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) +{ + p->sched_time += now - p->last_ran; + p->last_ran = rq->most_recent_timestamp = now; +} + +/* + * Return current->sched_time plus any more ns on the sched_clock + * that have not yet been banked. */ -unsigned long long task_sched_runtime(struct task_struct *p) +unsigned long long current_sched_time(const struct task_struct *p) { + unsigned long long ns; unsigned long flags; - u64 ns, delta_exec; - struct rq *rq; - rq = task_rq_lock(p, &flags); - ns = p->se.sum_exec_runtime; - if (rq->curr == p) { - delta_exec = rq_clock(rq) - p->se.exec_start; - if ((s64)delta_exec > 0) - ns += delta_exec; - } - task_rq_unlock(rq, &flags); + local_irq_save(flags); + ns = p->sched_time + sched_clock() - p->last_ran; + local_irq_restore(flags); return ns; } +/* + * We place interactive tasks back into the active array, if possible. 
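/*
 * Illustrative aside (user-space model, not from this patch): update_load(),
 * restored a couple of hunks above, keeps three exponentially decaying load
 * averages in cpu_load[0..2], where level i weights history by (2^i - 1)/2^i
 * and rounds up when the load is rising.  Only that arithmetic is mirrored
 * below; the sample load sequence is invented.
 */
#include <stdio.h>

#define NR_LOAD_IDX 3

static void model_update_load(unsigned long cpu_load[NR_LOAD_IDX],
			      unsigned long this_load)
{
	unsigned int i, scale;

	for (i = 0, scale = 1; i < NR_LOAD_IDX; i++, scale += scale) {
		unsigned long old_load = cpu_load[i];
		unsigned long new_load = this_load;

		/* round up so a rising load is not stuck just below target */
		if (new_load > old_load)
			new_load += scale - 1;
		cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
	}
}

int main(void)
{
	unsigned long cpu_load[NR_LOAD_IDX] = { 0, 0, 0 };
	unsigned long samples[] = { 10, 10, 10, 0, 0, 10 };
	unsigned int t;

	for (t = 0; t < sizeof(samples) / sizeof(samples[0]); t++) {
		model_update_load(cpu_load, samples[t]);
		printf("tick %u: load=%2lu -> cpu_load = { %lu, %lu, %lu }\n",
		       t, samples[t], cpu_load[0], cpu_load[1], cpu_load[2]);
	}
	return 0;
}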
+ * + * To guarantee that this does not starve expired tasks we ignore the + * interactivity of a task if the first expired task had to wait more + * than a 'reasonable' amount of time. This deadline timeout is + * load-dependent, as the frequency of array switched decreases with + * increasing number of running tasks. We also ignore the interactivity + * if a better static_prio task has expired: + */ +static inline int expired_starving(struct rq *rq) +{ + if (rq->curr->static_prio > rq->best_expired_prio) + return 1; + if (!STARVATION_LIMIT || !rq->expired_timestamp) + return 0; + if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) + return 1; + return 0; +} + /* * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to @@ -3192,6 +3415,81 @@ void account_steal_time(struct task_struct *p, cputime_t steal) cpustat->steal = cputime64_add(cpustat->steal, tmp); } +static void task_running_tick(struct rq *rq, struct task_struct *p) +{ + if (p->array != rq->active) { + /* Task has expired but was not scheduled yet */ + set_tsk_need_resched(p); + return; + } + spin_lock(&rq->lock); + /* + * The task was running during this tick - update the + * time slice counter. Note: we do not update a thread's + * priority until it either goes to sleep or uses up its + * timeslice. This makes it possible for interactive tasks + * to use up their timeslices at their highest priority levels. + */ + if (rt_task(p)) { + /* + * RR tasks need a special form of timeslice management. + * FIFO tasks have no timeslices. + */ + if ((p->policy == SCHED_RR) && !--p->time_slice) { + p->time_slice = task_timeslice(p); + p->first_time_slice = 0; + set_tsk_need_resched(p); + + /* put it at the end of the queue: */ + requeue_task(p, rq->active); + } + goto out_unlock; + } + if (!--p->time_slice) { + dequeue_task(p, rq->active); + set_tsk_need_resched(p); + p->prio = effective_prio(p); + p->time_slice = task_timeslice(p); + p->first_time_slice = 0; + + if (!rq->expired_timestamp) + rq->expired_timestamp = jiffies; + if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { + enqueue_task(p, rq->expired); + if (p->static_prio < rq->best_expired_prio) + rq->best_expired_prio = p->static_prio; + } else + enqueue_task(p, rq->active); + } else { + /* + * Prevent a too long timeslice allowing a task to monopolize + * the CPU. We do this by splitting up the timeslice into + * smaller pieces. + * + * Note: this does not mean the task's timeslices expire or + * get lost in any way, they just might be preempted by + * another task of equal priority. (one with higher + * priority would have preempted this task already.) We + * requeue this task to the end of the list on this priority + * level, which is in essence a round-robin of tasks with + * equal priority. + * + * This only applies to tasks in the interactive + * delta range with at least TIMESLICE_GRANULARITY to requeue. + */ + if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - + p->time_slice) % TIMESLICE_GRANULARITY(p)) && + (p->time_slice >= TIMESLICE_GRANULARITY(p)) && + (p->array == rq->active)) { + + requeue_task(p, rq->active); + set_tsk_need_resched(p); + } + } +out_unlock: + spin_unlock(&rq->lock); +} + /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. 
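/*
 * Illustrative aside (user-space model, not from this patch): the restored
 * task_running_tick() stops honouring interactivity once the first expired
 * task has waited "too long", as decided by expired_starving() above.  The
 * model below mirrors that check with plain integers; STARVATION_TICKS and
 * the sample runqueue numbers are invented stand-ins for the kernel's
 * STARVATION_LIMIT and jiffies bookkeeping.
 */
#include <stdio.h>

#define STARVATION_TICKS 100	/* invented stand-in for STARVATION_LIMIT */

struct rq_model {
	unsigned long now;		 /* "jiffies" */
	unsigned long expired_timestamp; /* 0 = expired array is empty */
	unsigned long nr_running;
	int curr_static_prio;
	int best_expired_prio;
};

static int expired_starving_model(const struct rq_model *rq)
{
	/* a better (lower) static priority already expired: switch soon */
	if (rq->curr_static_prio > rq->best_expired_prio)
		return 1;
	if (!rq->expired_timestamp)
		return 0;
	/* the deadline scales with the number of runnable tasks */
	return rq->now - rq->expired_timestamp >
	       STARVATION_TICKS * rq->nr_running;
}

int main(void)
{
	struct rq_model rq = {
		.now = 1000, .expired_timestamp = 500,
		.nr_running = 4, .curr_static_prio = 120,
		.best_expired_prio = 125,
	};

	printf("starving: %d\n", expired_starving_model(&rq)); /* 500 > 400 -> 1 */
	return 0;
}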
@@ -3201,19 +3499,20 @@ void account_steal_time(struct task_struct *p, cputime_t steal) */ void scheduler_tick(void) { + unsigned long long now = sched_clock(); + struct task_struct *p = current; int cpu = smp_processor_id(); + int idle_at_tick = idle_cpu(cpu); struct rq *rq = cpu_rq(cpu); - struct task_struct *curr = rq->curr; - spin_lock(&rq->lock); - if (curr != rq->idle) /* FIXME: needed? */ - curr->sched_class->task_tick(rq, curr); - update_cpu_load(rq); - spin_unlock(&rq->lock); + update_cpu_clock(p, rq, now); + if (!idle_at_tick) + task_running_tick(rq, p); #ifdef CONFIG_SMP - rq->idle_at_tick = idle_cpu(cpu); - trigger_load_balance(rq, cpu); + update_load(rq); + rq->idle_at_tick = idle_at_tick; + trigger_load_balance(cpu); #endif } @@ -3255,129 +3554,170 @@ EXPORT_SYMBOL(sub_preempt_count); #endif -/* - * Print scheduling while atomic bug: - */ -static noinline void __schedule_bug(struct task_struct *prev) +static inline int interactive_sleep(enum sleep_type sleep_type) { - printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n", - prev->comm, preempt_count(), prev->pid); - debug_show_held_locks(prev); - if (irqs_disabled()) - print_irqtrace_events(prev); - dump_stack(); + return (sleep_type == SLEEP_INTERACTIVE || + sleep_type == SLEEP_INTERRUPTED); } /* - * Various schedule()-time debugging checks and statistics: + * schedule() is the main scheduler function. */ -static inline void schedule_debug(struct task_struct *prev) +asmlinkage void __sched schedule(void) { + struct task_struct *prev, *next; + struct prio_array *array; + struct list_head *queue; + unsigned long long now; + unsigned long run_time; + int cpu, idx, new_prio; + long *switch_count; + struct rq *rq; + /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ - if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state)) - __schedule_bug(prev); - + if (unlikely(in_atomic() && !current->exit_state)) { + printk(KERN_ERR "BUG: scheduling while atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), current->pid); + debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); + dump_stack(); + } profile_hit(SCHED_PROFILING, __builtin_return_address(0)); - schedstat_inc(this_rq(), sched_cnt); -} +need_resched: + preempt_disable(); + prev = current; + release_kernel_lock(prev); +need_resched_nonpreemptible: + rq = this_rq(); -/* - * Pick up the highest-prio task: - */ -static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev, u64 now) -{ - struct sched_class *class; - struct task_struct *p; + /* + * The idle thread is not allowed to schedule! + * Remove this check after it has been exercised a bit. 
+ */ + if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { + printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + dump_stack(); + } + + schedstat_inc(rq, sched_cnt); + now = sched_clock(); + if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { + run_time = now - prev->timestamp; + if (unlikely((long long)(now - prev->timestamp) < 0)) + run_time = 0; + } else + run_time = NS_MAX_SLEEP_AVG; /* - * Optimization: we know that if all tasks are in - * the fair class we can call that function directly: + * Tasks charged proportionately less run_time at high sleep_avg to + * delay them losing their interactive status */ - if (likely(rq->nr_running == rq->cfs.nr_running)) { - p = fair_sched_class.pick_next_task(rq, now); - if (likely(p)) - return p; + run_time /= (CURRENT_BONUS(prev) ? : 1); + + spin_lock_irq(&rq->lock); + + switch_count = &prev->nivcsw; + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + switch_count = &prev->nvcsw; + if (unlikely((prev->state & TASK_INTERRUPTIBLE) && + unlikely(signal_pending(prev)))) + prev->state = TASK_RUNNING; + else { + if (prev->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; + deactivate_task(prev, rq); + } } - class = sched_class_highest; - for ( ; ; ) { - p = class->pick_next_task(rq, now); - if (p) - return p; + cpu = smp_processor_id(); + if (unlikely(!rq->nr_running)) { + idle_balance(cpu, rq); + if (!rq->nr_running) { + next = rq->idle; + rq->expired_timestamp = 0; + goto switch_tasks; + } + } + + array = rq->active; + if (unlikely(!array->nr_active)) { /* - * Will never be NULL as the idle class always - * returns a non-NULL p: + * Switch the active and expired arrays. */ - class = class->next; + schedstat_inc(rq, sched_switch); + rq->active = rq->expired; + rq->expired = array; + array = rq->active; + rq->expired_timestamp = 0; + rq->best_expired_prio = MAX_PRIO; } -} - -/* - * schedule() is the main scheduler function. 
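/*
 * Illustrative aside (user-space model, not from this patch): the schedule()
 * body restored above is the classic O(1) design - pick the lowest-numbered
 * priority with queued tasks in the active array, and when the active array
 * drains, swap the active and expired array pointers.  The real code uses a
 * priority bitmap and sched_find_first_bit(); the model below keeps only a
 * per-priority task count and scans linearly to stay short.
 */
#include <stdio.h>
#include <string.h>

#define MAX_PRIO 140

struct prio_array_model {
	unsigned int nr_active;
	unsigned int nr_tasks[MAX_PRIO];	/* stand-in for the per-prio run lists */
};

static int lowest_prio(const struct prio_array_model *a)
{
	int prio;

	for (prio = 0; prio < MAX_PRIO; prio++)
		if (a->nr_tasks[prio])
			return prio;
	return MAX_PRIO;			/* empty */
}

static void enqueue(struct prio_array_model *a, int prio)
{
	a->nr_tasks[prio]++;
	a->nr_active++;
}

static int pick_next(struct prio_array_model **active,
		     struct prio_array_model **expired)
{
	int prio;

	if (!(*active)->nr_active) {		/* active drained: switch arrays */
		struct prio_array_model *tmp = *active;
		*active = *expired;
		*expired = tmp;
	}
	prio = lowest_prio(*active);
	if (prio == MAX_PRIO)
		return -1;			/* nothing runnable anywhere: go idle */
	(*active)->nr_tasks[prio]--;
	(*active)->nr_active--;
	return prio;
}

int main(void)
{
	struct prio_array_model arrays[2];
	struct prio_array_model *active = &arrays[0], *expired = &arrays[1];

	memset(arrays, 0, sizeof(arrays));
	enqueue(active, 120);	/* nice-0 task */
	enqueue(active, 100);	/* nice -20 task */
	enqueue(expired, 130);	/* already-expired, +10 niced task */

	printf("picked prio %d\n", pick_next(&active, &expired)); /* 100 */
	printf("picked prio %d\n", pick_next(&active, &expired)); /* 120 */
	printf("picked prio %d\n", pick_next(&active, &expired)); /* 130, after switch */
	return 0;
}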
- */ -asmlinkage void __sched schedule(void) -{ - struct task_struct *prev, *next; - long *switch_count; - struct rq *rq; - u64 now; - int cpu; -need_resched: - preempt_disable(); - cpu = smp_processor_id(); - rq = cpu_rq(cpu); - rcu_qsctr_inc(cpu); - prev = rq->curr; - switch_count = &prev->nivcsw; + idx = sched_find_first_bit(array->bitmap); + queue = array->queue + idx; + next = list_entry(queue->next, struct task_struct, run_list); - release_kernel_lock(prev); -need_resched_nonpreemptible: + if (!rt_task(next) && interactive_sleep(next->sleep_type)) { + unsigned long long delta = now - next->timestamp; + if (unlikely((long long)(now - next->timestamp) < 0)) + delta = 0; - schedule_debug(prev); + if (next->sleep_type == SLEEP_INTERACTIVE) + delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; - spin_lock_irq(&rq->lock); - clear_tsk_need_resched(prev); + array = next->array; + new_prio = recalc_task_prio(next, next->timestamp + delta); - if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - if (unlikely((prev->state & TASK_INTERRUPTIBLE) && - unlikely(signal_pending(prev)))) { - prev->state = TASK_RUNNING; - } else { - deactivate_task(rq, prev, 1); + if (unlikely(next->prio != new_prio)) { + dequeue_task(next, array); + next->prio = new_prio; + enqueue_task(next, array); } - switch_count = &prev->nvcsw; } + next->sleep_type = SLEEP_NORMAL; +switch_tasks: + if (next == rq->idle) + schedstat_inc(rq, sched_goidle); + prefetch(next); + prefetch_stack(next); + clear_tsk_need_resched(prev); + rcu_qsctr_inc(task_cpu(prev)); - if (unlikely(!rq->nr_running)) - idle_balance(cpu, rq); + update_cpu_clock(prev, rq, now); - now = __rq_clock(rq); - prev->sched_class->put_prev_task(rq, prev, now); - next = pick_next_task(rq, prev, now); + prev->sleep_avg -= run_time; + if ((long)prev->sleep_avg <= 0) + prev->sleep_avg = 0; + prev->timestamp = prev->last_ran = now; sched_info_switch(prev, next); - if (likely(prev != next)) { + next->timestamp = next->last_ran = now; rq->nr_switches++; rq->curr = next; ++*switch_count; - context_switch(rq, prev, next); /* unlocks the rq */ + prepare_task_switch(rq, next); + prev = context_switch(rq, prev, next); + barrier(); + /* + * this_rq must be evaluated again because prev may have moved + * CPUs since it called schedule(), thus the 'rq' on its stack + * frame will be invalid. 
+ */ + finish_task_switch(this_rq(), prev); } else spin_unlock_irq(&rq->lock); - if (unlikely(reacquire_kernel_lock(current) < 0)) { - cpu = smp_processor_id(); - rq = cpu_rq(cpu); + prev = current; + if (unlikely(reacquire_kernel_lock(prev) < 0)) goto need_resched_nonpreemptible; - } preempt_enable_no_resched(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; @@ -3705,85 +4045,74 @@ wait_for_completion_interruptible_timeout(struct completion *x, } EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); -static inline void -sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) -{ - spin_lock_irqsave(&q->lock, *flags); - __add_wait_queue(q, wait); + +#define SLEEP_ON_VAR \ + unsigned long flags; \ + wait_queue_t wait; \ + init_waitqueue_entry(&wait, current); + +#define SLEEP_ON_HEAD \ + spin_lock_irqsave(&q->lock,flags); \ + __add_wait_queue(q, &wait); \ spin_unlock(&q->lock); -} -static inline void -sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) -{ - spin_lock_irq(&q->lock); - __remove_wait_queue(q, wait); - spin_unlock_irqrestore(&q->lock, *flags); -} +#define SLEEP_ON_TAIL \ + spin_lock_irq(&q->lock); \ + __remove_wait_queue(q, &wait); \ + spin_unlock_irqrestore(&q->lock, flags); -void __sched interruptible_sleep_on(wait_queue_head_t *q) +void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) { - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); + SLEEP_ON_VAR current->state = TASK_INTERRUPTIBLE; - sleep_on_head(q, &wait, &flags); + SLEEP_ON_HEAD schedule(); - sleep_on_tail(q, &wait, &flags); + SLEEP_ON_TAIL } EXPORT_SYMBOL(interruptible_sleep_on); -long __sched +long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) { - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); + SLEEP_ON_VAR current->state = TASK_INTERRUPTIBLE; - sleep_on_head(q, &wait, &flags); + SLEEP_ON_HEAD timeout = schedule_timeout(timeout); - sleep_on_tail(q, &wait, &flags); + SLEEP_ON_TAIL return timeout; } EXPORT_SYMBOL(interruptible_sleep_on_timeout); -void __sched sleep_on(wait_queue_head_t *q) +void fastcall __sched sleep_on(wait_queue_head_t *q) { - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); + SLEEP_ON_VAR current->state = TASK_UNINTERRUPTIBLE; - sleep_on_head(q, &wait, &flags); + SLEEP_ON_HEAD schedule(); - sleep_on_tail(q, &wait, &flags); + SLEEP_ON_TAIL } EXPORT_SYMBOL(sleep_on); -long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) +long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) { - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); + SLEEP_ON_VAR current->state = TASK_UNINTERRUPTIBLE; - sleep_on_head(q, &wait, &flags); + SLEEP_ON_HEAD timeout = schedule_timeout(timeout); - sleep_on_tail(q, &wait, &flags); + SLEEP_ON_TAIL return timeout; } + EXPORT_SYMBOL(sleep_on_timeout); #ifdef CONFIG_RT_MUTEXES @@ -3800,30 +4129,29 @@ EXPORT_SYMBOL(sleep_on_timeout); */ void rt_mutex_setprio(struct task_struct *p, int prio) { + struct prio_array *array; unsigned long flags; - int oldprio, on_rq; struct rq *rq; - u64 now; + int oldprio; BUG_ON(prio < 0 || prio > MAX_PRIO); rq = task_rq_lock(p, &flags); - now = rq_clock(rq); oldprio = p->prio; - on_rq = p->se.on_rq; - if (on_rq) - dequeue_task(rq, p, 0, now); - - if (rt_prio(prio)) - p->sched_class = &rt_sched_class; - else - p->sched_class = &fair_sched_class; - + array = 
p->array; + if (array) + dequeue_task(p, array); p->prio = prio; - if (on_rq) { - enqueue_task(rq, p, 0, now); + if (array) { + /* + * If changing to an RT priority then queue it + * in the active array! + */ + if (rt_task(p)) + array = rq->active; + enqueue_task(p, array); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on @@ -3832,9 +4160,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (task_running(rq, p)) { if (p->prio > oldprio) resched_task(rq->curr); - } else { - check_preempt_curr(rq, p); - } + } else if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); } task_rq_unlock(rq, &flags); } @@ -3843,10 +4170,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) void set_user_nice(struct task_struct *p, long nice) { - int old_prio, delta, on_rq; + struct prio_array *array; + int old_prio, delta; unsigned long flags; struct rq *rq; - u64 now; if (TASK_NICE(p) == nice || nice < -20 || nice > 19) return; @@ -3855,21 +4182,20 @@ void set_user_nice(struct task_struct *p, long nice) * the task might be in the middle of scheduling on another CPU. */ rq = task_rq_lock(p, &flags); - now = rq_clock(rq); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected * it wont have any effect on scheduling until the task is - * SCHED_FIFO/SCHED_RR: + * not SCHED_NORMAL/SCHED_BATCH: */ - if (task_has_rt_policy(p)) { + if (has_rt_policy(p)) { p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - on_rq = p->se.on_rq; - if (on_rq) { - dequeue_task(rq, p, 0, now); - dec_load(rq, p, now); + array = p->array; + if (array) { + dequeue_task(p, array); + dec_raw_weighted_load(rq, p); } p->static_prio = NICE_TO_PRIO(nice); @@ -3878,9 +4204,9 @@ void set_user_nice(struct task_struct *p, long nice) p->prio = effective_prio(p); delta = p->prio - old_prio; - if (on_rq) { - enqueue_task(rq, p, 0, now); - inc_load(rq, p, now); + if (array) { + enqueue_task(p, array); + inc_raw_weighted_load(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -4000,28 +4326,20 @@ static inline struct task_struct *find_process_by_pid(pid_t pid) } /* Actually do priority change: must hold rq lock. 
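/*
 * Illustrative aside (user-space model, not from this patch): rt_mutex_setprio()
 * and set_user_nice() above dequeue a runnable task, recompute its priority,
 * re-enqueue it, and reschedule only when that matters.  The model below shows
 * the nice -> static priority mapping and the resched decision; NICE_TO_PRIO
 * follows the usual 2.6-era definition (MAX_RT_PRIO + nice + 20), quoted from
 * memory rather than from this hunk, and the sample priorities are invented.
 */
#include <stdio.h>

#define MAX_RT_PRIO		100
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)

static int needs_resched(int running, int old_prio, int new_prio, int curr_prio)
{
	/* lower numbers mean higher priority */
	if (running)
		return new_prio > old_prio;	/* got worse while on the CPU */
	return new_prio < curr_prio;		/* would now preempt the running task */
}

int main(void)
{
	int old_prio = NICE_TO_PRIO(0);		/* 120 */
	int new_prio = NICE_TO_PRIO(10);	/* 130 after "renice +10" */

	printf("old=%d new=%d -> resched while running: %d\n",
	       old_prio, new_prio, needs_resched(1, old_prio, new_prio, 120));
	return 0;
}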
*/ -static void -__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) +static void __setscheduler(struct task_struct *p, int policy, int prio) { - BUG_ON(p->se.on_rq); + BUG_ON(p->array); p->policy = policy; - switch (p->policy) { - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: - p->sched_class = &fair_sched_class; - break; - case SCHED_FIFO: - case SCHED_RR: - p->sched_class = &rt_sched_class; - break; - } - p->rt_priority = prio; p->normal_prio = normal_prio(p); /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); + /* + * SCHED_BATCH tasks are treated as perpetual CPU hogs: + */ + if (policy == SCHED_BATCH) + p->sleep_avg = 0; set_load_weight(p); } @@ -4036,7 +4354,8 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) { - int retval, oldprio, oldpolicy = -1, on_rq; + int retval, oldprio, oldpolicy = -1; + struct prio_array *array; unsigned long flags; struct rq *rq; @@ -4047,27 +4366,27 @@ int sched_setscheduler(struct task_struct *p, int policy, if (policy < 0) policy = oldpolicy = p->policy; else if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH && - policy != SCHED_IDLE) + policy != SCHED_NORMAL && policy != SCHED_BATCH) return -EINVAL; /* * Valid priorities for SCHED_FIFO and SCHED_RR are - * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, - * SCHED_BATCH and SCHED_IDLE is 0. + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and + * SCHED_BATCH is 0. */ if (param->sched_priority < 0 || (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) return -EINVAL; - if (rt_policy(policy) != (param->sched_priority != 0)) + if (is_rt_policy(policy) != (param->sched_priority != 0)) return -EINVAL; /* * Allow unprivileged RT tasks to decrease priority: */ if (!capable(CAP_SYS_NICE)) { - if (rt_policy(policy)) { + if (is_rt_policy(policy)) { unsigned long rlim_rtprio; + unsigned long flags; if (!lock_task_sighand(p, &flags)) return -ESRCH; @@ -4083,12 +4402,6 @@ int sched_setscheduler(struct task_struct *p, int policy, param->sched_priority > rlim_rtprio) return -EPERM; } - /* - * Like positive nice levels, dont allow tasks to - * move out of SCHED_IDLE either: - */ - if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) - return -EPERM; /* can't change other user's priorities */ if ((current->euid != p->euid) && @@ -4116,13 +4429,13 @@ int sched_setscheduler(struct task_struct *p, int policy, spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } - on_rq = p->se.on_rq; - if (on_rq) - deactivate_task(rq, p, 0); + array = p->array; + if (array) + deactivate_task(p, rq); oldprio = p->prio; - __setscheduler(rq, p, policy, param->sched_priority); - if (on_rq) { - activate_task(rq, p, 0); + __setscheduler(p, policy, param->sched_priority); + if (array) { + __activate_task(p, rq); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on @@ -4131,9 +4444,8 @@ int sched_setscheduler(struct task_struct *p, int policy, if (task_running(rq, p)) { if (p->prio > oldprio) resched_task(rq->curr); - } else { - check_preempt_curr(rq, p); - } + } else if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); } __task_rq_unlock(rq); spin_unlock_irqrestore(&p->pi_lock, flags); @@ -4405,18 +4717,41 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int 
len, /** * sys_sched_yield - yield the current processor to other threads. * - * This function yields the current CPU to other tasks. If there are no - * other threads running on this CPU then this function will return. + * This function yields the current CPU by moving the calling thread + * to the expired array. If there are no other threads running on this + * CPU then this function will return. */ asmlinkage long sys_sched_yield(void) { struct rq *rq = this_rq_lock(); + struct prio_array *array = current->array, *target = rq->expired; + + schedstat_inc(rq, yld_cnt); + /* + * We implement yielding by moving the task into the expired + * queue. + * + * (special rule: RT tasks will just roundrobin in the active + * array.) + */ + if (rt_task(current)) + target = rq->active; - schedstat_inc(rq, yld_cnt); - if (unlikely(rq->nr_running == 1)) + if (array->nr_active == 1) { schedstat_inc(rq, yld_act_empty); - else - current->sched_class->yield_task(rq, current); + if (!rq->expired->nr_active) + schedstat_inc(rq, yld_both_empty); + } else if (!rq->expired->nr_active) + schedstat_inc(rq, yld_exp_empty); + + if (array != target) { + dequeue_task(current, array); + enqueue_task(current, target); + } else + /* + * requeue_task is cheaper so perform that if possible. + */ + requeue_task(current, array); /* * Since we are going to call schedule() anyway, there's @@ -4567,7 +4902,6 @@ asmlinkage long sys_sched_get_priority_max(int policy) break; case SCHED_NORMAL: case SCHED_BATCH: - case SCHED_IDLE: ret = 0; break; } @@ -4592,7 +4926,6 @@ asmlinkage long sys_sched_get_priority_min(int policy) break; case SCHED_NORMAL: case SCHED_BATCH: - case SCHED_IDLE: ret = 0; } return ret; @@ -4627,7 +4960,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) goto out_unlock; jiffies_to_timespec(p->policy == SCHED_FIFO ? - 0 : static_prio_timeslice(p->static_prio), &t); + 0 : task_timeslice(p), &t); read_unlock(&tasklist_lock); retval = copy_to_user(interval, &t, sizeof(t)) ? 
-EFAULT : 0; out_nounlock: @@ -4702,9 +5035,6 @@ void show_state_filter(unsigned long state_filter) touch_all_softlockup_watchdogs(); -#ifdef CONFIG_SCHED_DEBUG - sysrq_sched_debug_show(); -#endif read_unlock(&tasklist_lock); /* * Only show locks if all tasks are dumped: @@ -4713,11 +5043,6 @@ void show_state_filter(unsigned long state_filter) debug_show_all_locks(); } -void __cpuinit init_idle_bootup_task(struct task_struct *idle) -{ - idle->sched_class = &idle_sched_class; -} - /** * init_idle - set up an idle thread for a given CPU * @idle: task in question @@ -4731,12 +5056,13 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - __sched_fork(idle); - idle->se.exec_start = sched_clock(); - + idle->timestamp = sched_clock(); + idle->sleep_avg = 0; + idle->array = NULL; idle->prio = idle->normal_prio = MAX_PRIO; + idle->state = TASK_RUNNING; idle->cpus_allowed = cpumask_of_cpu(cpu); - __set_task_cpu(idle, cpu); + set_task_cpu(idle, cpu); spin_lock_irqsave(&rq->lock, flags); rq->curr = rq->idle = idle; @@ -4751,10 +5077,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) #else task_thread_info(idle)->preempt_count = 0; #endif - /* - * The idle tasks have their own, simple scheduling class: - */ - idle->sched_class = &idle_sched_class; } /* @@ -4766,28 +5088,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) */ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; -/* - * Increase the granularity value when there are more CPUs, - * because with more CPUs the 'effective latency' as visible - * to users decreases. But the relationship is not linear, - * so pick a second-best guess by going with the log2 of the - * number of CPUs. - * - * This idea comes from the SD scheduler of Con Kolivas: - */ -static inline void sched_init_granularity(void) -{ - unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long gran_limit = 10000000; - - sysctl_sched_granularity *= factor; - if (sysctl_sched_granularity > gran_limit) - sysctl_sched_granularity = gran_limit; - - sysctl_sched_runtime_limit = sysctl_sched_granularity * 4; - sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; -} - #ifdef CONFIG_SMP /* * This is how migration works: @@ -4861,7 +5161,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { struct rq *rq_dest, *rq_src; - int ret = 0, on_rq; + int ret = 0; if (unlikely(cpu_is_offline(dest_cpu))) return ret; @@ -4877,13 +5177,20 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (!cpu_isset(dest_cpu, p->cpus_allowed)) goto out; - on_rq = p->se.on_rq; - if (on_rq) - deactivate_task(rq_src, p, 0); set_task_cpu(p, dest_cpu); - if (on_rq) { - activate_task(rq_dest, p, 0); - check_preempt_curr(rq_dest, p); + if (p->array) { + /* + * Sync timestamp with rq_dest's before activating. + * The same thing could be achieved by doing this step + * afterwards, and pretending it was a local activate. + * This way is cleaner and logically correct. + */ + p->timestamp = p->timestamp - rq_src->most_recent_timestamp + + rq_dest->most_recent_timestamp; + deactivate_task(p, rq_src); + __activate_task(p, rq_dest); + if (TASK_PREEMPTS_CURR(p, rq_dest)) + resched_task(rq_dest->curr); } ret = 1; out: @@ -5035,8 +5342,7 @@ static void migrate_live_tasks(int src_cpu) write_unlock_irq(&tasklist_lock); } -/* - * Schedules idle task to be the next runnable task on current CPU. 
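/*
 * Illustrative aside (user-space model, not from this patch): __migrate_task()
 * above rebases a migrating task's timestamp from the source runqueue's clock
 * to the destination's, so its sleep/run history stays meaningful on the new
 * CPU.  The numbers below are invented; only the rebasing arithmetic mirrors
 * the patch.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long src_most_recent = 5000;	/* rq_src clock, ns */
	unsigned long long dst_most_recent = 9000;	/* rq_dest clock, ns */
	unsigned long long task_timestamp  = 4200;	/* stamped on rq_src */

	/* same offset from "now", expressed in the destination's clock */
	task_timestamp = task_timestamp - src_most_recent + dst_most_recent;

	printf("timestamp on rq_dest: %llu (800 ns before its clock)\n",
	       task_timestamp);
	return 0;
}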
+/* Schedules idle task to be the next runnable task on current CPU. * It does so by boosting its priority to highest possible and adding it to * the _front_ of the runqueue. Used by CPU offline code. */ @@ -5056,10 +5362,10 @@ void sched_idle_next(void) */ spin_lock_irqsave(&rq->lock, flags); - __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); + __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); /* Add idle task to the _front_ of its priority queue: */ - activate_idle_task(p, rq); + __activate_idle_task(p, rq); spin_unlock_irqrestore(&rq->lock, flags); } @@ -5109,15 +5415,16 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) static void migrate_dead_tasks(unsigned int dead_cpu) { struct rq *rq = cpu_rq(dead_cpu); - struct task_struct *next; + unsigned int arr, i; - for ( ; ; ) { - if (!rq->nr_running) - break; - next = pick_next_task(rq, rq->curr, rq_clock(rq)); - if (!next) - break; - migrate_dead(dead_cpu, next); + for (arr = 0; arr < 2; arr++) { + for (i = 0; i < MAX_PRIO; i++) { + struct list_head *list = &rq->arrays[arr].queue[i]; + + while (!list_empty(list)) + migrate_dead(dead_cpu, list_entry(list->next, + struct task_struct, run_list)); + } } } #endif /* CONFIG_HOTPLUG_CPU */ @@ -5141,14 +5448,14 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); + p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); if (IS_ERR(p)) return NOTIFY_BAD; p->flags |= PF_NOFREEZE; kthread_bind(p, cpu); /* Must be high prio: stop_machine expects to yield to it. */ rq = task_rq_lock(p, &flags); - __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); + __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); task_rq_unlock(rq, &flags); cpu_rq(cpu)->migration_thread = p; break; @@ -5179,10 +5486,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) rq->migration_thread = NULL; /* Idle task back to normal (off runqueue, low prio) */ rq = task_rq_lock(rq->idle, &flags); - deactivate_task(rq, rq->idle, 0); + deactivate_task(rq->idle, rq); rq->idle->static_prio = MAX_PRIO; - __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); - rq->idle->sched_class = &idle_sched_class; + __setscheduler(rq->idle, SCHED_NORMAL, 0); migrate_dead_tasks(cpu); task_rq_unlock(rq, &flags); migrate_nr_uninterruptible(rq); @@ -5491,6 +5797,483 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, #define SD_NODES_PER_DOMAIN 16 +/* + * Self-tuning task migration cost measurement between source and target CPUs. + * + * This is done by measuring the cost of manipulating buffers of varying + * sizes. For a given buffer-size here are the steps that are taken: + * + * 1) the source CPU reads+dirties a shared buffer + * 2) the target CPU reads+dirties the same shared buffer + * + * We measure how long they take, in the following 4 scenarios: + * + * - source: CPU1, target: CPU2 | cost1 + * - source: CPU2, target: CPU1 | cost2 + * - source: CPU1, target: CPU1 | cost3 + * - source: CPU2, target: CPU2 | cost4 + * + * We then calculate the cost3+cost4-cost1-cost2 difference - this is + * the cost of migration. + * + * We then start off from a small buffer-size and iterate up to larger + * buffer sizes, in 5% steps - measuring each buffer-size separately, and + * doing a maximum search for the cost. (The maximum cost for a migration + * normally occurs when the working set size is around the effective cache + * size.) 
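/*
 * Illustrative aside (not from this patch): the measurement described in the
 * comment above boils down to taking the two cross-CPU passes minus the two
 * same-CPU passes over the same buffer - which is what measure_cost() further
 * down computes by subtracting the cached cost from the cross-CPU cost.  The
 * helper below just evaluates that difference; the four timings are invented.
 */
#include <stdio.h>

static long long migration_cost_ns(long long cross1, long long cross2,
				   long long local1, long long local2)
{
	/* cross-CPU passes are slower; the surplus is the migration cost */
	return (cross1 + cross2) - (local1 + local2);
}

int main(void)
{
	/* invented timings, ns: CPU1->CPU2, CPU2->CPU1, CPU1->CPU1, CPU2->CPU2 */
	printf("migration cost: %lld ns\n",
	       migration_cost_ns(180000, 175000, 120000, 118000));
	return 0;
}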
+ */ +#define SEARCH_SCOPE 2 +#define MIN_CACHE_SIZE (64*1024U) +#define DEFAULT_CACHE_SIZE (5*1024*1024U) +#define ITERATIONS 1 +#define SIZE_THRESH 130 +#define COST_THRESH 130 + +/* + * The migration cost is a function of 'domain distance'. Domain + * distance is the number of steps a CPU has to iterate down its + * domain tree to share a domain with the other CPU. The farther + * two CPUs are from each other, the larger the distance gets. + * + * Note that we use the distance only to cache measurement results, + * the distance value is not used numerically otherwise. When two + * CPUs have the same distance it is assumed that the migration + * cost is the same. (this is a simplification but quite practical) + */ +#define MAX_DOMAIN_DISTANCE 32 + +static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = + { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = +/* + * Architectures may override the migration cost and thus avoid + * boot-time calibration. Unit is nanoseconds. Mostly useful for + * virtualized hardware: + */ +#ifdef CONFIG_DEFAULT_MIGRATION_COST + CONFIG_DEFAULT_MIGRATION_COST +#else + -1LL +#endif +}; + +/* + * Allow override of migration cost - in units of microseconds. + * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost + * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs: + */ +static int __init migration_cost_setup(char *str) +{ + int ints[MAX_DOMAIN_DISTANCE+1], i; + + str = get_options(str, ARRAY_SIZE(ints), ints); + + printk("#ints: %d\n", ints[0]); + for (i = 1; i <= ints[0]; i++) { + migration_cost[i-1] = (unsigned long long)ints[i]*1000; + printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]); + } + return 1; +} + +__setup ("migration_cost=", migration_cost_setup); + +/* + * Global multiplier (divisor) for migration-cutoff values, + * in percentiles. E.g. use a value of 150 to get 1.5 times + * longer cache-hot cutoff times. + * + * (We scale it from 100 to 128 to long long handling easier.) + */ + +#define MIGRATION_FACTOR_SCALE 128 + +static unsigned int migration_factor = MIGRATION_FACTOR_SCALE; + +static int __init setup_migration_factor(char *str) +{ + get_option(&str, &migration_factor); + migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100; + return 1; +} + +__setup("migration_factor=", setup_migration_factor); + +/* + * Estimated distance of two CPUs, measured via the number of domains + * we have to pass for the two CPUs to be in the same span: + */ +static unsigned long domain_distance(int cpu1, int cpu2) +{ + unsigned long distance = 0; + struct sched_domain *sd; + + for_each_domain(cpu1, sd) { + WARN_ON(!cpu_isset(cpu1, sd->span)); + if (cpu_isset(cpu2, sd->span)) + return distance; + distance++; + } + if (distance >= MAX_DOMAIN_DISTANCE) { + WARN_ON(1); + distance = MAX_DOMAIN_DISTANCE-1; + } + + return distance; +} + +static unsigned int migration_debug; + +static int __init setup_migration_debug(char *str) +{ + get_option(&str, &migration_debug); + return 1; +} + +__setup("migration_debug=", setup_migration_debug); + +/* + * Maximum cache-size that the scheduler should try to measure. + * Architectures with larger caches should tune this up during + * bootup. Gets used in the domain-setup code (i.e. during SMP + * bootup). + */ +unsigned int max_cache_size; + +static int __init setup_max_cache_size(char *str) +{ + get_option(&str, &max_cache_size); + return 1; +} + +__setup("max_cache_size=", setup_max_cache_size); + +/* + * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. 
This + * is the operation that is timed, so we try to generate unpredictable + * cachemisses that still end up filling the L2 cache: + */ +static void touch_cache(void *__cache, unsigned long __size) +{ + unsigned long size = __size / sizeof(long); + unsigned long chunk1 = size / 3; + unsigned long chunk2 = 2 * size / 3; + unsigned long *cache = __cache; + int i; + + for (i = 0; i < size/6; i += 8) { + switch (i % 6) { + case 0: cache[i]++; + case 1: cache[size-1-i]++; + case 2: cache[chunk1-i]++; + case 3: cache[chunk1+i]++; + case 4: cache[chunk2-i]++; + case 5: cache[chunk2+i]++; + } + } +} + +/* + * Measure the cache-cost of one task migration. Returns in units of nsec. + */ +static unsigned long long +measure_one(void *cache, unsigned long size, int source, int target) +{ + cpumask_t mask, saved_mask; + unsigned long long t0, t1, t2, t3, cost; + + saved_mask = current->cpus_allowed; + + /* + * Flush source caches to RAM and invalidate them: + */ + sched_cacheflush(); + + /* + * Migrate to the source CPU: + */ + mask = cpumask_of_cpu(source); + set_cpus_allowed(current, mask); + WARN_ON(smp_processor_id() != source); + + /* + * Dirty the working set: + */ + t0 = sched_clock(); + touch_cache(cache, size); + t1 = sched_clock(); + + /* + * Migrate to the target CPU, dirty the L2 cache and access + * the shared buffer. (which represents the working set + * of a migrated task.) + */ + mask = cpumask_of_cpu(target); + set_cpus_allowed(current, mask); + WARN_ON(smp_processor_id() != target); + + t2 = sched_clock(); + touch_cache(cache, size); + t3 = sched_clock(); + + cost = t1-t0 + t3-t2; + + if (migration_debug >= 2) + printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n", + source, target, t1-t0, t1-t0, t3-t2, cost); + /* + * Flush target caches to RAM and invalidate them: + */ + sched_cacheflush(); + + set_cpus_allowed(current, saved_mask); + + return cost; +} + +/* + * Measure a series of task migrations and return the average + * result. Since this code runs early during bootup the system + * is 'undisturbed' and the average latency makes sense. + * + * The algorithm in essence auto-detects the relevant cache-size, + * so it will properly detect different cachesizes for different + * cache-hierarchies, depending on how the CPUs are connected. + * + * Architectures can prime the upper limit of the search range via + * max_cache_size, otherwise the search range defaults to 20MB...64K. + */ +static unsigned long long +measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) +{ + unsigned long long cost1, cost2; + int i; + + /* + * Measure the migration cost of 'size' bytes, over an + * average of 10 runs: + * + * (We perturb the cache size by a small (0..4k) + * value to compensate size/alignment related artifacts. + * We also subtract the cost of the operation done on + * the same CPU.) 
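/*
 * Illustrative aside (user-space analogue, not from this patch): measure_one()
 * above pins itself to the source CPU, dirties the buffer, then pins itself to
 * the target CPU and dirties it again, timing both passes.  A rough user-space
 * sketch using sched_setaffinity() and clock_gettime() follows; the CPU
 * numbers, buffer size and the simple linear stride are invented, and it skips
 * the cache-flush step the kernel performs between passes.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static void pin_to_cpu(int cpu)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	if (sched_setaffinity(0, sizeof(set), &set))
		perror("sched_setaffinity");
}

static long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static void dirty(volatile long *buf, size_t words)
{
	size_t i;

	for (i = 0; i < words; i++)	/* plain linear pass, unlike touch_cache() */
		buf[i]++;
}

int main(void)
{
	size_t words = (4 * 1024 * 1024) / sizeof(long);
	volatile long *buf = calloc(words, sizeof(long));
	long long t0, t1, t2, t3;

	if (!buf)
		return 1;

	pin_to_cpu(0);			/* "source" */
	t0 = now_ns();
	dirty(buf, words);
	t1 = now_ns();

	pin_to_cpu(1);			/* "target" */
	t2 = now_ns();
	dirty(buf, words);
	t3 = now_ns();

	printf("source pass: %lld ns, target pass: %lld ns, sum: %lld ns\n",
	       t1 - t0, t3 - t2, (t1 - t0) + (t3 - t2));
	free((void *)buf);
	return 0;
}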
+ */ + cost1 = 0; + + /* + * dry run, to make sure we start off cache-cold on cpu1, + * and to get any vmalloc pagefaults in advance: + */ + measure_one(cache, size, cpu1, cpu2); + for (i = 0; i < ITERATIONS; i++) + cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2); + + measure_one(cache, size, cpu2, cpu1); + for (i = 0; i < ITERATIONS; i++) + cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1); + + /* + * (We measure the non-migrating [cached] cost on both + * cpu1 and cpu2, to handle CPUs with different speeds) + */ + cost2 = 0; + + measure_one(cache, size, cpu1, cpu1); + for (i = 0; i < ITERATIONS; i++) + cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1); + + measure_one(cache, size, cpu2, cpu2); + for (i = 0; i < ITERATIONS; i++) + cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2); + + /* + * Get the per-iteration migration cost: + */ + do_div(cost1, 2 * ITERATIONS); + do_div(cost2, 2 * ITERATIONS); + + return cost1 - cost2; +} + +static unsigned long long measure_migration_cost(int cpu1, int cpu2) +{ + unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0; + unsigned int max_size, size, size_found = 0; + long long cost = 0, prev_cost; + void *cache; + + /* + * Search from max_cache_size*5 down to 64K - the real relevant + * cachesize has to lie somewhere inbetween. + */ + if (max_cache_size) { + max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE); + size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE); + } else { + /* + * Since we have no estimation about the relevant + * search range + */ + max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE; + size = MIN_CACHE_SIZE; + } + + if (!cpu_online(cpu1) || !cpu_online(cpu2)) { + printk("cpu %d and %d not both online!\n", cpu1, cpu2); + return 0; + } + + /* + * Allocate the working set: + */ + cache = vmalloc(max_size); + if (!cache) { + printk("could not vmalloc %d bytes for cache!\n", 2 * max_size); + return 1000000; /* return 1 msec on very small boxen */ + } + + while (size <= max_size) { + prev_cost = cost; + cost = measure_cost(cpu1, cpu2, cache, size); + + /* + * Update the max: + */ + if (cost > 0) { + if (max_cost < cost) { + max_cost = cost; + size_found = size; + } + } + /* + * Calculate average fluctuation, we use this to prevent + * noise from triggering an early break out of the loop: + */ + fluct = abs(cost - prev_cost); + avg_fluct = (avg_fluct + fluct)/2; + + if (migration_debug) + printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): " + "(%8Ld %8Ld)\n", + cpu1, cpu2, size, + (long)cost / 1000000, + ((long)cost / 100000) % 10, + (long)max_cost / 1000000, + ((long)max_cost / 100000) % 10, + domain_distance(cpu1, cpu2), + cost, avg_fluct); + + /* + * If we iterated at least 20% past the previous maximum, + * and the cost has dropped by more than 20% already, + * (taking fluctuations into account) then we assume to + * have found the maximum and break out of the loop early: + */ + if (size_found && (size*100 > size_found*SIZE_THRESH)) + if (cost+avg_fluct <= 0 || + max_cost*100 > (cost+avg_fluct)*COST_THRESH) { + + if (migration_debug) + printk("-> found max.\n"); + break; + } + /* + * Increase the cachesize in 10% steps: + */ + size = size * 10 / 9; + } + + if (migration_debug) + printk("[%d][%d] working set size found: %d, cost: %Ld\n", + cpu1, cpu2, size_found, max_cost); + + vfree(cache); + + /* + * A task is considered 'cache cold' if at least 2 times + * the worst-case cost of migration has passed. 
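/*
 * Illustrative aside (user-space model, not from this patch):
 * measure_migration_cost() above walks buffer sizes up in ~10% steps, tracks
 * the maximum cost seen, and bails out once it is well past the size of that
 * maximum and the cost has clearly dropped (SIZE_THRESH and COST_THRESH are
 * both 130 in the patch).  The model below replaces the real measurement with
 * a made-up triangular cost curve and ignores the fluctuation averaging, to
 * show only the search and early-break logic.
 */
#include <stdio.h>
#include <stdlib.h>

#define SIZE_THRESH	130
#define COST_THRESH	130

/* invented stand-in for measure_cost(): peaks around a 2 MB working set */
static long long fake_cost(unsigned int size)
{
	long long peak = 2 * 1024 * 1024;

	return 1000000 - llabs((long long)size - peak) / 4;
}

int main(void)
{
	unsigned int size = 64 * 1024, max_size = 20 * 1024 * 1024;
	unsigned int size_found = 0;
	long long cost, max_cost = 0;

	while (size <= max_size) {
		cost = fake_cost(size);
		if (cost > max_cost) {
			max_cost = cost;
			size_found = size;
		}
		/* 30% past the maximum's size and the cost dropped >20%: stop */
		if (size_found && size * 100 > size_found * (unsigned int)SIZE_THRESH &&
		    max_cost * 100 > cost * COST_THRESH) {
			printf("stopping early at %u bytes\n", size);
			break;
		}
		size = size * 10 / 9;	/* ~10% larger each step */
	}
	printf("max cost %lld at %u bytes\n", max_cost, size_found);
	return 0;
}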
+ * + * (this limit is only listened to if the load-balancing + * situation is 'nice' - if there is a large imbalance we + * ignore it for the sake of CPU utilization and + * processing fairness.) + */ + return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE; +} + +static void calibrate_migration_costs(const cpumask_t *cpu_map) +{ + int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id(); + unsigned long j0, j1, distance, max_distance = 0; + struct sched_domain *sd; + + j0 = jiffies; + + /* + * First pass - calculate the cacheflush times: + */ + for_each_cpu_mask(cpu1, *cpu_map) { + for_each_cpu_mask(cpu2, *cpu_map) { + if (cpu1 == cpu2) + continue; + distance = domain_distance(cpu1, cpu2); + max_distance = max(max_distance, distance); + /* + * No result cached yet? + */ + if (migration_cost[distance] == -1LL) + migration_cost[distance] = + measure_migration_cost(cpu1, cpu2); + } + } + /* + * Second pass - update the sched domain hierarchy with + * the new cache-hot-time estimations: + */ + for_each_cpu_mask(cpu, *cpu_map) { + distance = 0; + for_each_domain(cpu, sd) { + sd->cache_hot_time = migration_cost[distance]; + distance++; + } + } + /* + * Print the matrix: + */ + if (migration_debug) + printk("migration: max_cache_size: %d, cpu: %d MHz:\n", + max_cache_size, +#ifdef CONFIG_X86 + cpu_khz/1000 +#else + -1 +#endif + ); + if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) { + printk("migration_cost="); + for (distance = 0; distance <= max_distance; distance++) { + if (distance) + printk(","); + printk("%ld", (long)migration_cost[distance] / 1000); + } + printk("\n"); + } + j1 = jiffies; + if (migration_debug) + printk("migration: %ld seconds\n", (j1-j0) / HZ); + + /* + * Move back to the original CPU. NUMA-Q gets confused + * if we migrate to another quad during bootup. 
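/*
 * Illustrative aside (user-space model, not from this patch):
 * calibrate_migration_costs() above measures CPU pairs but caches the result
 * per "domain distance", so all pairs at the same distance share one
 * measurement.  The toy below models 4 CPUs as 2 sibling pairs (distance 0
 * inside a pair, 1 across pairs) with a fake measurement; only the caching
 * structure mirrors the patch.
 */
#include <stdio.h>

#define NCPUS		4
#define MAX_DIST	2

static long long cost_cache[MAX_DIST] = { -1, -1 };

static int domain_distance_model(int c1, int c2)
{
	return (c1 / 2 == c2 / 2) ? 0 : 1;	/* invented topology */
}

static long long measure(int c1, int c2)
{
	printf("  measuring %d <-> %d\n", c1, c2);
	return domain_distance_model(c1, c2) ? 250000 : 80000;	/* fake ns */
}

int main(void)
{
	int c1, c2, d;

	for (c1 = 0; c1 < NCPUS; c1++)
		for (c2 = 0; c2 < NCPUS; c2++) {
			if (c1 == c2)
				continue;
			d = domain_distance_model(c1, c2);
			if (cost_cache[d] == -1)	/* no result cached yet */
				cost_cache[d] = measure(c1, c2);
		}

	for (d = 0; d < MAX_DIST; d++)
		printf("distance %d: cache_hot_time = %lld ns\n",
		       d, cost_cache[d]);
	return 0;
}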
+ */ + if (raw_smp_processor_id() != orig_cpu) { + cpumask_t mask = cpumask_of_cpu(orig_cpu), + saved_mask = current->cpus_allowed; + + set_cpus_allowed(current, mask); + set_cpus_allowed(current, saved_mask); + } +} + #ifdef CONFIG_NUMA /** @@ -5791,6 +6574,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) static int build_sched_domains(const cpumask_t *cpu_map) { int i; + struct sched_domain *sd; #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; int sd_allnodes = 0; @@ -5798,7 +6582,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) /* * Allocate the per-node list of sched groups */ - sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES, + sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, GFP_KERNEL); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); @@ -5817,8 +6601,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) cpus_and(nodemask, nodemask, *cpu_map); #ifdef CONFIG_NUMA - if (cpus_weight(*cpu_map) > - SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { + if (cpus_weight(*cpu_map) + > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { sd = &per_cpu(allnodes_domains, i); *sd = SD_ALLNODES_INIT; sd->span = *cpu_map; @@ -5877,8 +6661,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) if (i != first_cpu(this_sibling_map)) continue; - init_sched_build_groups(this_sibling_map, cpu_map, - &cpu_to_cpu_group); + init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); } #endif @@ -5889,11 +6672,11 @@ static int build_sched_domains(const cpumask_t *cpu_map) cpus_and(this_core_map, this_core_map, *cpu_map); if (i != first_cpu(this_core_map)) continue; - init_sched_build_groups(this_core_map, cpu_map, - &cpu_to_core_group); + init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); } #endif + /* Set up physical groups */ for (i = 0; i < MAX_NUMNODES; i++) { cpumask_t nodemask = node_to_cpumask(i); @@ -5908,8 +6691,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) #ifdef CONFIG_NUMA /* Set up node groups */ if (sd_allnodes) - init_sched_build_groups(*cpu_map, cpu_map, - &cpu_to_allnodes_group); + init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); for (i = 0; i < MAX_NUMNODES; i++) { /* Set up node groups */ @@ -5937,7 +6719,6 @@ static int build_sched_domains(const cpumask_t *cpu_map) sched_group_nodes[i] = sg; for_each_cpu_mask(j, nodemask) { struct sched_domain *sd; - sd = &per_cpu(node_domains, j); sd->groups = sg; } @@ -5982,22 +6763,19 @@ static int build_sched_domains(const cpumask_t *cpu_map) /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT for_each_cpu_mask(i, *cpu_map) { - struct sched_domain *sd = &per_cpu(cpu_domains, i); - + sd = &per_cpu(cpu_domains, i); init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC for_each_cpu_mask(i, *cpu_map) { - struct sched_domain *sd = &per_cpu(core_domains, i); - + sd = &per_cpu(core_domains, i); init_sched_groups_power(i, sd); } #endif for_each_cpu_mask(i, *cpu_map) { - struct sched_domain *sd = &per_cpu(phys_domains, i); - + sd = &per_cpu(phys_domains, i); init_sched_groups_power(i, sd); } @@ -6025,6 +6803,10 @@ static int build_sched_domains(const cpumask_t *cpu_map) #endif cpu_attach_domain(sd, i); } + /* + * Tune cache-hot values: + */ + calibrate_migration_costs(cpu_map); return 0; @@ -6231,12 +7013,10 @@ void __init sched_init_smp(void) /* Move init over to a non-isolated CPU */ if (set_cpus_allowed(current, 
non_isolated_cpus) < 0) BUG(); - sched_init_granularity(); } #else void __init sched_init_smp(void) { - sched_init_granularity(); } #endif /* CONFIG_SMP */ @@ -6250,51 +7030,28 @@ int in_sched_functions(unsigned long addr) && addr < (unsigned long)__sched_text_end); } -static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) -{ - cfs_rq->tasks_timeline = RB_ROOT; - cfs_rq->fair_clock = 1; -#ifdef CONFIG_FAIR_GROUP_SCHED - cfs_rq->rq = rq; -#endif -} - void __init sched_init(void) { - u64 now = sched_clock(); + int i, j, k; int highest_cpu = 0; - int i, j; - - /* - * Link up the scheduling class hierarchy: - */ - rt_sched_class.next = &fair_sched_class; - fair_sched_class.next = &idle_sched_class; - idle_sched_class.next = NULL; for_each_possible_cpu(i) { - struct rt_prio_array *array; + struct prio_array *array; struct rq *rq; rq = cpu_rq(i); spin_lock_init(&rq->lock); lockdep_set_class(&rq->lock, &rq->rq_lock_key); rq->nr_running = 0; - rq->clock = 1; - init_cfs_rq(&rq->cfs, rq); -#ifdef CONFIG_FAIR_GROUP_SCHED - INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); - list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); -#endif - rq->ls.load_update_last = now; - rq->ls.load_update_start = now; + rq->active = rq->arrays; + rq->expired = rq->arrays + 1; + rq->best_expired_prio = MAX_PRIO; - for (j = 0; j < CPU_LOAD_IDX_MAX; j++) - rq->cpu_load[j] = 0; #ifdef CONFIG_SMP rq->sd = NULL; + for (j = 1; j < 3; j++) + rq->cpu_load[j] = 0; rq->active_balance = 0; - rq->next_balance = jiffies; rq->push_cpu = 0; rq->cpu = i; rq->migration_thread = NULL; @@ -6302,14 +7059,16 @@ void __init sched_init(void) #endif atomic_set(&rq->nr_iowait, 0); - array = &rq->rt.active; - for (j = 0; j < MAX_RT_PRIO; j++) { - INIT_LIST_HEAD(array->queue + j); - __clear_bit(j, array->bitmap); + for (j = 0; j < 2; j++) { + array = rq->arrays + j; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); } highest_cpu = i; - /* delimiter for bitsearch: */ - __set_bit(MAX_RT_PRIO, array->bitmap); } set_load_weight(&init_task); @@ -6336,10 +7095,6 @@ void __init sched_init(void) * when this runqueue becomes "idle". 
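/*
 * Illustrative aside (user-space model, not from this patch): the restored
 * sched_init() sets bit MAX_PRIO in each array's bitmap as a delimiter, so a
 * find-first-bit over MAX_PRIO+1 bits on an empty array yields MAX_PRIO
 * instead of scanning past the end.  The snippet below shows that effect with
 * a plain word array and __builtin_ctzl().
 */
#include <stdio.h>

#define MAX_PRIO	140
#define BITS		(sizeof(unsigned long) * 8)
#define WORDS		((MAX_PRIO + 1 + BITS - 1) / BITS)

static int find_first_bit_model(const unsigned long *map)
{
	unsigned int w;

	for (w = 0; w < WORDS; w++)
		if (map[w])
			return w * BITS + __builtin_ctzl(map[w]);
	return MAX_PRIO + 1;	/* not reached once the delimiter is set */
}

int main(void)
{
	unsigned long bitmap[WORDS] = { 0 };

	bitmap[MAX_PRIO / BITS] |= 1UL << (MAX_PRIO % BITS);	/* delimiter */
	printf("empty array -> first bit %d (== MAX_PRIO)\n",
	       find_first_bit_model(bitmap));

	bitmap[120 / BITS] |= 1UL << (120 % BITS);		/* a nice-0 task */
	printf("with a task -> first bit %d\n", find_first_bit_model(bitmap));
	return 0;
}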
*/ init_idle(current, smp_processor_id()); - /* - * During early bootup we pretend to be a normal task: - */ - current->sched_class = &fair_sched_class; } #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP @@ -6370,55 +7125,29 @@ EXPORT_SYMBOL(__might_sleep); #ifdef CONFIG_MAGIC_SYSRQ void normalize_rt_tasks(void) { + struct prio_array *array; struct task_struct *g, *p; unsigned long flags; struct rq *rq; - int on_rq; read_lock_irq(&tasklist_lock); + do_each_thread(g, p) { - p->se.fair_key = 0; - p->se.wait_runtime = 0; - p->se.wait_start_fair = 0; - p->se.wait_start = 0; - p->se.exec_start = 0; - p->se.sleep_start = 0; - p->se.sleep_start_fair = 0; - p->se.block_start = 0; - task_rq(p)->cfs.fair_clock = 0; - task_rq(p)->clock = 0; - - if (!rt_task(p)) { - /* - * Renice negative nice level userspace - * tasks back to 0: - */ - if (TASK_NICE(p) < 0 && p->mm) - set_user_nice(p, 0); + if (!rt_task(p)) continue; - } spin_lock_irqsave(&p->pi_lock, flags); rq = __task_rq_lock(p); -#ifdef CONFIG_SMP - /* - * Do not touch the migration thread: - */ - if (p == rq->migration_thread) - goto out_unlock; -#endif - on_rq = p->se.on_rq; - if (on_rq) - deactivate_task(task_rq(p), p, 0); - __setscheduler(rq, p, SCHED_NORMAL, 0); - if (on_rq) { - activate_task(task_rq(p), p, 0); + array = p->array; + if (array) + deactivate_task(p, task_rq(p)); + __setscheduler(p, SCHED_NORMAL, 0); + if (array) { + __activate_task(p, task_rq(p)); resched_task(rq->curr); } -#ifdef CONFIG_SMP - out_unlock: -#endif + __task_rq_unlock(rq); spin_unlock_irqrestore(&p->pi_lock, flags); } while_each_thread(g, p); diff --git a/trunk/kernel/sched_debug.c b/trunk/kernel/sched_debug.c deleted file mode 100644 index 1baf87cceb7c..000000000000 --- a/trunk/kernel/sched_debug.c +++ /dev/null @@ -1,275 +0,0 @@ -/* - * kernel/time/sched_debug.c - * - * Print the CFS rbtree - * - * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include -#include - -/* - * This allows printing both to /proc/sched_debug and - * to the console - */ -#define SEQ_printf(m, x...) 
\ - do { \ - if (m) \ - seq_printf(m, x); \ - else \ - printk(x); \ - } while (0) - -static void -print_task(struct seq_file *m, struct rq *rq, struct task_struct *p, u64 now) -{ - if (rq->curr == p) - SEQ_printf(m, "R"); - else - SEQ_printf(m, " "); - - SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d " - "%15Ld %15Ld %15Ld %15Ld %15Ld\n", - p->comm, p->pid, - (long long)p->se.fair_key, - (long long)(p->se.fair_key - rq->cfs.fair_clock), - (long long)p->se.wait_runtime, - (long long)(p->nvcsw + p->nivcsw), - p->prio, - (long long)p->se.sum_exec_runtime, - (long long)p->se.sum_wait_runtime, - (long long)p->se.sum_sleep_runtime, - (long long)p->se.wait_runtime_overruns, - (long long)p->se.wait_runtime_underruns); -} - -static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu, u64 now) -{ - struct task_struct *g, *p; - - SEQ_printf(m, - "\nrunnable tasks:\n" - " task PID tree-key delta waiting" - " switches prio" - " sum-exec sum-wait sum-sleep" - " wait-overrun wait-underrun\n" - "------------------------------------------------------------------" - "----------------" - "------------------------------------------------" - "--------------------------------\n"); - - read_lock_irq(&tasklist_lock); - - do_each_thread(g, p) { - if (!p->se.on_rq || task_cpu(p) != rq_cpu) - continue; - - print_task(m, rq, p, now); - } while_each_thread(g, p); - - read_unlock_irq(&tasklist_lock); -} - -static void -print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) -{ - s64 wait_runtime_rq_sum = 0; - struct task_struct *p; - struct rb_node *curr; - unsigned long flags; - struct rq *rq = &per_cpu(runqueues, cpu); - - spin_lock_irqsave(&rq->lock, flags); - curr = first_fair(cfs_rq); - while (curr) { - p = rb_entry(curr, struct task_struct, se.run_node); - wait_runtime_rq_sum += p->se.wait_runtime; - - curr = rb_next(curr); - } - spin_unlock_irqrestore(&rq->lock, flags); - - SEQ_printf(m, " .%-30s: %Ld\n", "wait_runtime_rq_sum", - (long long)wait_runtime_rq_sum); -} - -void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now) -{ - SEQ_printf(m, "\ncfs_rq %p\n", cfs_rq); - -#define P(x) \ - SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x)) - - P(fair_clock); - P(exec_clock); - P(wait_runtime); - P(wait_runtime_overruns); - P(wait_runtime_underruns); - P(sleeper_bonus); -#undef P - - print_cfs_rq_runtime_sum(m, cpu, cfs_rq); -} - -static void print_cpu(struct seq_file *m, int cpu, u64 now) -{ - struct rq *rq = &per_cpu(runqueues, cpu); - -#ifdef CONFIG_X86 - { - unsigned int freq = cpu_khz ? 
: 1; - - SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n", - cpu, freq / 1000, (freq % 1000)); - } -#else - SEQ_printf(m, "\ncpu#%d\n", cpu); -#endif - -#define P(x) \ - SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) - - P(nr_running); - SEQ_printf(m, " .%-30s: %lu\n", "load", - rq->ls.load.weight); - P(ls.delta_fair); - P(ls.delta_exec); - P(nr_switches); - P(nr_load_updates); - P(nr_uninterruptible); - SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies); - P(next_balance); - P(curr->pid); - P(clock); - P(prev_clock_raw); - P(clock_warps); - P(clock_overflows); - P(clock_unstable_events); - P(clock_max_delta); - P(cpu_load[0]); - P(cpu_load[1]); - P(cpu_load[2]); - P(cpu_load[3]); - P(cpu_load[4]); -#undef P - - print_cfs_stats(m, cpu, now); - - print_rq(m, rq, cpu, now); -} - -static int sched_debug_show(struct seq_file *m, void *v) -{ - u64 now = ktime_to_ns(ktime_get()); - int cpu; - - SEQ_printf(m, "Sched Debug Version: v0.04, cfs-v20, %s %.*s\n", - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - - SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now); - - for_each_online_cpu(cpu) - print_cpu(m, cpu, now); - - SEQ_printf(m, "\n"); - - return 0; -} - -void sysrq_sched_debug_show(void) -{ - sched_debug_show(NULL, NULL); -} - -static int sched_debug_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, sched_debug_show, NULL); -} - -static struct file_operations sched_debug_fops = { - .open = sched_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init init_sched_debug_procfs(void) -{ - struct proc_dir_entry *pe; - - pe = create_proc_entry("sched_debug", 0644, NULL); - if (!pe) - return -ENOMEM; - - pe->proc_fops = &sched_debug_fops; - - return 0; -} - -__initcall(init_sched_debug_procfs); - -void proc_sched_show_task(struct task_struct *p, struct seq_file *m) -{ - unsigned long flags; - int num_threads = 1; - - rcu_read_lock(); - if (lock_task_sighand(p, &flags)) { - num_threads = atomic_read(&p->signal->count); - unlock_task_sighand(p, &flags); - } - rcu_read_unlock(); - - SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); - SEQ_printf(m, "----------------------------------------------\n"); -#define P(F) \ - SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F) - - P(se.wait_start); - P(se.wait_start_fair); - P(se.exec_start); - P(se.sleep_start); - P(se.sleep_start_fair); - P(se.block_start); - P(se.sleep_max); - P(se.block_max); - P(se.exec_max); - P(se.wait_max); - P(se.wait_runtime); - P(se.wait_runtime_overruns); - P(se.wait_runtime_underruns); - P(se.sum_wait_runtime); - P(se.sum_exec_runtime); - SEQ_printf(m, "%-25s:%20Ld\n", - "nr_switches", (long long)(p->nvcsw + p->nivcsw)); - P(se.load.weight); - P(policy); - P(prio); -#undef P - - { - u64 t0, t1; - - t0 = sched_clock(); - t1 = sched_clock(); - SEQ_printf(m, "%-25s:%20Ld\n", - "clock-delta", (long long)(t1-t0)); - } -} - -void proc_sched_set_task(struct task_struct *p) -{ - p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0; - p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0; - p->se.sum_exec_runtime = 0; -} diff --git a/trunk/kernel/sched_fair.c b/trunk/kernel/sched_fair.c deleted file mode 100644 index 6971db0a7160..000000000000 --- a/trunk/kernel/sched_fair.c +++ /dev/null @@ -1,1131 +0,0 @@ -/* - * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH) - * - * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar - * - * Interactivity 
improvements by Mike Galbraith - * (C) 2007 Mike Galbraith - * - * Various enhancements by Dmitry Adamushko. - * (C) 2007 Dmitry Adamushko - * - * Group scheduling enhancements by Srivatsa Vaddagiri - * Copyright IBM Corporation, 2007 - * Author: Srivatsa Vaddagiri - * - * Scaled math optimizations by Thomas Gleixner - * Copyright (C) 2007, Thomas Gleixner - */ - -/* - * Preemption granularity: - * (default: 2 msec, units: nanoseconds) - * - * NOTE: this granularity value is not the same as the concept of - * 'timeslice length' - timeslices in CFS will typically be somewhat - * larger than this value. (to see the precise effective timeslice - * length of your workload, run vmstat and monitor the context-switches - * field) - * - * On SMP systems the value of this is multiplied by the log2 of the - * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way - * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) - */ -unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ; - -/* - * SCHED_BATCH wake-up granularity. - * (default: 10 msec, units: nanoseconds) - * - * This option delays the preemption effects of decoupled workloads - * and reduces their over-scheduling. Synchronous workloads will still - * have immediate wakeup/sleep latencies. - */ -unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = - 10000000000ULL/HZ; - -/* - * SCHED_OTHER wake-up granularity. - * (default: 1 msec, units: nanoseconds) - * - * This option delays the preemption effects of decoupled workloads - * and reduces their over-scheduling. Synchronous workloads will still - * have immediate wakeup/sleep latencies. - */ -unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000000ULL/HZ; - -unsigned int sysctl_sched_stat_granularity __read_mostly; - -/* - * Initialized in sched_init_granularity(): - */ -unsigned int sysctl_sched_runtime_limit __read_mostly; - -/* - * Debugging: various feature bits - */ -enum { - SCHED_FEAT_FAIR_SLEEPERS = 1, - SCHED_FEAT_SLEEPER_AVG = 2, - SCHED_FEAT_SLEEPER_LOAD_AVG = 4, - SCHED_FEAT_PRECISE_CPU_LOAD = 8, - SCHED_FEAT_START_DEBIT = 16, - SCHED_FEAT_SKIP_INITIAL = 32, -}; - -unsigned int sysctl_sched_features __read_mostly = - SCHED_FEAT_FAIR_SLEEPERS *1 | - SCHED_FEAT_SLEEPER_AVG *1 | - SCHED_FEAT_SLEEPER_LOAD_AVG *1 | - SCHED_FEAT_PRECISE_CPU_LOAD *1 | - SCHED_FEAT_START_DEBIT *1 | - SCHED_FEAT_SKIP_INITIAL *0; - -extern struct sched_class fair_sched_class; - -/************************************************************** - * CFS operations on generic schedulable entities: - */ - -#ifdef CONFIG_FAIR_GROUP_SCHED - -/* cpu runqueue to which this cfs_rq is attached */ -static inline struct rq *rq_of(struct cfs_rq *cfs_rq) -{ - return cfs_rq->rq; -} - -/* currently running entity (if any) on this cfs_rq */ -static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) -{ - return cfs_rq->curr; -} - -/* An entity is a task if it doesn't "own" a runqueue */ -#define entity_is_task(se) (!se->my_q) - -static inline void -set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - cfs_rq->curr = se; -} - -#else /* CONFIG_FAIR_GROUP_SCHED */ - -static inline struct rq *rq_of(struct cfs_rq *cfs_rq) -{ - return container_of(cfs_rq, struct rq, cfs); -} - -static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) -{ - struct rq *rq = rq_of(cfs_rq); - - if (unlikely(rq->curr->sched_class != &fair_sched_class)) - return NULL; - - return &rq->curr->se; -} - -#define entity_is_task(se) 1 - -static inline void 
-set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { } - -#endif /* CONFIG_FAIR_GROUP_SCHED */ - -static inline struct task_struct *task_of(struct sched_entity *se) -{ - return container_of(se, struct task_struct, se); -} - - -/************************************************************** - * Scheduling class tree data structure manipulation methods: - */ - -/* - * Enqueue an entity into the rb-tree: - */ -static inline void -__enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; - struct rb_node *parent = NULL; - struct sched_entity *entry; - s64 key = se->fair_key; - int leftmost = 1; - - /* - * Find the right place in the rbtree: - */ - while (*link) { - parent = *link; - entry = rb_entry(parent, struct sched_entity, run_node); - /* - * We dont care about collisions. Nodes with - * the same key stay together. - */ - if (key - entry->fair_key < 0) { - link = &parent->rb_left; - } else { - link = &parent->rb_right; - leftmost = 0; - } - } - - /* - * Maintain a cache of leftmost tree entries (it is frequently - * used): - */ - if (leftmost) - cfs_rq->rb_leftmost = &se->run_node; - - rb_link_node(&se->run_node, parent, link); - rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); - update_load_add(&cfs_rq->load, se->load.weight); - cfs_rq->nr_running++; - se->on_rq = 1; -} - -static inline void -__dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - if (cfs_rq->rb_leftmost == &se->run_node) - cfs_rq->rb_leftmost = rb_next(&se->run_node); - rb_erase(&se->run_node, &cfs_rq->tasks_timeline); - update_load_sub(&cfs_rq->load, se->load.weight); - cfs_rq->nr_running--; - se->on_rq = 0; -} - -static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) -{ - return cfs_rq->rb_leftmost; -} - -static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) -{ - return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); -} - -/************************************************************** - * Scheduling class statistics methods: - */ - -/* - * We rescale the rescheduling granularity of tasks according to their - * nice level, but only linearly, not exponentially: - */ -static long -niced_granularity(struct sched_entity *curr, unsigned long granularity) -{ - u64 tmp; - - /* - * Negative nice levels get the same granularity as nice-0: - */ - if (likely(curr->load.weight >= NICE_0_LOAD)) - return granularity; - /* - * Positive nice level tasks get linearly finer - * granularity: - */ - tmp = curr->load.weight * (u64)granularity; - - /* - * It will always fit into 'long': - */ - return (long) (tmp >> NICE_0_SHIFT); -} - -static inline void -limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - long limit = sysctl_sched_runtime_limit; - - /* - * Niced tasks have the same history dynamic range as - * non-niced tasks: - */ - if (unlikely(se->wait_runtime > limit)) { - se->wait_runtime = limit; - schedstat_inc(se, wait_runtime_overruns); - schedstat_inc(cfs_rq, wait_runtime_overruns); - } - if (unlikely(se->wait_runtime < -limit)) { - se->wait_runtime = -limit; - schedstat_inc(se, wait_runtime_underruns); - schedstat_inc(cfs_rq, wait_runtime_underruns); - } -} - -static inline void -__add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) -{ - se->wait_runtime += delta; - schedstat_add(se, sum_wait_runtime, delta); - limit_wait_runtime(cfs_rq, se); -} - -static void -add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) -{ - 
schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); - __add_wait_runtime(cfs_rq, se, delta); - schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); -} - -/* - * Update the current task's runtime statistics. Skip current tasks that - * are not in our scheduling class. - */ -static inline void -__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now) -{ - unsigned long delta, delta_exec, delta_fair; - long delta_mine; - struct load_weight *lw = &cfs_rq->load; - unsigned long load = lw->weight; - - if (unlikely(!load)) - return; - - delta_exec = curr->delta_exec; -#ifdef CONFIG_SCHEDSTATS - if (unlikely(delta_exec > curr->exec_max)) - curr->exec_max = delta_exec; -#endif - - curr->sum_exec_runtime += delta_exec; - cfs_rq->exec_clock += delta_exec; - - delta_fair = calc_delta_fair(delta_exec, lw); - delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); - - if (cfs_rq->sleeper_bonus > sysctl_sched_stat_granularity) { - delta = calc_delta_mine(cfs_rq->sleeper_bonus, - curr->load.weight, lw); - if (unlikely(delta > cfs_rq->sleeper_bonus)) - delta = cfs_rq->sleeper_bonus; - - cfs_rq->sleeper_bonus -= delta; - delta_mine -= delta; - } - - cfs_rq->fair_clock += delta_fair; - /* - * We executed delta_exec amount of time on the CPU, - * but we were only entitled to delta_mine amount of - * time during that period (if nr_running == 1 then - * the two values are equal) - * [Note: delta_mine - delta_exec is negative]: - */ - add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); -} - -static void update_curr(struct cfs_rq *cfs_rq, u64 now) -{ - struct sched_entity *curr = cfs_rq_curr(cfs_rq); - unsigned long delta_exec; - - if (unlikely(!curr)) - return; - - /* - * Get the amount of time the current task was running - * since the last time we changed load (this cannot - * overflow on 32 bits): - */ - delta_exec = (unsigned long)(now - curr->exec_start); - - curr->delta_exec += delta_exec; - - if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) { - __update_curr(cfs_rq, curr, now); - curr->delta_exec = 0; - } - curr->exec_start = now; -} - -static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) -{ - se->wait_start_fair = cfs_rq->fair_clock; - se->wait_start = now; -} - -/* - * We calculate fair deltas here, so protect against the random effects - * of a multiplication overflow by capping it to the runtime limit: - */ -#if BITS_PER_LONG == 32 -static inline unsigned long -calc_weighted(unsigned long delta, unsigned long weight, int shift) -{ - u64 tmp = (u64)delta * weight >> shift; - - if (unlikely(tmp > sysctl_sched_runtime_limit*2)) - return sysctl_sched_runtime_limit*2; - return tmp; -} -#else -static inline unsigned long -calc_weighted(unsigned long delta, unsigned long weight, int shift) -{ - return delta * weight >> shift; -} -#endif - -/* - * Task is being enqueued - update stats: - */ -static void -update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) -{ - s64 key; - - /* - * Are we enqueueing a waiting task? 
(for current tasks - * a dequeue/enqueue event is a NOP) - */ - if (se != cfs_rq_curr(cfs_rq)) - update_stats_wait_start(cfs_rq, se, now); - /* - * Update the key: - */ - key = cfs_rq->fair_clock; - - /* - * Optimize the common nice 0 case: - */ - if (likely(se->load.weight == NICE_0_LOAD)) { - key -= se->wait_runtime; - } else { - u64 tmp; - - if (se->wait_runtime < 0) { - tmp = -se->wait_runtime; - key += (tmp * se->load.inv_weight) >> - (WMULT_SHIFT - NICE_0_SHIFT); - } else { - tmp = se->wait_runtime; - key -= (tmp * se->load.weight) >> NICE_0_SHIFT; - } - } - - se->fair_key = key; -} - -/* - * Note: must be called with a freshly updated rq->fair_clock. - */ -static inline void -__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) -{ - unsigned long delta_fair = se->delta_fair_run; - -#ifdef CONFIG_SCHEDSTATS - { - s64 delta_wait = now - se->wait_start; - if (unlikely(delta_wait > se->wait_max)) - se->wait_max = delta_wait; - } -#endif - - if (unlikely(se->load.weight != NICE_0_LOAD)) - delta_fair = calc_weighted(delta_fair, se->load.weight, - NICE_0_SHIFT); - - add_wait_runtime(cfs_rq, se, delta_fair); -} - -static void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) -{ - unsigned long delta_fair; - - delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), - (u64)(cfs_rq->fair_clock - se->wait_start_fair)); - - se->delta_fair_run += delta_fair; - if (unlikely(abs(se->delta_fair_run) >= - sysctl_sched_stat_granularity)) { - __update_stats_wait_end(cfs_rq, se, now); - se->delta_fair_run = 0; - } - - se->wait_start_fair = 0; - se->wait_start = 0; -} - -static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) -{ - update_curr(cfs_rq, now); - /* - * Mark the end of the wait period if dequeueing a - * waiting task: - */ - if (se != cfs_rq_curr(cfs_rq)) - update_stats_wait_end(cfs_rq, se, now); -} - -/* - * We are picking a new current task - update its stats: - */ -static inline void -update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) -{ - /* - * We are starting a new run period: - */ - se->exec_start = now; -} - -/* - * We are descheduling a task - update its stats: - */ -static inline void -update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) -{ - se->exec_start = 0; -} - -/************************************************** - * Scheduling class queueing methods: - */ - -static void -__enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) -{ - unsigned long load = cfs_rq->load.weight, delta_fair; - long prev_runtime; - - if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) - load = rq_of(cfs_rq)->cpu_load[2]; - - delta_fair = se->delta_fair_sleep; - - /* - * Fix up delta_fair with the effect of us running - * during the whole sleep period: - */ - if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG) - delta_fair = div64_likely32((u64)delta_fair * load, - load + se->load.weight); - - if (unlikely(se->load.weight != NICE_0_LOAD)) - delta_fair = calc_weighted(delta_fair, se->load.weight, - NICE_0_SHIFT); - - prev_runtime = se->wait_runtime; - __add_wait_runtime(cfs_rq, se, delta_fair); - delta_fair = se->wait_runtime - prev_runtime; - - /* - * Track the amount of bonus we've given to sleepers: - */ - cfs_rq->sleeper_bonus += delta_fair; - - schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); -} - -static void -enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) -{ - struct 
task_struct *tsk = task_of(se); - unsigned long delta_fair; - - if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) || - !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS)) - return; - - delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), - (u64)(cfs_rq->fair_clock - se->sleep_start_fair)); - - se->delta_fair_sleep += delta_fair; - if (unlikely(abs(se->delta_fair_sleep) >= - sysctl_sched_stat_granularity)) { - __enqueue_sleeper(cfs_rq, se, now); - se->delta_fair_sleep = 0; - } - - se->sleep_start_fair = 0; - -#ifdef CONFIG_SCHEDSTATS - if (se->sleep_start) { - u64 delta = now - se->sleep_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > se->sleep_max)) - se->sleep_max = delta; - - se->sleep_start = 0; - se->sum_sleep_runtime += delta; - } - if (se->block_start) { - u64 delta = now - se->block_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > se->block_max)) - se->block_max = delta; - - se->block_start = 0; - se->sum_sleep_runtime += delta; - } -#endif -} - -static void -enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - int wakeup, u64 now) -{ - /* - * Update the fair clock. - */ - update_curr(cfs_rq, now); - - if (wakeup) - enqueue_sleeper(cfs_rq, se, now); - - update_stats_enqueue(cfs_rq, se, now); - __enqueue_entity(cfs_rq, se); -} - -static void -dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - int sleep, u64 now) -{ - update_stats_dequeue(cfs_rq, se, now); - if (sleep) { - se->sleep_start_fair = cfs_rq->fair_clock; -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - struct task_struct *tsk = task_of(se); - - if (tsk->state & TASK_INTERRUPTIBLE) - se->sleep_start = now; - if (tsk->state & TASK_UNINTERRUPTIBLE) - se->block_start = now; - } - cfs_rq->wait_runtime -= se->wait_runtime; -#endif - } - __dequeue_entity(cfs_rq, se); -} - -/* - * Preempt the current task with a newly woken task if needed: - */ -static void -__check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, - struct sched_entity *curr, unsigned long granularity) -{ - s64 __delta = curr->fair_key - se->fair_key; - - /* - * Take scheduling granularity into account - do not - * preempt the current task unless the best task has - * a larger than sched_granularity fairness advantage: - */ - if (__delta > niced_granularity(curr, granularity)) - resched_task(rq_of(cfs_rq)->curr); -} - -static inline void -set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) -{ - /* - * Any task has to be enqueued before it get to execute on - * a CPU. So account for the time it spent waiting on the - * runqueue. 
(note, here we rely on pick_next_task() having - * done a put_prev_task_fair() shortly before this, which - * updated rq->fair_clock - used by update_stats_wait_end()) - */ - update_stats_wait_end(cfs_rq, se, now); - update_stats_curr_start(cfs_rq, se, now); - set_cfs_rq_curr(cfs_rq, se); -} - -static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq, u64 now) -{ - struct sched_entity *se = __pick_next_entity(cfs_rq); - - set_next_entity(cfs_rq, se, now); - - return se; -} - -static void -put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev, u64 now) -{ - /* - * If still on the runqueue then deactivate_task() - * was not called and update_curr() has to be done: - */ - if (prev->on_rq) - update_curr(cfs_rq, now); - - update_stats_curr_end(cfs_rq, prev, now); - - if (prev->on_rq) - update_stats_wait_start(cfs_rq, prev, now); - set_cfs_rq_curr(cfs_rq, NULL); -} - -static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) -{ - struct rq *rq = rq_of(cfs_rq); - struct sched_entity *next; - u64 now = __rq_clock(rq); - - /* - * Dequeue and enqueue the task to update its - * position within the tree: - */ - dequeue_entity(cfs_rq, curr, 0, now); - enqueue_entity(cfs_rq, curr, 0, now); - - /* - * Reschedule if another task tops the current one. - */ - next = __pick_next_entity(cfs_rq); - if (next == curr) - return; - - __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity); -} - -/************************************************** - * CFS operations on tasks: - */ - -#ifdef CONFIG_FAIR_GROUP_SCHED - -/* Walk up scheduling entities hierarchy */ -#define for_each_sched_entity(se) \ - for (; se; se = se->parent) - -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return p->se.cfs_rq; -} - -/* runqueue on which this entity is (to be) queued */ -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - return se->cfs_rq; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return grp->my_q; -} - -/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on - * another cpu ('this_cpu') - */ -static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) -{ - /* A later patch will take group into account */ - return &cpu_rq(this_cpu)->cfs; -} - -/* Iterate thr' all leaf cfs_rq's on a runqueue */ -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) - -/* Do the two (enqueued) tasks belong to the same group ? 
*/ -static inline int is_same_group(struct task_struct *curr, struct task_struct *p) -{ - if (curr->se.cfs_rq == p->se.cfs_rq) - return 1; - - return 0; -} - -#else /* CONFIG_FAIR_GROUP_SCHED */ - -#define for_each_sched_entity(se) \ - for (; se; se = NULL) - -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return &task_rq(p)->cfs; -} - -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - struct task_struct *p = task_of(se); - struct rq *rq = task_rq(p); - - return &rq->cfs; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return NULL; -} - -static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) -{ - return &cpu_rq(this_cpu)->cfs; -} - -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) - -static inline int is_same_group(struct task_struct *curr, struct task_struct *p) -{ - return 1; -} - -#endif /* CONFIG_FAIR_GROUP_SCHED */ - -/* - * The enqueue_task method is called before nr_running is - * increased. Here we update the fair scheduling stats and - * then put the task into the rbtree: - */ -static void -enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now) -{ - struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se; - - for_each_sched_entity(se) { - if (se->on_rq) - break; - cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, wakeup, now); - } -} - -/* - * The dequeue_task method is called before nr_running is - * decreased. We remove the task from the rbtree and - * update the fair scheduling stats: - */ -static void -dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now) -{ - struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se; - - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - dequeue_entity(cfs_rq, se, sleep, now); - /* Don't dequeue parent if it has other entities besides us */ - if (cfs_rq->load.weight) - break; - } -} - -/* - * sched_yield() support is very simple - we dequeue and enqueue - */ -static void yield_task_fair(struct rq *rq, struct task_struct *p) -{ - struct cfs_rq *cfs_rq = task_cfs_rq(p); - u64 now = __rq_clock(rq); - - /* - * Dequeue and enqueue the task to update its - * position within the tree: - */ - dequeue_entity(cfs_rq, &p->se, 0, now); - enqueue_entity(cfs_rq, &p->se, 0, now); -} - -/* - * Preempt the current task with a newly woken task if needed: - */ -static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) -{ - struct task_struct *curr = rq->curr; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); - unsigned long gran; - - if (unlikely(rt_prio(p->prio))) { - update_curr(cfs_rq, rq_clock(rq)); - resched_task(curr); - return; - } - - gran = sysctl_sched_wakeup_granularity; - /* - * Batch tasks prefer throughput over latency: - */ - if (unlikely(p->policy == SCHED_BATCH)) - gran = sysctl_sched_batch_wakeup_granularity; - - if (is_same_group(curr, p)) - __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran); -} - -static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now) -{ - struct cfs_rq *cfs_rq = &rq->cfs; - struct sched_entity *se; - - if (unlikely(!cfs_rq->nr_running)) - return NULL; - - do { - se = pick_next_entity(cfs_rq, now); - cfs_rq = group_cfs_rq(se); - } while (cfs_rq); - - return task_of(se); -} - -/* - * Account for a descheduled task: - */ -static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 now) -{ - struct sched_entity *se = &prev->se; - struct cfs_rq *cfs_rq; 
- - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - put_prev_entity(cfs_rq, se, now); - } -} - -/************************************************** - * Fair scheduling class load-balancing methods: - */ - -/* - * Load-balancing iterator. Note: while the runqueue stays locked - * during the whole iteration, the current task might be - * dequeued so the iterator has to be dequeue-safe. Here we - * achieve that by always pre-iterating before returning - * the current task: - */ -static inline struct task_struct * -__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) -{ - struct task_struct *p; - - if (!curr) - return NULL; - - p = rb_entry(curr, struct task_struct, se.run_node); - cfs_rq->rb_load_balance_curr = rb_next(curr); - - return p; -} - -static struct task_struct *load_balance_start_fair(void *arg) -{ - struct cfs_rq *cfs_rq = arg; - - return __load_balance_iterator(cfs_rq, first_fair(cfs_rq)); -} - -static struct task_struct *load_balance_next_fair(void *arg) -{ - struct cfs_rq *cfs_rq = arg; - - return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); -} - -static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) -{ - struct sched_entity *curr; - struct task_struct *p; - - if (!cfs_rq->nr_running) - return MAX_PRIO; - - curr = __pick_next_entity(cfs_rq); - p = task_of(curr); - - return p->prio; -} - -static int -load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, unsigned long *total_load_moved) -{ - struct cfs_rq *busy_cfs_rq; - unsigned long load_moved, total_nr_moved = 0, nr_moved; - long rem_load_move = max_load_move; - struct rq_iterator cfs_rq_iterator; - - cfs_rq_iterator.start = load_balance_start_fair; - cfs_rq_iterator.next = load_balance_next_fair; - - for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { - struct cfs_rq *this_cfs_rq; - long imbalance; - unsigned long maxload; - int this_best_prio, best_prio, best_prio_seen = 0; - - this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); - - imbalance = busy_cfs_rq->load.weight - - this_cfs_rq->load.weight; - /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ - if (imbalance <= 0) - continue; - - /* Don't pull more than imbalance/2 */ - imbalance /= 2; - maxload = min(rem_load_move, imbalance); - - this_best_prio = cfs_rq_best_prio(this_cfs_rq); - best_prio = cfs_rq_best_prio(busy_cfs_rq); - - /* - * Enable handling of the case where there is more than one task - * with the best priority. If the current running task is one - * of those with prio==best_prio we know it won't be moved - * and therefore it's safe to override the skip (based on load) - * of any task we find with that prio. 
- */ - if (cfs_rq_curr(busy_cfs_rq) == &busiest->curr->se) - best_prio_seen = 1; - - /* pass busy_cfs_rq argument into - * load_balance_[start|next]_fair iterators - */ - cfs_rq_iterator.arg = busy_cfs_rq; - nr_moved = balance_tasks(this_rq, this_cpu, busiest, - max_nr_move, maxload, sd, idle, all_pinned, - &load_moved, this_best_prio, best_prio, - best_prio_seen, &cfs_rq_iterator); - - total_nr_moved += nr_moved; - max_nr_move -= nr_moved; - rem_load_move -= load_moved; - - if (max_nr_move <= 0 || rem_load_move <= 0) - break; - } - - *total_load_moved = max_load_move - rem_load_move; - - return total_nr_moved; -} - -/* - * scheduler tick hitting a task of our scheduling class: - */ -static void task_tick_fair(struct rq *rq, struct task_struct *curr) -{ - struct cfs_rq *cfs_rq; - struct sched_entity *se = &curr->se; - - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - entity_tick(cfs_rq, se); - } -} - -/* - * Share the fairness runtime between parent and child, thus the - * total amount of pressure for CPU stays equal - new tasks - * get a chance to run but frequent forkers are not allowed to - * monopolize the CPU. Note: the parent runqueue is locked, - * the child is not running yet. - */ -static void task_new_fair(struct rq *rq, struct task_struct *p) -{ - struct cfs_rq *cfs_rq = task_cfs_rq(p); - struct sched_entity *se = &p->se; - u64 now = rq_clock(rq); - - sched_info_queued(p); - - update_stats_enqueue(cfs_rq, se, now); - /* - * Child runs first: we let it run before the parent - * until it reschedules once. We set up the key so that - * it will preempt the parent: - */ - p->se.fair_key = current->se.fair_key - - niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1; - /* - * The first wait is dominated by the child-runs-first logic, - * so do not credit it with that waiting time yet: - */ - if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL) - p->se.wait_start_fair = 0; - - /* - * The statistical average of wait_runtime is about - * -granularity/2, so initialize the task with that: - */ - if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) - p->se.wait_runtime = -(sysctl_sched_granularity / 2); - - __enqueue_entity(cfs_rq, se); - inc_nr_running(p, rq, now); -} - -#ifdef CONFIG_FAIR_GROUP_SCHED -/* Account for a task changing its policy or group. - * - * This routine is mostly called to set cfs_rq->curr field when a task - * migrates between groups/classes. 
- */ -static void set_curr_task_fair(struct rq *rq) -{ - struct task_struct *curr = rq->curr; - struct sched_entity *se = &curr->se; - u64 now = rq_clock(rq); - struct cfs_rq *cfs_rq; - - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - set_next_entity(cfs_rq, se, now); - } -} -#else -static void set_curr_task_fair(struct rq *rq) -{ -} -#endif - -/* - * All the scheduling class methods: - */ -struct sched_class fair_sched_class __read_mostly = { - .enqueue_task = enqueue_task_fair, - .dequeue_task = dequeue_task_fair, - .yield_task = yield_task_fair, - - .check_preempt_curr = check_preempt_curr_fair, - - .pick_next_task = pick_next_task_fair, - .put_prev_task = put_prev_task_fair, - - .load_balance = load_balance_fair, - - .set_curr_task = set_curr_task_fair, - .task_tick = task_tick_fair, - .task_new = task_new_fair, -}; - -#ifdef CONFIG_SCHED_DEBUG -void print_cfs_stats(struct seq_file *m, int cpu, u64 now) -{ - struct rq *rq = cpu_rq(cpu); - struct cfs_rq *cfs_rq; - - for_each_leaf_cfs_rq(rq, cfs_rq) - print_cfs_rq(m, cpu, cfs_rq, now); -} -#endif diff --git a/trunk/kernel/sched_idletask.c b/trunk/kernel/sched_idletask.c deleted file mode 100644 index 41841e741c4a..000000000000 --- a/trunk/kernel/sched_idletask.c +++ /dev/null @@ -1,71 +0,0 @@ -/* - * idle-task scheduling class. - * - * (NOTE: these are not related to SCHED_IDLE tasks which are - * handled in sched_fair.c) - */ - -/* - * Idle tasks are unconditionally rescheduled: - */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) -{ - resched_task(rq->idle); -} - -static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now) -{ - schedstat_inc(rq, sched_goidle); - - return rq->idle; -} - -/* - * It is not legal to sleep in the idle task - print a warning - * message if some code attempts to do it: - */ -static void -dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep, u64 now) -{ - spin_unlock_irq(&rq->lock); - printk(KERN_ERR "bad: scheduling from the idle thread!\n"); - dump_stack(); - spin_lock_irq(&rq->lock); -} - -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, u64 now) -{ -} - -static int -load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, unsigned long *total_load_moved) -{ - return 0; -} - -static void task_tick_idle(struct rq *rq, struct task_struct *curr) -{ -} - -/* - * Simple, special scheduling class for the per-CPU idle tasks: - */ -static struct sched_class idle_sched_class __read_mostly = { - /* no enqueue/yield_task for idle tasks */ - - /* dequeue is not valid, we print a debug message there: */ - .dequeue_task = dequeue_task_idle, - - .check_preempt_curr = check_preempt_curr_idle, - - .pick_next_task = pick_next_task_idle, - .put_prev_task = put_prev_task_idle, - - .load_balance = load_balance_idle, - - .task_tick = task_tick_idle, - /* no .task_new for idle tasks */ -}; diff --git a/trunk/kernel/sched_rt.c b/trunk/kernel/sched_rt.c deleted file mode 100644 index 1192a2741b99..000000000000 --- a/trunk/kernel/sched_rt.c +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR - * policies) - */ - -/* - * Update the current task's runtime statistics. Skip current tasks that - * are not in our scheduling class. 
- */ -static inline void update_curr_rt(struct rq *rq, u64 now) -{ - struct task_struct *curr = rq->curr; - u64 delta_exec; - - if (!task_has_rt_policy(curr)) - return; - - delta_exec = now - curr->se.exec_start; - if (unlikely((s64)delta_exec < 0)) - delta_exec = 0; - if (unlikely(delta_exec > curr->se.exec_max)) - curr->se.exec_max = delta_exec; - - curr->se.sum_exec_runtime += delta_exec; - curr->se.exec_start = now; -} - -static void -enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now) -{ - struct rt_prio_array *array = &rq->rt.active; - - list_add_tail(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); -} - -/* - * Adding/removing a task to/from a priority array: - */ -static void -dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep, u64 now) -{ - struct rt_prio_array *array = &rq->rt.active; - - update_curr_rt(rq, now); - - list_del(&p->run_list); - if (list_empty(array->queue + p->prio)) - __clear_bit(p->prio, array->bitmap); -} - -/* - * Put task to the end of the run list without the overhead of dequeue - * followed by enqueue. - */ -static void requeue_task_rt(struct rq *rq, struct task_struct *p) -{ - struct rt_prio_array *array = &rq->rt.active; - - list_move_tail(&p->run_list, array->queue + p->prio); -} - -static void -yield_task_rt(struct rq *rq, struct task_struct *p) -{ - requeue_task_rt(rq, p); -} - -/* - * Preempt the current task with a newly woken task if needed: - */ -static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) -{ - if (p->prio < rq->curr->prio) - resched_task(rq->curr); -} - -static struct task_struct *pick_next_task_rt(struct rq *rq, u64 now) -{ - struct rt_prio_array *array = &rq->rt.active; - struct task_struct *next; - struct list_head *queue; - int idx; - - idx = sched_find_first_bit(array->bitmap); - if (idx >= MAX_RT_PRIO) - return NULL; - - queue = array->queue + idx; - next = list_entry(queue->next, struct task_struct, run_list); - - next->se.exec_start = now; - - return next; -} - -static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now) -{ - update_curr_rt(rq, now); - p->se.exec_start = 0; -} - -/* - * Load-balancing iterator. Note: while the runqueue stays locked - * during the whole iteration, the current task might be - * dequeued so the iterator has to be dequeue-safe. 
Here we - * achieve that by always pre-iterating before returning - * the current task: - */ -static struct task_struct *load_balance_start_rt(void *arg) -{ - struct rq *rq = arg; - struct rt_prio_array *array = &rq->rt.active; - struct list_head *head, *curr; - struct task_struct *p; - int idx; - - idx = sched_find_first_bit(array->bitmap); - if (idx >= MAX_RT_PRIO) - return NULL; - - head = array->queue + idx; - curr = head->prev; - - p = list_entry(curr, struct task_struct, run_list); - - curr = curr->prev; - - rq->rt.rt_load_balance_idx = idx; - rq->rt.rt_load_balance_head = head; - rq->rt.rt_load_balance_curr = curr; - - return p; -} - -static struct task_struct *load_balance_next_rt(void *arg) -{ - struct rq *rq = arg; - struct rt_prio_array *array = &rq->rt.active; - struct list_head *head, *curr; - struct task_struct *p; - int idx; - - idx = rq->rt.rt_load_balance_idx; - head = rq->rt.rt_load_balance_head; - curr = rq->rt.rt_load_balance_curr; - - /* - * If we arrived back to the head again then - * iterate to the next queue (if any): - */ - if (unlikely(head == curr)) { - int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); - - if (next_idx >= MAX_RT_PRIO) - return NULL; - - idx = next_idx; - head = array->queue + idx; - curr = head->prev; - - rq->rt.rt_load_balance_idx = idx; - rq->rt.rt_load_balance_head = head; - } - - p = list_entry(curr, struct task_struct, run_list); - - curr = curr->prev; - - rq->rt.rt_load_balance_curr = curr; - - return p; -} - -static int -load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, unsigned long *load_moved) -{ - int this_best_prio, best_prio, best_prio_seen = 0; - int nr_moved; - struct rq_iterator rt_rq_iterator; - - best_prio = sched_find_first_bit(busiest->rt.active.bitmap); - this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap); - - /* - * Enable handling of the case where there is more than one task - * with the best priority. If the current running task is one - * of those with prio==best_prio we know it won't be moved - * and therefore it's safe to override the skip (based on load) - * of any task we find with that prio. - */ - if (busiest->curr->prio == best_prio) - best_prio_seen = 1; - - rt_rq_iterator.start = load_balance_start_rt; - rt_rq_iterator.next = load_balance_next_rt; - /* pass 'busiest' rq argument into - * load_balance_[start|next]_rt iterators - */ - rt_rq_iterator.arg = busiest; - - nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move, - max_load_move, sd, idle, all_pinned, load_moved, - this_best_prio, best_prio, best_prio_seen, - &rt_rq_iterator); - - return nr_moved; -} - -static void task_tick_rt(struct rq *rq, struct task_struct *p) -{ - /* - * RR tasks need a special form of timeslice management. - * FIFO tasks have no timeslices. 
- */ - if (p->policy != SCHED_RR) - return; - - if (--p->time_slice) - return; - - p->time_slice = static_prio_timeslice(p->static_prio); - set_tsk_need_resched(p); - - /* put it at the end of the queue: */ - requeue_task_rt(rq, p); -} - -/* - * No parent/child timeslice management necessary for RT tasks, - * just activate them: - */ -static void task_new_rt(struct rq *rq, struct task_struct *p) -{ - activate_task(rq, p, 1); -} - -static struct sched_class rt_sched_class __read_mostly = { - .enqueue_task = enqueue_task_rt, - .dequeue_task = dequeue_task_rt, - .yield_task = yield_task_rt, - - .check_preempt_curr = check_preempt_curr_rt, - - .pick_next_task = pick_next_task_rt, - .put_prev_task = put_prev_task_rt, - - .load_balance = load_balance_rt, - - .task_tick = task_tick_rt, - .task_new = task_new_rt, -}; diff --git a/trunk/kernel/sched_stats.h b/trunk/kernel/sched_stats.h deleted file mode 100644 index c63c38f6fa6e..000000000000 --- a/trunk/kernel/sched_stats.h +++ /dev/null @@ -1,235 +0,0 @@ - -#ifdef CONFIG_SCHEDSTATS -/* - * bump this up when changing the output format or the meaning of an existing - * format, so that tools can adapt (or abort) - */ -#define SCHEDSTAT_VERSION 14 - -static int show_schedstat(struct seq_file *seq, void *v) -{ - int cpu; - - seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); - seq_printf(seq, "timestamp %lu\n", jiffies); - for_each_online_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); -#ifdef CONFIG_SMP - struct sched_domain *sd; - int dcnt = 0; -#endif - - /* runqueue-specific stats */ - seq_printf(seq, - "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu", - cpu, rq->yld_both_empty, - rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, - rq->sched_switch, rq->sched_cnt, rq->sched_goidle, - rq->ttwu_cnt, rq->ttwu_local, - rq->rq_sched_info.cpu_time, - rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); - - seq_printf(seq, "\n"); - -#ifdef CONFIG_SMP - /* domain-specific stats */ - preempt_disable(); - for_each_domain(cpu, sd) { - enum cpu_idle_type itype; - char mask_str[NR_CPUS]; - - cpumask_scnprintf(mask_str, NR_CPUS, sd->span); - seq_printf(seq, "domain%d %s", dcnt++, mask_str); - for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; - itype++) { - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " - "%lu", - sd->lb_cnt[itype], - sd->lb_balanced[itype], - sd->lb_failed[itype], - sd->lb_imbalance[itype], - sd->lb_gained[itype], - sd->lb_hot_gained[itype], - sd->lb_nobusyq[itype], - sd->lb_nobusyg[itype]); - } - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" - " %lu %lu %lu\n", - sd->alb_cnt, sd->alb_failed, sd->alb_pushed, - sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, - sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, - sd->ttwu_wake_remote, sd->ttwu_move_affine, - sd->ttwu_move_balance); - } - preempt_enable(); -#endif - } - return 0; -} - -static int schedstat_open(struct inode *inode, struct file *file) -{ - unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); - char *buf = kmalloc(size, GFP_KERNEL); - struct seq_file *m; - int res; - - if (!buf) - return -ENOMEM; - res = single_open(file, show_schedstat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = size; - } else - kfree(buf); - return res; -} - -const struct file_operations proc_schedstat_operations = { - .open = schedstat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -/* - * Expects runqueue lock to be held for atomicity of update - */ -static inline void -rq_sched_info_arrive(struct rq *rq, unsigned long long 
delta) -{ - if (rq) { - rq->rq_sched_info.run_delay += delta; - rq->rq_sched_info.pcnt++; - } -} - -/* - * Expects runqueue lock to be held for atomicity of update - */ -static inline void -rq_sched_info_depart(struct rq *rq, unsigned long long delta) -{ - if (rq) - rq->rq_sched_info.cpu_time += delta; -} -# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) -# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) -#else /* !CONFIG_SCHEDSTATS */ -static inline void -rq_sched_info_arrive(struct rq *rq, unsigned long long delta) -{} -static inline void -rq_sched_info_depart(struct rq *rq, unsigned long long delta) -{} -# define schedstat_inc(rq, field) do { } while (0) -# define schedstat_add(rq, field, amt) do { } while (0) -#endif - -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) -/* - * Called when a process is dequeued from the active array and given - * the cpu. We should note that with the exception of interactive - * tasks, the expired queue will become the active queue after the active - * queue is empty, without explicitly dequeuing and requeuing tasks in the - * expired queue. (Interactive tasks may be requeued directly to the - * active queue, thus delaying tasks in the expired queue from running; - * see scheduler_tick()). - * - * This function is only called from sched_info_arrive(), rather than - * dequeue_task(). Even though a task may be queued and dequeued multiple - * times as it is shuffled about, we're really interested in knowing how - * long it was from the *first* time it was queued to the time that it - * finally hit a cpu. - */ -static inline void sched_info_dequeued(struct task_struct *t) -{ - t->sched_info.last_queued = 0; -} - -/* - * Called when a task finally hits the cpu. We can now calculate how - * long it was waiting to run. We also note when it began so that we - * can keep stats on how long its timeslice is. - */ -static void sched_info_arrive(struct task_struct *t) -{ - unsigned long long now = sched_clock(), delta = 0; - - if (t->sched_info.last_queued) - delta = now - t->sched_info.last_queued; - sched_info_dequeued(t); - t->sched_info.run_delay += delta; - t->sched_info.last_arrival = now; - t->sched_info.pcnt++; - - rq_sched_info_arrive(task_rq(t), delta); -} - -/* - * Called when a process is queued into either the active or expired - * array. The time is noted and later used to determine how long we - * had to wait for us to reach the cpu. Since the expired queue will - * become the active queue after active queue is empty, without dequeuing - * and requeuing any tasks, we are interested in queuing to either. It - * is unusual but not impossible for tasks to be dequeued and immediately - * requeued in the same or another array: this can happen in sched_yield(), - * set_user_nice(), and even load_balance() as it moves tasks from runqueue - * to runqueue. - * - * This function is only called from enqueue_task(), but also only updates - * the timestamp if it is already not set. It's assumed that - * sched_info_dequeued() will clear that stamp when appropriate. - */ -static inline void sched_info_queued(struct task_struct *t) -{ - if (unlikely(sched_info_on())) - if (!t->sched_info.last_queued) - t->sched_info.last_queued = sched_clock(); -} - -/* - * Called when a process ceases being the active-running process, either - * voluntarily or involuntarily. Now we can calculate how long we ran. 
- */ -static inline void sched_info_depart(struct task_struct *t) -{ - unsigned long long delta = sched_clock() - t->sched_info.last_arrival; - - t->sched_info.cpu_time += delta; - rq_sched_info_depart(task_rq(t), delta); -} - -/* - * Called when tasks are switched involuntarily due, typically, to expiring - * their time slice. (This may also be called when switching to or from - * the idle task.) We are only called when prev != next. - */ -static inline void -__sched_info_switch(struct task_struct *prev, struct task_struct *next) -{ - struct rq *rq = task_rq(prev); - - /* - * prev now departs the cpu. It's not interesting to record - * stats about how efficient we were at scheduling the idle - * process, however. - */ - if (prev != rq->idle) - sched_info_depart(prev); - - if (next != rq->idle) - sched_info_arrive(next); -} -static inline void -sched_info_switch(struct task_struct *prev, struct task_struct *next) -{ - if (unlikely(sched_info_on())) - __sched_info_switch(prev, next); -} -#else -#define sched_info_queued(t) do { } while (0) -#define sched_info_switch(t, next) do { } while (0) -#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ - diff --git a/trunk/kernel/softirq.c b/trunk/kernel/softirq.c index 73217a9e2875..0b9886a00e74 100644 --- a/trunk/kernel/softirq.c +++ b/trunk/kernel/softirq.c @@ -488,6 +488,7 @@ void __init softirq_init(void) static int ksoftirqd(void * __bind_cpu) { + set_user_nice(current, 19); current->flags |= PF_NOFREEZE; set_current_state(TASK_INTERRUPTIBLE); diff --git a/trunk/kernel/sysctl.c b/trunk/kernel/sysctl.c index 51f5dac42a00..30ee462ee79f 100644 --- a/trunk/kernel/sysctl.c +++ b/trunk/kernel/sysctl.c @@ -206,87 +206,7 @@ static ctl_table root_table[] = { { .ctl_name = 0 } }; -#ifdef CONFIG_SCHED_DEBUG -static unsigned long min_sched_granularity_ns = 100000; /* 100 usecs */ -static unsigned long max_sched_granularity_ns = 1000000000; /* 1 second */ -static unsigned long min_wakeup_granularity_ns; /* 0 usecs */ -static unsigned long max_wakeup_granularity_ns = 1000000000; /* 1 second */ -#endif - static ctl_table kern_table[] = { -#ifdef CONFIG_SCHED_DEBUG - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_granularity_ns", - .data = &sysctl_sched_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_wakeup_granularity_ns", - .data = &sysctl_sched_wakeup_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_wakeup_granularity_ns, - .extra2 = &max_wakeup_granularity_ns, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_batch_wakeup_granularity_ns", - .data = &sysctl_sched_batch_wakeup_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_wakeup_granularity_ns, - .extra2 = &max_wakeup_granularity_ns, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_stat_granularity_ns", - .data = &sysctl_sched_stat_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_wakeup_granularity_ns, - .extra2 = &max_wakeup_granularity_ns, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_runtime_limit_ns", - .data = 
&sysctl_sched_runtime_limit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_features", - .data = &sysctl_sched_features, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif { .ctl_name = KERN_PANIC, .procname = "panic", diff --git a/trunk/lib/Kconfig.debug b/trunk/lib/Kconfig.debug index fab32a286371..da95e10cfd70 100644 --- a/trunk/lib/Kconfig.debug +++ b/trunk/lib/Kconfig.debug @@ -105,15 +105,6 @@ config DETECT_SOFTLOCKUP can be detected via the NMI-watchdog, on platforms that support it.) -config SCHED_DEBUG - bool "Collect scheduler debugging info" - depends on DEBUG_KERNEL && PROC_FS - default y - help - If you say Y here, the /proc/sched_debug file will be provided - that can help debug the scheduler. The runtime overhead of this - option is minimal. - config SCHEDSTATS bool "Collect scheduler statistics" depends on DEBUG_KERNEL && PROC_FS